## Problem statement 
Identify the genre/topic of each movie from its tagline and description using Latent Dirichlet Allocation (LDA).

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction import text
from sklearn.decomposition import LatentDirichletAllocation as LDA

%matplotlib inline 

In [None]:
# Read the movies CSV (path is relative to the notebook's working directory).
movies_csv_path = "../input/tmdb_5000_movies.csv"
movies = pd.read_csv(movies_csv_path)

In [None]:
# Combine each movie's overview and tagline into a single text field,
# separated by one space, and store it in the 'allWords' column.
overview_text = movies['overview']
tagline_text = movies['tagline']
movies['allWords'] = overview_text + " " + tagline_text

# Sanity check: show the combined text of the row labelled 0.
print(movies.loc[0, 'allWords'])

In [None]:
# Keep only movies that have both an overview and a tagline.
# 'allWords' is overview + " " + tagline, which pandas leaves as NaN whenever
# either part is missing, so testing it is equivalent to testing both source
# columns -- and, unlike them, it survives the column selection below, so this
# cell can be re-run without raising AttributeError.
has_text = movies['allWords'].notna()

# .copy() gives an independent frame, so adding 'movie_id' below does not
# trigger SettingWithCopyWarning.
movies = movies.loc[has_text, ['title', 'allWords']].copy()

# Preserve the original row label as an explicit id column; the index now has
# gaps where rows were dropped.
movies['movie_id'] = movies.index
movies.shape

In [None]:
# Domain-specific words that appear in most descriptions and carry no topical signal.
additional_stop_words = {'production', 'movie', 'movies', 'film', 'films', 'man', 'story'}

# Recent scikit-learn releases validate `stop_words` and accept only
# 'english', a list, or None -- a frozenset is rejected -- so pass a list.
# Sorting just makes the list deterministic.
my_stop_words = sorted(text.ENGLISH_STOP_WORDS.union(additional_stop_words))

# NOTE: despite the `tfidf_` prefix (kept because later cells use these names),
# this is a plain term-count vectorizer, not TF-IDF.
# max_df=0.8 drops terms that occur in more than 80% of the documents.
tfidf_vect = text.CountVectorizer(max_df=0.8, stop_words=my_stop_words)
tfidf_movies = tfidf_vect.fit_transform(movies['allWords'])

# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on releases that predate it.
_get_feature_names = (
    getattr(tfidf_vect, "get_feature_names_out", None)
    or tfidf_vect.get_feature_names
)
tfidf_feature_names = _get_feature_names()

In [None]:
# Number of latent topics to extract.
num_topics = 5

# LDA fitting is stochastic: without a fixed random_state the topic numbering
# and contents change on every run, so the topic labels printed in later cells
# would not be reproducible.
lda = LDA(
    n_components=num_topics,
    learning_method='online',
    random_state=42,
).fit(tfidf_movies)

In [None]:
# For every fitted topic, list its 13 highest-weighted vocabulary terms,
# strongest first.
words_per_topic = 13
for topic_number, term_weights in enumerate(lda.components_):
    # argsort is ascending, so reverse it and keep the leading entries.
    strongest_terms = term_weights.argsort()[::-1][:words_per_topic]
    top_words = [tfidf_feature_names[term_id] for term_id in strongest_terms]
    print("Topic", topic_number, ":")
    print(" ".join(top_words))

In [None]:
# Per-document topic distribution: one row per movie, one column per topic.
doc_topic = lda.transform(tfidf_movies)

# Rows of doc_topic are positional (same order as `movies`), but the index of
# `movies` has gaps after rows without text were dropped. Looking titles up by
# label (.loc[n]) can therefore raise KeyError or pair a topic with the wrong
# movie; use positional access (.iloc) so both sides line up.
titles = movies['title']
for n in range(min(10, len(titles))):
    # The most probable topic for this movie.
    curr_topic = doc_topic[n].argmax()
    print("the movie ", titles.iloc[n], ' belongs to topic', curr_topic)

Reference: thanks to Aneesha Bakharia for the [LDA overview](https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730) and for showing how to implement it in scikit-learn.