In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# %pip install ipython==7.17.0

In [None]:
import json

def loadMovies(path):
    """Read the movies CSV at ``path`` and decode its structured columns.

    ``release_date`` is parsed and reduced to plain date objects, and every
    JSON-encoded column (genres, keywords, production countries/companies,
    spoken languages) is decoded with ``json.loads``.

    Returns the resulting DataFrame.
    """
    movies = pd.read_csv(path)

    release_timestamps = pd.to_datetime(movies['release_date'])
    movies['release_date'] = release_timestamps.apply(lambda stamp: stamp.date())

    for name in ('genres', 'keywords', 'production_countries',
                 'production_companies', 'spoken_languages'):
        decoded = movies[name].apply(json.loads)
        movies[name] = decoded
    return movies


def loadCredits(path):
    """Read the credits CSV at ``path`` and decode its JSON columns.

    The ``cast`` and ``crew`` columns hold JSON text; each cell is decoded
    with ``json.loads``. Returns the resulting DataFrame.
    """
    credits = pd.read_csv(path)
    for name in ('cast', 'crew'):
        decoded = credits[name].apply(json.loads)
        credits[name] = decoded
    return credits

In [None]:
import numpy as np 
import pandas as pd

import matplotlib.pyplot as plt

In [None]:
import os

# Listing files: print the full path of each file under the Kaggle input directory
for root, _, files in os.walk('/kaggle/input'):
    paths = (os.path.join(root, entry) for entry in files)
    for path in paths:
        print(path)

In [None]:
dfMovies = loadMovies("../input/tmdb-movie-metadata/tmdb_5000_movies.csv")

In [None]:
# Filtering columns: keep only the id, title, genre labels and overview text
columnsOfInterest = ['id', 'title', 'genres', 'overview']
dfMovies = dfMovies.loc[:, columnsOfInterest]

In [None]:
# Removing rows whose overview is null; the index is rebuilt so labels run 0..n-1 again
dfMovies = dfMovies.dropna(subset=['overview']).reset_index(drop=True)

In [None]:
# Print
display(dfMovies.head(3))

# **Genres**

In [None]:
# Tally how many movies carry each genre name (a movie with several genres
# contributes one count to each of them).
categoriesCount = {}

for genre_list in dfMovies['genres']:
    for genre in genre_list:
        label = genre['name']
        categoriesCount[label] = categoriesCount.get(label, 0) + 1

In [None]:
print('number of categories:', len(categoriesCount.keys()))

In [None]:
# Plotting: bar chart of the number of movies per genre.
# `keys` and `values` stay module-level because later cells reuse them.
keys = categoriesCount.keys()
values = categoriesCount.values()

plt.bar(keys, values)
plt.xticks(rotation='vertical')
# Label the figure so it can be read on its own when the notebook is skimmed.
plt.xlabel('genre')
plt.ylabel('number of movies')
plt.title('Movies per genre')
plt.show()

In [None]:
values

In [None]:
gen_df=pd.DataFrame({'keys':pd.Series(keys),'values':pd.Series(values)})

In [None]:
gen_df

In [None]:
from collections import Counter
from wordcloud import WordCloud

# Cleaned genre names, kept under the original name `lis`.
lis = [str(name).strip() for name in gen_df['keys']]

# Weight every genre by the number of movies that carry it.
# Counting the list of names instead (Counter(lis)) gives every genre a
# frequency of 1, because each name appears exactly once in gen_df, and the
# word sizes in the cloud then carry no information.
genre_count = Counter()
for name, count in zip(lis, gen_df['values']):
    genre_count[name] += int(count)

wc = WordCloud(background_color='white')
wc.generate_from_frequencies(genre_count)
plt.figure(figsize=(20,10))
plt.imshow(wc,interpolation='bilinear')
plt.axis('off')
plt.show()

# **Tokenization,lemmatization,stemming**

In [None]:
import gensim # topic modeling toolkit
import nltk # natural language toolkit

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

stemmer = SnowballStemmer('english')
nltk.download('wordnet')

In [None]:
# Lemmatization process
def lemmatize(text):
    """Return the WordNet lemma of ``text``, treating the word as a verb."""
    lemmatizer = WordNetLemmatizer()
    lemma = lemmatizer.lemmatize(text, pos='v')
    return lemma


# Stemming process
def stemming(text):
    """Return the stem of ``text`` produced by the module-level ``stemmer``."""
    stem = stemmer.stem(text)
    return stem


# Tokenization process
def preprocess(text):
    """Turn raw ``text`` into a list of normalised tokens.

    The text is tokenised with gensim's ``simple_preprocess``; stopwords and
    tokens of three characters or fewer are dropped; each remaining token is
    lemmatised and then stemmed.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        stemming(lemmatize(word))
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stopwords
    ]

In [None]:
movie = dfMovies.loc[3]
display(movie)

In [None]:
overview = movie['overview']
print(overview)

In [None]:
print(preprocess(overview))

In [None]:
processedMovies = dfMovies['overview'].map(preprocess)
display(processedMovies)

# **Bag of words**
A bag of words is a representation used in natural language processing that transforms each document into a numeric vector of word counts.

In [None]:
dictionary = gensim.corpora.Dictionary(processedMovies)

In [None]:
dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=100000)

In [None]:
bowCorpus = [dictionary.doc2bow(doc) for doc in processedMovies]

In [None]:
display(bowCorpus[1])

# **Term frequency-inverse document frequency (TF-IDF)**
TF-IDF represents each movie overview as a vector of weighted terms, similar to bag-of-words.
TF-IDF is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.

In [None]:
from gensim import corpora, models

tfidf = models.TfidfModel(bowCorpus)
tfidfCorpus = tfidf[bowCorpus]

In [None]:
# importance of each word in the topic
display(tfidfCorpus[3])

# **Presume categories**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from gensim.matutils import corpus2dense

In [None]:
# Create a dense matrix with one row per movie and one column per dictionary term.
# num_terms must match the vocabulary size: the TF-IDF vectors only use ids
# from `dictionary`, so len(dictionary) columns hold every value. A hard-coded
# 100000 allocates a far larger matrix whose extra columns are all zero.
# corpus2dense returns terms x documents, hence the transpose.
tfidfDense = corpus2dense(tfidfCorpus, num_terms=len(dictionary), num_docs=len(tfidfCorpus)).T

In [None]:
print('movies, attributes:', tfidfDense.shape)

In [None]:
# Build the training pairs: a movie with k genres contributes k rows, each
# repeating the movie's TF-IDF vector with one of its genre names as the label.
denseMatrix, yCategory = [], []
for position, genre_list in dfMovies['genres'].items():
    for genre in genre_list:
        denseMatrix.append(tfidfDense[position])
        yCategory.append(genre['name'])

In [None]:
denseMatrix = [tup.astype(np.float16) for tup in denseMatrix]

In [None]:
#denseMatrix

In [None]:
#denseMatrix

In [None]:
#del tfidfDense

# **Training**

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(denseMatrix, yCategory, test_size=0.1, random_state=43)
# print('train size:', len(y_train))
# print('test size :', len(y_test))

In [None]:
def getClassesProbabilities(model, record):
    """Return the predicted probability of every class for a single record.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba`` and ``classes_``.
    record : one feature vector (it is wrapped in a list before prediction).

    Returns
    -------
    pandas.DataFrame with columns ``category`` and ``probability``, sorted by
    descending probability. The index holds each class's position in
    ``model.classes_``.
    """
    # predict_proba returns one row per input record; only one record is passed.
    probabilities = model.predict_proba([record])[0]

    # Build the two typed columns directly instead of stacking a 1-D and a 2-D
    # array into an object frame and unpacking it again with a row-wise apply.
    output = pd.DataFrame({
        'category': model.classes_,
        'probability': probabilities,
    })
    return output.sort_values(by='probability', ascending=False)

In [None]:
# Train model
#clf = LogisticRegression(random_state=43, max_iter=100).fit(X_train, y_train)

In [None]:
#from joblib import dump, load

# Save model
#dump(clf, 'logisticRegression.model')

In [None]:
#clf = load('logisticRegression.model') 

In [None]:
#clf.predict([X_test[1]])

In [None]:
#probabilty of movies present
#getClassesProbabilities(clf, X_test[1])

# Topic Modeling
Topic modeling is a type of statistical modeling for discovering the abstract "topics" that occur in a set of documents. Latent Dirichlet Allocation (LDA) model is an algorithm used to classify text in a document to a particular topic. A topic is represented by a set of most representative words (common words) that appear in a collection of documents.

In [None]:
# LDA trained on the bag-of-words corpus.
# The topic count is 19 because the cells below hard-code num_topics = 19 and
# index topics 14 and 15. With a 10-topic model, ldaBow.get_topic_terms(i, 10)
# raises IndexError for every i >= 10.
# random_state seeds the random initialisation so reruns are repeatable.
ldaBow = gensim.models.LdaMulticore(bowCorpus, num_topics=19, id2word=dictionary, passes=2, workers=2, random_state=43)

In [None]:
# Show each topic id with the formatted string of its 5 top-weighted words.
for topic_id, top_words in ldaBow.print_topics(num_words=5):
    line = 'Topic: {} Words: {}'.format(topic_id, top_words)
    print(line)

In [None]:
num_topics = 19
# For every topic id, look up the vocabulary words of its 10 highest-weighted terms.
# NOTE: ldaBow must have at least `num_topics` topics, otherwise
# get_topic_terms raises IndexError for the missing ids.
topic_words = [
    [dictionary[term_id] for term_id, _weight in ldaBow.get_topic_terms(topic_id, 10)]
    for topic_id in range(num_topics)
]

In [None]:
print(topic_words[15])

In [None]:
#pd.DataFrame(ldaBow.get_document_topics(denseMatrix))

In [None]:
bag_ofwords=ldaBow.get_document_topics(bowCorpus, minimum_probability=0.0)

In [None]:
bag_ofwords

In [None]:
#mlb.classes_

In [None]:
# Collect each topic's id and its formatted top-5 word string into two parallel lists.
topic_summaries = ldaBow.print_topics(num_words=5)
my_dict = {
    "id": [topic_id for topic_id, _ in topic_summaries],
    "t": [summary for _, summary in topic_summaries],
}

In [None]:
# Print every topic with its 10 top words, followed by a blank line.
for topic_id, description in ldaBow.show_topics(formatted=True, num_topics=num_topics, num_words=10):
    print("{}topic-{}".format(topic_id, description), end="\n\n")

In [None]:
# LDA trained on the TF-IDF-weighted corpus (10 topics).
# random_state seeds the random initialisation so reruns are repeatable.
ldaTfidf = gensim.models.LdaMulticore(tfidfCorpus, num_topics=10, id2word=dictionary, passes=2, workers=4, random_state=43)

The 5 most representative words for each abstract topic.

In [None]:
my_dict

In [None]:
# Topic distribution of the first movie overview under the bag-of-words LDA model.
# The corpus variable is `bowCorpus`; `Corpus` is not defined anywhere and raises NameError.
ldaBow[bowCorpus[0]]

Predicting the topics of a new document using the LDA bag-of-words model (`ldaBow`)

In [None]:
document ="In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization."

In [None]:
tokens = preprocess(document)

In [None]:
topics = ldaBow.show_topics(formatted=True, num_topics=num_topics, num_words=20)

In [None]:
# One row per topic assigned to the new document: topic id, weight rounded to
# two decimals, and the topic's word string looked up by position in `topics`.
document_bow = dictionary.doc2bow(tokens)
topic_rows = [
    (topic_id, round(weight, 2), topics[topic_id][1])
    for topic_id, weight in ldaBow[document_bow]
]
pd.DataFrame(topic_rows, columns=['topic #', 'weight', 'words in topic'])

In [None]:
# Topic distribution of every movie overview, in dfMovies row order
# (bowCorpus is built from the same overviews, one vector per row).
# ldaBow was trained on bag-of-words counts, so inference uses the same
# representation; TF-IDF weights do not match what the model was fitted on.
topics = [ldaBow[document_bow] for document_bow in bowCorpus]

In [None]:
def topics_document_to_dataframe(topics_document, num_topics):
    """Convert one document's topic weights into a single-row DataFrame.

    Parameters
    ----------
    topics_document : iterable of ``(topic_id, weight)`` pairs.
    num_topics : number of topic columns to create (labelled 0..num_topics-1).

    Returns
    -------
    pandas.DataFrame with float columns. It has one row (index 0) holding the
    given weights, with NaN for topics that were not listed, or no rows at all
    when ``topics_document`` is empty.
    """
    weights = dict(topics_document)
    columns = list(range(num_topics))
    # Keep any topic id outside 0..num_topics-1 rather than silently dropping it.
    columns += [topic_id for topic_id in weights if topic_id not in columns]

    if not weights:
        return pd.DataFrame(columns=columns, dtype=float)
    # Build the row in one step with an explicit float dtype, instead of growing
    # an empty, dtype-less frame cell by cell through .loc enlargement.
    return pd.DataFrame([weights], columns=columns, dtype=float)
topics_document_to_dataframe([(9, 0.03853655432967504), (15, 0.09130117862212643), (18, 0.8692868808484044)], 20)

In [None]:
# Like TF-IDF, create a matrix of topic weights with documents as rows and topics as columns.
# Each entry of `topics` is a list of (topic_id, weight) pairs. Turning each
# list into a dict lets one DataFrame constructor place every weight in its
# topic column, so no per-movie frames have to be built and concatenated.
# Every document keeps exactly one row, which keeps the rows aligned with dfMovies.
# Topics a document does not mention are filled with 0.
document_topic = pd.DataFrame(
    [dict(topics_document) for topics_document in topics],
    columns=range(num_topics),
    dtype=float,
).fillna(0)

In [None]:
document_topic.head()

In [None]:
print(dfMovies.overview.loc[15][:1000])

In [None]:
# Looking at the distribution of topics across all documents
# Visualization of the proportion of topics in the documents (documents are rows, topics are columns)
%matplotlib inline
import seaborn as sns; sns.set(rc={'figure.figsize':(10,20)})
sns.heatmap(document_topic.loc[document_topic.idxmax(axis=1).sort_values().index])

In [None]:

sns.set(rc={'figure.figsize':(10,5)})
document_topic.idxmax(axis=1).value_counts().plot.bar(color='lightblue')

In [None]:
# #vis = pyLDAvis.gensim.prepare(topic_model=ldaBow, corpus=tfidfCorpus, dictionary=dictionary)
# import pyLDAvis
# import pyLDAvis.gensim_models as gensimvis
# pyLDAvis.enable_notebook()

# # feed the LDA model into the pyLDAvis instance
# lda_viz = gensimvis.prepare(topic_model=ldaBow, corpus=tfidfCorpus, dictionary=dictionary)

In [None]:
document_topic.sort_values(14, ascending=False)[14].head(20)

In [None]:
print(dfMovies.overview.loc[0][:500])

In [None]:
dfx=pd.DataFrame.from_dict(my_dict)
dfx
#topicx=pd.DataFrame({'my_dict':pd.Series(keys)})

In [None]:
#topic_model=ldaBow, corpus=tfidfCorpus, dictionary=dictionary

In [None]:
# Rank the topics the TF-IDF LDA model assigns to movie 3, most probable first,
# and print each one with its 5 top words.
ranked_topics = sorted(ldaTfidf[tfidfCorpus[3]], key=lambda pair: pair[1], reverse=True)
for index, score in ranked_topics:
    top_words = ldaTfidf.print_topic(index, 5)
    print("\nScore: {}\t Topic {}: {}".format(score, index, top_words))

In [None]:
# Unseen movie using LDA BOW model
newMovie = "James Bond receives an obscure message from M about a sinister organisation, 'SPECTRE'. With the help of Madeleine, he uncovers the conspiracy, only to face an ugly truth."
print(newMovie)

In [None]:
# Encode the unseen overview with the same preprocessing and dictionary as the corpus.
bowVector = dictionary.doc2bow(preprocess(newMovie))

# Rank its inferred topics from most to least probable and show each one's top-5 words.
ranked_topics = sorted(ldaBow[bowVector], key=lambda pair: pair[1], reverse=True)
for index, score in ranked_topics:
    top_words = ldaBow.print_topic(index, 5)
    print("Score: {}\t Topic {}: {}".format(score, index, top_words))