# Importing Packages

In [None]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer 
from copy import deepcopy
from IPython.display import clear_output
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Importing dataset

In [None]:
data=pd.read_csv('../input/tmdb-movie-metadata/tmdb_5000_movies.csv')

In [None]:
data=data[['title','genres','overview']]

In [None]:
fortopicdf = data.copy()

In [None]:
fortopicdf['overview']=fortopicdf['overview'].fillna('')

In [None]:
fortopicdf

# Having the first look at the data

In [None]:
data.head()

# Making Genres Neat

In [None]:
def make_it_neat(data,name):
    """Replace a JSON-encoded column with plain lists of the entries' names.

    Each cell of ``data[name]`` is expected to hold a JSON array of objects
    such as ``[{"id": 28, "name": "Action"}, ...]``.  The column is
    overwritten in place with one list of ``"name"`` values per row.

    Parsing the JSON (instead of stripping punctuation from the string)
    means an empty array yields ``[]`` rather than ``['']``, and names that
    contain commas, colons or only digits are kept intact.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that is modified in place.
    name : str
        Name of the column holding the JSON strings.

    Returns
    -------
    None

    Raises
    ------
    json.JSONDecodeError
        If a non-blank cell is not valid JSON.
    KeyError
        If an entry of the array has no ``"name"`` key.
    """
    import json  # local import keeps this cell runnable on its own

    cleaned = []
    for raw in data[name].values:
        # Missing cells (e.g. NaN) and blank strings carry no entries.
        if not isinstance(raw, str) or not raw.strip():
            cleaned.append([])
            continue
        cleaned.append([entry['name'] for entry in json.loads(raw)])
    data[name] = cleaned

In [None]:
make_it_neat(data,'genres')

In [None]:
data.head()

In [None]:
data['genres'].values[0]

## Looks better right :)

# Let's lemmatize the description now

In [None]:
# Filling all the empty/nan description rows with empty string
data['overview']=data['overview'].fillna('')

In [None]:
# This function is to remove stopwords from a particular column and to tokenize it
def rem_stopwords_tokenize(data,name):
    """Tokenize a text column and drop English stop words, in place.

    Every string in ``data[name]`` is split with ``word_tokenize`` and the
    tokens found in NLTK's English stop-word list are removed.  The column
    is overwritten with the resulting list of tokens per row.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that is modified in place.
    name : str
        Name of the text column to process.

    Returns
    -------
    None
    """
    # Build the stop-word set once rather than once per row.
    stop_words = set(stopwords.words('english'))

    def getting(sen):
        """Return the tokens of ``sen`` that are not stop words."""
        # The stop-word list is lowercase, so the token is lowercased for the
        # membership test only; kept tokens retain their original casing.
        return [w for w in word_tokenize(sen) if w.lower() not in stop_words]

    data[name] = [getting(text) for text in data[name].values]


In [None]:
rem_stopwords_tokenize(data,'overview')

In [None]:
# Making a function to lemmatize all the words
lemmatizer = WordNetLemmatizer() 
def lemmatize_all(data,name):
    """Lemmatize every token of a tokenized column, in place.

    Each row of ``data[name]`` is treated as an iterable of tokens.  A token
    is first lemmatized as an adjective (``pos='a'``) and the result is then
    lemmatized again with the lemmatizer's default part of speech.  The
    column is overwritten with the lists of lemmas.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that is modified in place.
    name : str
        Name of the column holding the token lists.

    Returns
    -------
    None
    """
    def _two_pass(token):
        as_adjective = lemmatizer.lemmatize(token, pos='a')
        return lemmatizer.lemmatize(as_adjective)

    data[name] = [[_two_pass(token) for token in tokens] for tokens in data[name]]

In [None]:
lemmatize_all(data,'overview')

# Let's have a look at the new updated data

In [None]:
data.head()

# We need to vectorize genres and overview now 

In [None]:
# Vocabulary templates: every distinct genre / overview token maps to 0,
# with keys kept in order of first appearance across the rows.
dic_genres = {genre: 0 for genres in data.genres for genre in genres}
dic_overview = {word: 0 for words in data.overview for word in words}

# Let's copy dataframe to new dataframe

In [None]:
df=deepcopy(data)

In [None]:
df.head()

# Function to convert word to vector

In [None]:
# This function is made to convert words to vector
def vectorizer(data,name,d):
    """Turn a column of token lists into count vectors, in place.

    For every row a copy of ``d`` is made and the entry of each token in the
    row is incremented by one; the row is then replaced by the list of the
    copy's values (in ``d``'s key order).  Progress is printed as a
    percentage after clearing the previous cell output.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that is modified in place.
    name : str
        Name of the column holding the token lists.
    d : dict
        Template mapping each vocabulary token to its starting count.  It is
        copied per row and never modified.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a row contains a token that is not a key of ``d``.
    """
    # Progress is measured against the frame being processed, not the
    # notebook-level ``df``.
    total = len(data)
    vectors = []
    for count, tokens in enumerate(data[name].values, start=1):
        clear_output(wait=True)
        print('The progress is:','{:.2f}'.format(count*100/total),' %')
        # A shallow copy is enough for a dict of numeric counts and is much
        # cheaper than deepcopy on a large vocabulary.
        counts = d.copy()
        for token in tokens:
            counts[token] += 1
        vectors.append(list(counts.values()))
    data[name] = vectors
            
            
    
    

In [None]:
vectorizer(df,'overview',dic_overview)

In [None]:
vectorizer(df,'genres',dic_genres)

In [None]:
df.head()

In [None]:
dic_overview

In [None]:
dic_genres

# Training model for genres predictions :)

In [None]:
X=df.overview.values

In [None]:
X=list(X)

In [None]:
y=df.genres.values

In [None]:
y=list(y)

## Ok guys since we have a lot of data we run out of ram :( 

## So we are going to work on a smaller subset of the data

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [None]:
clf = MultiOutputClassifier(DecisionTreeClassifier()).fit(X_train, y_train)

# Let's try to predict the genre now :)

In [None]:
data.head()

In [None]:
x=data[data['title']=='The Dark Knight Rises']

In [None]:
over=x.overview

In [None]:
for i in list(over.values)[0]:
    dic_overview[i]+=1

In [None]:
X=list(dic_overview.values())

In [None]:
# Let's predict the genre
ans=clf.predict([X])

In [None]:
ans=list(list(ans)[0])

In [None]:
# Print the name of every genre whose predicted flag is 1; the position of a
# flag in `ans` is matched with the key at the same position in dic_genres.
print('The genres for the movie The dark knight rises are:')
print()
genre_names = list(dic_genres.keys())
for position, flag in enumerate(ans):
    if flag==1:
        print(genre_names[position])

Topic modeling

In [None]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

experimental code

In [None]:
movie = fortopicdf.loc[3]
display(movie)

In [None]:
overview = movie['overview']
print(overview)

In [None]:
from gensim.utils import simple_preprocess

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
stemmer = SnowballStemmer('english')
nltk.download('wordnet')

In [None]:
# Lemmatization
def lemmatize(text):
    """Return the WordNet lemma of ``text``, treating it as a verb."""
    wordnet = WordNetLemmatizer()
    return wordnet.lemmatize(text, pos='v')


# Stemming
def stemming(text):
    """Return ``text`` reduced to its stem by the module-level ``stemmer``."""
    stem = stemmer.stem(text)
    return stem


# Tokenization
def preprocess(text):
    """Tokenize ``text`` and return its lemmatized, stemmed tokens.

    Tokens come from ``gensim.utils.simple_preprocess``.  A token is kept
    only if it is not in gensim's ``STOPWORDS`` and is longer than three
    characters; each kept token is lemmatized and then stemmed.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    return [
        stemming(lemmatize(token))
        for token in gensim.utils.simple_preprocess(text)
        if token not in stop_words and len(token) > 3
    ]

In [None]:
movie = fortopicdf.loc[3]
display(movie)

In [None]:
overview = movie['overview']
print(overview)

In [None]:
print(preprocess(overview))

experiment ends here

In [None]:
processedMovies = fortopicdf['overview'].map(preprocess)
display(processedMovies)

In [None]:
#Bag of words

dictionary = gensim.corpora.Dictionary(processedMovies)

In [None]:
dictionary.filter_extremes(no_below=10, no_above=0.5,keep_n=100000)

In [None]:
bowCorpus = [dictionary.doc2bow(doc) for doc in processedMovies]

In [None]:
display(bowCorpus[3])

In [None]:
from gensim import corpora, models
tfidf = models.TfidfModel(bowCorpus)
tfidfCorpus = tfidf[bowCorpus]

In [None]:
display(tfidfCorpus[3])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from gensim.matutils import corpus2dense

In [None]:
# Size the dense matrix from the filtered dictionary instead of a hard-coded
# 100000 terms, which would allocate a far larger, mostly-zero matrix.
tfidfDense = corpus2dense(tfidfCorpus, num_terms=len(dictionary), num_docs=len(tfidfCorpus))
# corpus2dense returns terms x documents; transpose to one row per movie.
tfidfDense = tfidfDense.T

In [None]:
print('movies, attributes:', tfidfDense.shape)

In [None]:
fortopicdf

In [None]:
import json

# Build one (feature row, genre name) pair per genre of every movie.
# fortopicdf was copied before the genres column was cleaned, so each cell is
# still the raw JSON string and has to be parsed before its entries can be
# indexed by 'name'.
denseMatrix, yCategory = [], []
for index, row in fortopicdf.iterrows():
    for category in json.loads(row['genres']):
        denseMatrix.append(tfidfDense[index])
        yCategory.append(category['name'])