In [929]:
import re
import string
from tqdm import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS, CountVectorizer
from sklearn.decomposition import PCA, NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.pipeline import Pipeline
from sklearn.metrics import pairwise_distances

from tabulate import tabulate
from corextopic import corextopic as ct
from corextopic import vis_topic as vt

In [108]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/prathaprajaraman/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [788]:
df = pd.read_csv('data/tmdb_data.csv',lineterminator='\n')

In [789]:
df.head()

Unnamed: 0,id,title,overview,original_language,budget,revenue,release_date,vote_average,vote_count,director,lead_actor_1,lead_actor_2,lead_actor_3,lead_actor_4
0,2.0,Ariel,Taisto Kasurinen is a Finnish coal miner whose...,fi,0.0,0.0,1988-10-21,6.9,150.0,Aki Kaurismäki,Turo Pajala,Susanna Haavisto,Matti Pellonpää,Eetu Hilkamo
1,3.0,Shadows in Paradise,"An episode in the life of Nikander, a garbage ...",fi,0.0,0.0,1986-10-17,7.2,149.0,Aki Kaurismäki,Matti Pellonpää,Kati Outinen,Sakari Kuosmanen,Esko Nikkari
2,5.0,Four Rooms,It's Ted the Bellhop's first night on the job....,en,4000000.0,4257354.0,1995-12-09,5.7,2037.0,Allison Anders,Tim Roth,Jennifer Beals,Antonio Banderas,Valeria Golino
3,6.0,Judgment Night,"While racing to a boxing match, Frank, Mike, J...",en,21000000.0,12136938.0,1993-10-15,6.5,215.0,Stephen Hopkins,Emilio Estevez,Cuba Gooding Jr.,Denis Leary,Stephen Dorff
4,11.0,Star Wars,Princess Leia is captured and held hostage by ...,en,11000000.0,775398007.0,1977-05-25,8.2,16282.0,George Lucas,Mark Hamill,Harrison Ford,Carrie Fisher,Peter Cushing


**Initial data cleaning**

In [790]:
df.dropna(inplace=True)
df.reset_index(inplace=True)
df.drop(columns=['index'],inplace=True)
df.sort_values('vote_count',ascending=False).head()

Unnamed: 0,id,title,overview,original_language,budget,revenue,release_date,vote_average,vote_count,director,lead_actor_1,lead_actor_2,lead_actor_3,lead_actor_4
9244,27205.0,Inception,"Cobb, a skilled thief who commits corporate es...",en,160000000.0,825532800.0,2010-07-15,8.3,30268.0,Christopher Nolan,Leonardo DiCaprio,Joseph Gordon-Levitt,Ken Watanabe,Tom Hardy
17023,157336.0,Interstellar,The adventures of a group of explorers who mak...,en,165000000.0,701729200.0,2014-11-05,8.4,27085.0,Christopher Nolan,Matthew McConaughey,Anne Hathaway,Jessica Chastain,Ellen Burstyn
106,155.0,The Dark Knight,Batman raises the stakes in his war on crime. ...,en,185000000.0,1004558000.0,2008-07-14,8.5,26226.0,Christopher Nolan,Christian Bale,Heath Ledger,Michael Caine,Gary Oldman
19541,293660.0,Deadpool,Deadpool tells the origin story of former Spec...,en,58000000.0,783100000.0,2016-02-09,7.6,25795.0,Tim Miller,Ryan Reynolds,Morena Baccarin,Ed Skrein,T. J. Miller
8539,24428.0,The Avengers,When an unexpected enemy emerges and threatens...,en,220000000.0,1518816000.0,2012-04-25,7.7,25727.0,Joss Whedon,Robert Downey Jr.,Chris Evans,Mark Ruffalo,Chris Hemsworth


In [791]:
df.describe()

Unnamed: 0,id,budget,revenue,vote_average,vote_count
count,30853.0,30853.0,30853.0,30853.0,30853.0
mean,233910.442712,7488630.0,20403490.0,6.109279,502.652838
std,255770.33316,23076110.0,88998610.0,1.51646,1546.045331
min,2.0,0.0,0.0,0.0,0.0
25%,20453.0,0.0,0.0,5.6,36.0
50%,83651.0,0.0,0.0,6.4,89.0
75%,428584.0,2159280.0,1234254.0,7.0,275.0
max,899196.0,380000000.0,2847246000.0,10.0,30268.0


In [792]:
#Combine Actor/director names into one string. This makes for easier preprocessing
#For example, Chris Evans and Chris Hemsworth are treated as separate actors instead of 50% similar
#Also include feature engineering

df['lead_actor_1'] = df['lead_actor_1'].str.replace(' ', '')
df['lead_actor_2'] = df['lead_actor_2'].str.replace(' ', '')
df['lead_actor_3'] = df['lead_actor_3'].str.replace(' ', '')
df['lead_actor_4'] = df['lead_actor_4'].str.replace(' ', '')
df['director'] = df['director'].str.replace(' ', '')
df.head(1)

Unnamed: 0,id,title,overview,original_language,budget,revenue,release_date,vote_average,vote_count,director,lead_actor_1,lead_actor_2,lead_actor_3,lead_actor_4
0,2.0,Ariel,Taisto Kasurinen is a Finnish coal miner whose...,fi,0.0,0.0,1988-10-21,6.9,150.0,AkiKaurismäki,TuroPajala,SusannaHaavisto,MattiPellonpää,EetuHilkamo


In [793]:
#Want to replace "New York City" and "New York" with "NYC"
#Replace World War II with single word


df['overview'] = df['overview'].str.replace('New York City', 'NYC', regex=False)
df['overview'] = df['overview'].str.replace('New York', 'NYC', regex=False)
df['overview'] = df['overview'].str.replace('World War II', 'worldwarii', regex=False)
df['overview'] = df['overview'].str.replace('World War 2', 'worldwarii', regex=False)
df['overview'] = df['overview'].str.replace('WWII', 'worldwarii', regex=False)

Combine lead actors, directors, and plot overview into one singular bag of words

In [794]:
cols = ['overview', 'lead_actor_1', 'lead_actor_2','lead_actor_3','lead_actor_4','director']
df['bag_of_words'] = df[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
df.loc[19710]['bag_of_words']

'As the Avengers and their allies have continued to protect the world from threats too large for any one hero to handle, a new danger has emerged from the cosmic shadows: Thanos. A despot of intergalactic infamy, his goal is to collect all six Infinity Stones, artifacts of unimaginable power, and use them to inflict his twisted will on all of reality. Everything the Avengers have fought for has led up to this moment - the fate of Earth and existence itself has never been more uncertain. RobertDowneyJr. ChrisHemsworth MarkRuffalo ChrisEvans AnthonyRusso'

In [795]:
#Filter movies that are in English and of a certain popularity. This is for simplicity
df2 = df[(df['original_language'] == 'en') & (df['vote_count'] > 1000)]
df2.reset_index(inplace=True)
df2.head()

Unnamed: 0,index,id,title,overview,original_language,budget,revenue,release_date,vote_average,vote_count,director,lead_actor_1,lead_actor_2,lead_actor_3,lead_actor_4,bag_of_words
0,2,5.0,Four Rooms,It's Ted the Bellhop's first night on the job....,en,4000000.0,4257354.0,1995-12-09,5.7,2037.0,AllisonAnders,TimRoth,JenniferBeals,AntonioBanderas,ValeriaGolino,It's Ted the Bellhop's first night on the job....
1,4,11.0,Star Wars,Princess Leia is captured and held hostage by ...,en,11000000.0,775398007.0,1977-05-25,8.2,16282.0,GeorgeLucas,MarkHamill,HarrisonFord,CarrieFisher,PeterCushing,Princess Leia is captured and held hostage by ...
2,5,12.0,Finding Nemo,"Nemo, an adventurous young clownfish, is unexp...",en,94000000.0,940335536.0,2003-05-30,7.8,15665.0,AndrewStanton,AlbertBrooks,EllenDeGeneres,AlexanderGould,WillemDafoe,"Nemo, an adventurous young clownfish, is unexp..."
3,6,13.0,Forrest Gump,A man with a low IQ has accomplished great thi...,en,55000000.0,677387716.0,1994-07-06,8.5,21605.0,RobertZemeckis,TomHanks,RobinWright,GarySinise,MykeltiWilliamson,A man with a low IQ has accomplished great thi...
4,7,14.0,American Beauty,"Lester Burnham, a depressed suburban father in...",en,15000000.0,356296601.0,1999-09-15,8.0,9769.0,SamMendes,KevinSpacey,AnnetteBening,ThoraBirch,WesBentley,"Lester Burnham, a depressed suburban father in..."


In [796]:
sum([len(d.split(' ')) for d in df2['bag_of_words']])

155019

**More Feature Engineering**

There are other movie features that will help boost the similarity scores of like-films. The following variables will be added (as a text term to add to bag of words):

- bigbudget: If a movie has an inflation adjusted budget of over $220,000,000 (based on an inflation rate of [3.1%](https://inflationdata.com/Inflation/Inflation/DecadeInflation.asp#/)), it gets flagged
- popularmovie: If a movie has over 11000 votes, it gets flagged as a popular movie. This metric is used rather than gross because in today's age, numerous movies are straight to streaming, thus having no box office data
- highlyrated: If a movie has a rating of over 8.0, it gets flagged as a highly rated movie.
- Oldfilm: Movies released prior to 1950 will be flagged as "oldfilm". There will be some users that have a particular affinity for the classics so having some type of indicator could potentially add value.

The thresholds seem arbitrary but they are nice, round numbers that are near the 95th percentile of each data point. The exception being films released prior to 1950, which was the 1st percentile of data.

In [797]:
df2.describe()

Unnamed: 0,index,id,budget,revenue,vote_average,vote_count
count,2960.0,2960.0,2960.0,2960.0,2960.0,2960.0
mean,10179.168581,126018.935473,48666240.0,168403500.0,6.75098,3670.899662
std,8865.109293,172452.325217,52411090.0,233251800.0,0.754985,3588.426908
min,2.0,5.0,0.0,0.0,2.9,1001.0
25%,1892.5,4976.0,11000000.0,31993310.0,6.2,1416.75
50%,7136.0,18483.5,30000000.0,93301490.0,6.7,2321.5
75%,18379.0,244314.75,69000000.0,207623100.0,7.3,4439.0
max,29692.0,791373.0,380000000.0,2847246000.0,8.7,30268.0


In [798]:
df2['release_date'] = pd.to_datetime(df2['release_date'])
df2['year'] = pd.DatetimeIndex(df2['release_date']).year

df2['budget_2021'] = df2['budget'] * 1.031 ** (2021 - df2['year'])
df2.loc[df2['budget_2021'] >= 22000000, 'bigbudget'] = 'bigbudget'
df2.loc[df2['budget_2021'] < 220000000, 'bigbudget'] = ''

df2.loc[df2['vote_count'] >= 11000, 'popularmovie'] = 'popularmovie'
df2.loc[df2['vote_count'] < 11000, 'popularmovie'] = ''

df2.loc[df2['vote_average'] >= 8, 'highlyrated'] = 'highlyrated'
df2.loc[df2['vote_average'] < 8, 'highlyrated'] = ''

df2['decade'] = df2['year'] - (df2['year']%10)
df2.loc[df2['year'] < 1950 , 'oldfilm'] = 'oldfilm'
df2.loc[df2['year'] >= 1950 , 'oldfilm'] = ''

df2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['release_date'] = pd.to_datetime(df2['release_date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['year'] = pd.DatetimeIndex(df2['release_date']).year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['budget_2021'] = df2['budget'] * 1.031 ** (2021 - df2['year'])
A value is trying to be

Unnamed: 0,index,id,title,overview,original_language,budget,revenue,release_date,vote_average,vote_count,...,lead_actor_3,lead_actor_4,bag_of_words,year,budget_2021,bigbudget,popularmovie,highlyrated,decade,oldfilm
0,2,5.0,Four Rooms,It's Ted the Bellhop's first night on the job....,en,4000000.0,4257354.0,1995-12-09,5.7,2037.0,...,AntonioBanderas,ValeriaGolino,It's Ted the Bellhop's first night on the job....,1995,8846781.0,,,,1990,
1,4,11.0,Star Wars,Princess Leia is captured and held hostage by ...,en,11000000.0,775398007.0,1977-05-25,8.2,16282.0,...,CarrieFisher,PeterCushing,Princess Leia is captured and held hostage by ...,1977,42147710.0,,popularmovie,highlyrated,1970,
2,5,12.0,Finding Nemo,"Nemo, an adventurous young clownfish, is unexp...",en,94000000.0,940335536.0,2003-05-30,7.8,15665.0,...,AlexanderGould,WillemDafoe,"Nemo, an adventurous young clownfish, is unexp...",2003,162848500.0,,popularmovie,,2000,
3,6,13.0,Forrest Gump,A man with a low IQ has accomplished great thi...,en,55000000.0,677387716.0,1994-07-06,8.5,21605.0,...,GarySinise,MykeltiWilliamson,A man with a low IQ has accomplished great thi...,1994,125414200.0,,popularmovie,highlyrated,1990,
4,7,14.0,American Beauty,"Lester Burnham, a depressed suburban father in...",en,15000000.0,356296601.0,1999-09-15,8.0,9769.0,...,ThoraBirch,WesBentley,"Lester Burnham, a depressed suburban father in...",1999,29361750.0,,,highlyrated,1990,


In [799]:
df2['year'].quantile(0.01)

1952.59

In [800]:
cols = ['bag_of_words', 'bigbudget', 'popularmovie','highlyrated','oldfilm']
df2['bag_of_words2'] = df2[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
df2.loc[2369]['bag_of_words2']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['bag_of_words2'] = df2[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)


'As the Avengers and their allies have continued to protect the world from threats too large for any one hero to handle, a new danger has emerged from the cosmic shadows: Thanos. A despot of intergalactic infamy, his goal is to collect all six Infinity Stones, artifacts of unimaginable power, and use them to inflict his twisted will on all of reality. Everything the Avengers have fought for has led up to this moment - the fate of Earth and existence itself has never been more uncertain. RobertDowneyJr. ChrisHemsworth MarkRuffalo ChrisEvans AnthonyRusso bigbudget popularmovie highlyrated '

In [801]:
df_short = df2[['title','bag_of_words2']]
df_short

Unnamed: 0,title,bag_of_words2
0,Four Rooms,It's Ted the Bellhop's first night on the job....
1,Star Wars,Princess Leia is captured and held hostage by ...
2,Finding Nemo,"Nemo, an adventurous young clownfish, is unexp..."
3,Forrest Gump,A man with a low IQ has accomplished great thi...
4,American Beauty,"Lester Burnham, a depressed suburban father in..."
...,...,...
2955,Gabriel's Inferno Part II,Professor Gabriel Emerson finally learns the t...
2956,The Kissing Booth 3,"It’s the summer before Elle heads to college, ..."
2957,Borat Subsequent Moviefilm,14 years after making a film about his journey...
2958,Outside the Wire,"In the near future, a drone pilot is sent into..."


**Preprocessing Data**
- Stop words (like, is, the, etc) are removed
- Text is cleaned (remove punctuation, all lower case, amongst others), to ensure apples-to-apples comparisons
- Lemmatization is performed (combine like words such as run and running)

In [802]:
def cleantext(df):
    whitespace_token = RegexpTokenizer("\s+|[\.?!-]|[\u2026]", gaps=True) #split on white space, (.?!), and (...)
    df['token_doc'] = df['bag_of_words2'].apply(whitespace_token.tokenize)
    stop_words = list(ENGLISH_STOP_WORDS)
    df['clean_doc'] = df['token_doc'].apply(lambda x: [word for word in x if word.lower() not in stop_words])
    
    def clean_data(data):
        lst = []
        for word in data:

            word = word.lower()
            word = re.sub(r'''[!()[\]{};?@#$%:\'\"\,.^&*_`]''','',word) #remove all punctuation 
            word = re.sub(r'\w*\d\w*','',word) #remove all number and words containing numbers
            word = re.sub(u'[\u201C]|[\u201D]','',word) #remove special "" with unicode
            word = re.sub(r'(^| ).( |$)','',word) #remove any character that is one unit in length

            lst.append(word)
        return lst

    def remove_white_space(data):
        while '' in data:
            data.remove('')
        return data

    df['clean_doc'] = df['clean_doc'].apply(clean_data)
    df['clean_doc'] = df['clean_doc'].apply(remove_white_space)
    
    #Lemmatize words to group like words (such as run and running)
    wordnet_lemmatizer = WordNetLemmatizer()
    df['clean_doc'] = df['clean_doc'].apply(lambda x: [wordnet_lemmatizer.lemmatize(word) for word in x])
    
    df_clean = df['clean_doc']
    
    df_clean = df_clean.apply(lambda x: ' '.join(map(str,x)))
    
    return df_clean

In [803]:
df_clean = cleantext(df_short)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['token_doc'] = df['bag_of_words2'].apply(whitespace_token.tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_doc'] = df['token_doc'].apply(lambda x: [word for word in x if word.lower() not in stop_words])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_doc'] = df['clean_

In [804]:
df_clean[2369]

'avenger ally continued protect world threat large hero handle new danger emerged cosmic shadow thanos despot intergalactic infamy goal collect infinity stone artifact unimaginable power use inflict twisted reality avenger fought led moment fate earth existence uncertain robertdowneyjr chrishemsworth markruffalo chrisevans anthonyrusso bigbudget popularmovie highlyrated'

In [805]:
df2.loc[2369]['bag_of_words2']

'As the Avengers and their allies have continued to protect the world from threats too large for any one hero to handle, a new danger has emerged from the cosmic shadows: Thanos. A despot of intergalactic infamy, his goal is to collect all six Infinity Stones, artifacts of unimaginable power, and use them to inflict his twisted will on all of reality. Everything the Avengers have fought for has led up to this moment - the fate of Earth and existence itself has never been more uncertain. RobertDowneyJr. ChrisHemsworth MarkRuffalo ChrisEvans AnthonyRusso bigbudget popularmovie highlyrated '

In [806]:
df_short.head()

Unnamed: 0,title,bag_of_words2,token_doc,clean_doc
0,Four Rooms,It's Ted the Bellhop's first night on the job....,"[It's, Ted, the, Bellhop's, first, night, on, ...","[it, ted, bellhop, night, job, hotel, unusual,..."
1,Star Wars,Princess Leia is captured and held hostage by ...,"[Princess, Leia, is, captured, and, held, host...","[princess, leia, captured, held, hostage, evil..."
2,Finding Nemo,"Nemo, an adventurous young clownfish, is unexp...","[Nemo,, an, adventurous, young, clownfish,, is...","[nemo, adventurous, young, clownfish, unexpect..."
3,Forrest Gump,A man with a low IQ has accomplished great thi...,"[A, man, with, a, low, IQ, has, accomplished, ...","[man, low, iq, accomplished, great, thing, lif..."
4,American Beauty,"Lester Burnham, a depressed suburban father in...","[Lester, Burnham,, a, depressed, suburban, fat...","[lester, burnham, depressed, suburban, father,..."


**Build initial Doc Term Matrix**

In [835]:
cv = CountVectorizer(max_df=.7)

doc_word_matrix_cv = cv.fit_transform(df_clean)
doc_term_cv = pd.DataFrame(doc_word_matrix_cv.toarray(), index = df_short['title'],columns=cv.get_feature_names())
vocab_cv = cv.get_feature_names()

In [836]:
#Ultimately, TFIDF is being used
tfidf = TfidfVectorizer(max_df=.7)

doc_word_matrix_tf = tfidf.fit_transform(df_clean)
doc_term = pd.DataFrame(doc_word_matrix_tf.toarray(), index = df_short['title'],columns=tfidf.get_feature_names())
vocab = tfidf.get_feature_names()

In [808]:
doc_term.head()

Unnamed: 0_level_0,aames,aang,aaranthomas,aaron,aaronblaise,aaronburns,aaroneckhart,aaronhann,aaronpaul,aaronruell,...,zorin,zorro,zoëbell,zoëkravitz,zula,zune,æon,émigré,ólafurdarriólafsson,óscarjaenada
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Four Rooms,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Star Wars,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Finding Nemo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Forrest Gump,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
American Beauty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [818]:
doc_term['worldwarii'].sort_values(ascending=False)

title
Enemy at the Gates              0.216692
Unbroken                        0.212004
Schindler's List                0.196124
The Way Back                    0.195955
Letters from Iwo Jima           0.193153
                                  ...   
In the Line of Fire             0.000000
Conan the Barbarian             0.000000
Thank You for Smoking           0.000000
Jerry Maguire                   0.000000
Zack Snyder's Justice League    0.000000
Name: worldwarii, Length: 2960, dtype: float64

In [809]:
doc_term.shape

(2960, 18801)

In [810]:
doc_term.iloc[2369][doc_term.iloc[2369] > 0][:10]

ally              0.131123
anthonyrusso      0.167844
artifact          0.172093
avenger           0.316788
bigbudget         0.092209
chrisevans        0.139783
chrishemsworth    0.142239
collect           0.153717
continued         0.183999
cosmic            0.172093
Name: Avengers: Infinity War, dtype: float64

**NMF (nonNegative Matrix Factorization) Dimensionality Reduction**

In [879]:
#Get optimal number of topics
from gensim.models import CoherenceModel
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.models.nmf import Nmf
import operator

texts = df_short['clean_doc']

# Create a dictionary
# In gensim a dictionary is a mapping between words and their integer id
dictionary = Dictionary(texts)

# Filter out extremes to limit the number of features
dictionary.filter_extremes(
    no_below=3,
    no_above=0.85,
    keep_n=5000
)

# Create the bag-of-words format (list of (token_id, token_count))
corpus = [dictionary.doc2bow(text) for text in texts]

# Create a list of the topic numbers we want to try
topic_nums = list(np.arange(5, 25 + 1, 1))

# Run the nmf model and calculate the coherence score
# for each number of topics
coherence_scores = []

#nmf = NMF(n_components=10, init=None,alpha=0)

for num in topic_nums:
    nmf_test = Nmf(
        corpus=corpus,
        num_topics=num,
        id2word=dictionary,
        chunksize=2000,
        passes=5,
        kappa=.1,
        minimum_probability=0.01,
        w_max_iter=300,
        w_stop_condition=0.0001,
        h_max_iter=100,
        h_stop_condition=0.001,
        eval_every=10,
        normalize=True,
        random_state=42
    )
    
    # Run the coherence model to get the score
    cm = CoherenceModel(
        model=nmf_test,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v'
    )
    
    coherence_scores.append(round(cm.get_coherence(), 5))

# Get the number of topics with the highest coherence score
scores = list(zip(topic_nums, coherence_scores))
best_num_topics = sorted(scores, key=operator.itemgetter(1), reverse=True)[0][0]

print(best_num_topics)

18


In [881]:
sorted(scores, key=operator.itemgetter(1), reverse=True)

[(18, 0.3047),
 (11, 0.30369),
 (13, 0.29965),
 (14, 0.29597),
 (12, 0.29072),
 (23, 0.29037),
 (17, 0.28958),
 (15, 0.28881),
 (9, 0.28751),
 (22, 0.2861),
 (25, 0.28578),
 (6, 0.28552),
 (24, 0.28415),
 (7, 0.28163),
 (10, 0.2774),
 (16, 0.27401),
 (20, 0.27386),
 (19, 0.27284),
 (8, 0.26693),
 (21, 0.26475),
 (5, 0.25635)]

In [885]:
nmf_terms=18
nmf = NMF(n_components=nmf_terms, init=None,alpha=0)

doc_topic_nmf = nmf.fit_transform(doc_term)

In [883]:
doc_topic_nmf.shape

(2960, 18)

In [884]:
doc_topic_nmf_df = pd.DataFrame(doc_topic_nmf, index = df_short['title']).add_prefix('Topic_')
doc_topic_nmf_df.head()

Unnamed: 0_level_0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,Topic_9,Topic_10,Topic_11,Topic_12,Topic_13,Topic_14,Topic_15,Topic_16,Topic_17
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Four Rooms,0.0,0.0,0.004585,0.005059,0.006952,0.000202,0.000691,0.002861,0.025751,0.001184,0.008217,0.007176,0.002171,0.004089,0.004173,0.001593,0.011805,0.0
Star Wars,0.0,0.074705,0.01291,0.0,0.0,0.022981,0.0,0.0,0.0,0.0,0.000868,0.0,0.0,0.00298,0.000229,0.001986,9.2e-05,0.008868
Finding Nemo,0.023456,0.009336,0.001617,0.0,0.0,0.0,0.0,0.0,0.003241,0.013759,0.001641,0.006514,0.003628,0.0,0.0,0.037552,0.0,0.0
Forrest Gump,0.0,0.006771,0.0,0.0,0.0,0.075125,0.062329,0.0,0.0,0.0,0.072243,0.0,0.0,0.0,0.0,0.0,0.0,0.0
American Beauty,0.0,0.0,0.0,0.0,0.0,0.0,0.122306,0.0,0.043769,0.0,0.0,0.0,0.0,0.0,0.0,0.081574,0.0,0.0


In [886]:
n_terms = 15     # Select the top 15 words in vocab for each topic.
for idx, topic in enumerate(nmf.components_):
    top_words = [vocab[i].upper() for i in topic.argsort()[:-n_terms-1:-1]]
    print(f"Topic {idx}:\n", ", ".join(top_words), "\n")

Topic 0:
 MAN, YOUNG, WOMAN, MEET, MYSTERIOUS, FIND, WAY, NYC, LOVE, HELP, SET, PAST, DREAM, END, COME 

Topic 1:
 EARTH, ALIEN, PLANET, HUMAN, CREW, RACE, SAVE, FORCE, FUTURE, SPACE, SCIENTIST, FIGHT, MISSION, POWERFUL, TEAM 

Topic 2:
 POLICE, COP, DRUG, CRIME, CRIMINAL, DETECTIVE, OFFICER, ANGELES, LOS, GANG, HEIST, PRISON, MURDER, STREET, EX 

Topic 3:
 SCHOOL, HIGH, STUDENT, GIRL, TEACHER, POPULAR, COLLEGE, SENIOR, CRUSH, CLASS, GRADUATION, SPIDER, JOCK, MIDDLE, PRINCIPAL 

Topic 4:
 FATHER, MOTHER, SON, DAUGHTER, WIFE, HOUSE, CHILD, SINGLE, YOUNG, HOME, TEENAGE, DISCOVERS, GIRL, BROTHER, RETURN 

Topic 5:
 YEAR, OLD, LATER, BOY, GIRL, RETURN, SECRET, MEET, REUNITES, OLDER, HE, GRADE, RELATIONSHIP, KILL, BILLY 

Topic 6:
 LIFE, LOVE, DEATH, JUST, CHANGE, MEET, CAREER, JACK, MAKE, HE, DOG, GET, COME, WAY, COUPLE 

Topic 7:
 POPULARMOVIE, BIGBUDGET, EVIL, BATTLE, HARRY, POWER, HIGHLYRATED, QUEEN, KING, PETER, PRINCESS, CAPTAIN, CARRIEFISHER, MARKHAMILL, SKYWALKER 

Topic 8:
 TOWN, S

In [914]:
doc_topic_nmf_df.sort_values('Topic_16',ascending=False).head(5)

Unnamed: 0_level_0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,Topic_9,Topic_10,Topic_11,Topic_12,Topic_13,Topic_14,Topic_15,Topic_16,Topic_17
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Atomic Blonde,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.251145,0.041427
Never Say Never Again,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.231739,0.0
Octopussy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.044934,0.0,0.033513,0.0,0.0,0.216422,0.032017
Live and Let Die,0.00311,0.0,0.00145,0.0,0.0,0.0,0.0,0.005378,0.000246,0.0,0.0,0.099284,0.005076,0.0,0.0,0.0,0.197255,0.0
From Russia with Love,0.000181,0.0,0.0,0.0,0.0,0.0,0.032958,0.0,0.0,0.0,0.0,0.0,0.003804,0.0,0.0,0.0,0.195831,0.0


In [905]:
doc_topic_nmf_df.idxmax()

Topic_0                         Let Me In
Topic_1                       Pacific Rim
Topic_2                          Triple 9
Topic_3                    21 Jump Street
Topic_4     Jackass Presents: Bad Grandpa
Topic_5                      The Fountain
Topic_6                                Up
Topic_7               Mississippi Burning
Topic_8               No Strings Attached
Topic_9                        Dark Skies
Topic_10                     Spider-Man 3
Topic_11                Coming to America
Topic_12                           Norbit
Topic_13                Girl, Interrupted
Topic_14                         Blade II
Topic_15                       Mamma Mia!
Topic_16                    Atomic Blonde
Topic_17                 We Were Soldiers
dtype: object

In [906]:
doc_topic_nmf_df.max()

Topic_0     0.076051
Topic_1     0.191359
Topic_2     0.191402
Topic_3     0.255784
Topic_4     0.227371
Topic_5     0.182196
Topic_6     0.212556
Topic_7     0.218403
Topic_8     0.286575
Topic_9     0.202674
Topic_10    0.171187
Topic_11    0.263397
Topic_12    0.270651
Topic_13    0.180696
Topic_14    0.417868
Topic_15    0.254443
Topic_16    0.251145
Topic_17    0.248181
dtype: float64

**Recommendation System**

In [907]:
doc_topic_nmf_df

Unnamed: 0_level_0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,Topic_9,Topic_10,Topic_11,Topic_12,Topic_13,Topic_14,Topic_15,Topic_16,Topic_17
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Four Rooms,0.000000,0.000000,0.004585,0.005059,0.006952,0.000202,0.000691,0.002861,0.025751,0.001184,0.008217,0.007176,0.002171,0.004089,0.004173,0.001593,0.011805,0.000000
Star Wars,0.000000,0.074705,0.012910,0.000000,0.000000,0.022981,0.000000,0.000000,0.000000,0.000000,0.000868,0.000000,0.000000,0.002980,0.000229,0.001986,0.000092,0.008868
Finding Nemo,0.023456,0.009336,0.001617,0.000000,0.000000,0.000000,0.000000,0.000000,0.003241,0.013759,0.001641,0.006514,0.003628,0.000000,0.000000,0.037552,0.000000,0.000000
Forrest Gump,0.000000,0.006771,0.000000,0.000000,0.000000,0.075125,0.062329,0.000000,0.000000,0.000000,0.072243,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
American Beauty,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.122306,0.000000,0.043769,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.081574,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Gabriel's Inferno Part II,0.000724,0.000000,0.000000,0.002748,0.003363,0.035487,0.019481,0.002373,0.011983,0.000000,0.007543,0.005259,0.016963,0.002392,0.000571,0.004107,0.002046,0.014096
The Kissing Booth 3,0.000000,0.000000,0.000000,0.018219,0.002916,0.000000,0.001690,0.000714,0.057710,0.002535,0.000000,0.004059,0.004615,0.000000,0.004530,0.000000,0.017905,0.000000
Borat Subsequent Moviefilm,0.029182,0.000000,0.000000,0.000000,0.048413,0.000900,0.058040,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.028326,0.000000,0.019707
Outside the Wire,0.002761,0.007892,0.021607,0.000000,0.000000,0.002290,0.000739,0.002122,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.021541,0.031946


**Experiment with PCA**
The data shows a very small explained variance ratio even for a large n components, so this technique is not ideal for this project

In [938]:
pca = PCA(n_components=100)

doc_topic_pca = pca.fit_transform(doc_term)
sum(pca.explained_variance_ratio_)

0.11278290465482035

In [939]:
doc_topic_pca_df = pd.DataFrame(doc_topic_pca, index = df_short['title']).add_prefix('Topic_')
doc_topic_pca_df.head()

Unnamed: 0_level_0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,Topic_9,...,Topic_90,Topic_91,Topic_92,Topic_93,Topic_94,Topic_95,Topic_96,Topic_97,Topic_98,Topic_99
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Four Rooms,-0.01873,-0.040939,0.008466,0.0046,-0.00435,-0.008939,-0.01495,-0.063486,0.002974,0.001319,...,0.033168,0.022712,-0.032262,0.023738,-0.017543,0.016023,-0.031439,-0.045096,-0.031385,-0.001976
Star Wars,0.100195,0.026686,0.027232,-0.033688,-0.047757,0.040707,-0.064752,0.00046,0.046207,0.09302,...,0.002912,-0.035425,-0.009562,0.007328,0.037331,0.085088,-0.009777,0.037506,-0.010305,0.027917
Finding Nemo,-0.00457,0.032222,-0.027813,0.030488,-0.058048,0.005325,0.02741,-0.016919,0.012415,0.014907,...,-0.02,0.012504,-0.031002,0.002064,0.009938,0.007649,-0.042979,-0.007331,0.006974,0.000225
Forrest Gump,-0.045621,0.060695,-0.033756,-0.132111,0.010251,0.06395,-0.01965,0.01358,0.046496,0.0535,...,0.025661,0.001675,-0.009396,0.014787,-0.005222,0.000692,-0.028732,-0.003237,-0.054386,0.029085
American Beauty,-0.101889,0.03044,-0.034876,-0.041452,0.090835,-0.037008,-0.001926,0.040107,0.043807,0.002734,...,-0.006731,-0.024435,-0.038157,0.00294,-0.014659,-0.016101,0.037118,-0.014888,-0.016561,-0.023223


**TFIDF CorEx Topic Modeling**

In [834]:
topic_anchors = [['highlyrated'],
                 ['popularmovie','bigbudget'],
                 ['comedy'],
                 ['worldwarii'],
                 ['vampire','werewolf']
                ]

corex_model = ct.Corex(n_hidden=20, words=vocab, seed=42,
                 anchors=topic_anchors, 
                 anchor_strength=10
                      )
corex_model.fit(doc_word_matrix_tf, words=vocab)

topics = corex_model.get_topics()

for n,topic in enumerate(topics):
    topic_words,_,_ = zip(*topic)
    print('Topic {}:'.format(n+1))
    print(', '.join(topic_words), '\n')

Topic 1:
humbert, shaun, truman, michaelmoore, greek, jims, stifler, bennettmiller, bush, salander 

Topic 2:
aladdin, ianmckellen, bilbo, jonathangroff, kristoff, rogermoore, selene, lycans, idinamenzel, annacathcart 

Topic 3:
rocky, katniss, sylvesterstallone, shrek, decepticons, autobots, creed, johng, avildsen, transformer 

Topic 4:
charliechaplin, tramp, wick, katiefeatherston, henryjoost, earl, terry, oldfilm, dumbledore, voldemorts 

Topic 5:
alfredhitchcock, sidney, mollyringwald, vienna, orsonwelles, celine, ralph, mollie, vincemarcello, jacobelordi 

Topic 6:
plant, fester, ll, foreman, cocktail, lighthouse, lost, caul, nada, freeway 

Topic 7:
jigsaw, smurfs, tobinbell, nemo, billywilder, luthor, costasmandylor, betsyrussell, gal, austin 

Topic 8:
thor, imperial, melody, garfield, lena, mermaid, aidangillen, commits, mulan, markhamill 

Topic 9:
myers, laurie, warren, halloween, sal, lorraine, robocop, haddonfield, jamieleecurtis, scully 

Topic 10:
chuck, troll, chipmunk

**Recommendation System**

In [832]:
##Teset out CorEx recommendation system
def recsystem_corex(x,num_matches):
    distarray = pairwise_distances(doc_topic_corex[x].reshape(1,-1),
                             doc_topic_corex,metric='cosine').argsort()[0][1:num_matches+1]
    df_out = df2[['title']].iloc[distarray,:]
    print('You Enjoyed: {} \n\nYou might like:\n\n {}'
          .format(df2['title'][x],tabulate(df_out, headers=['Idx:','Title:'],tablefmt="plain")))

recsystem_corex(2369,5)

You Enjoyed: Avengers: Infinity War 

You might like:

   Idx:  Title:
  1909  After Earth
  1911  Mr. Peabody & Sherman
  1912  Paranormal Activity 4
  1913  Fast & Furious 6
  1914  Cloud Atlas


In [660]:
def recsystem(x,num_matches):
    distarray = pairwise_distances(doc_topic_nmf[x].reshape(1,-1),
                             doc_topic_nmf,metric='cosine').argsort()[0][1:num_matches+1]
    df_out = df2[['title']].iloc[distarray,:]
    print('You Enjoyed: {} \n\nYou might like:\n\n {}'
          .format(df2['title'][x],tabulate(df_out, headers=['Idx:','Title:'],tablefmt="plain")))


In [909]:
recsystem(2369,10)

You Enjoyed: Avengers: Infinity War 

You might like:

   Idx:  Title:
   279  Harry Potter and the Order of the Phoenix
  2448  Rogue One: A Star Wars Story
   560  Hulk
   370  Toy Story
  2010  The Hobbit: The Battle of the Five Armies
  1958  Avengers: Age of Ultron
  2046  Star Wars: The Force Awakens
   163  Batman & Robin
  1161  Iron Man 2
  1453  The Return of Jafar
