In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import re
import string
import os
import pickle
import nltk
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('punkt')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.




In [5]:
import os
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
os.chdir("/content/drive/My Drive/Colab Notebooks/scripts")

In [5]:
def clean_text(script_list):
  cleaned_text = []
  for script in tqdm(script_list):
    script_tokens = word_tokenize(script)
    cleaned_movie = ""
    
    for word in script_tokens:
      if word not in string.punctuation and word not in stopwords.words():
            cleaned_movie += word
            cleaned_movie += " "
    cleaned_text.append(cleaned_movie)
    
  return cleaned_text

In [6]:
def bag_of_words(text_data, stop_list = None, max_doc_freq = 1.0):
    cv = CountVectorizer(stop_words = stop_list, max_df = max_doc_freq)
    bow = cv.fit_transform(text_data)
    words = cv.get_feature_names()
    return bow, words

In [7]:
def get_relevant_documents(query_title, n = 10, mdf = 1.0):
    # titles, script_text = create_corpus()
    # cleaned_scripts = clean_text(script_text)

    title_idx = movie_titles.index(query_title.lower())
    if (title_idx == -1):
        return []

    bow, words = bag_of_words(cleaned_scripts, max_doc_freq = mdf)
    vector_space = bow.toarray()/np.sum(bow.toarray(), axis = 0)

    query = np.zeros(len(vector_space[0]))
    query_script = cleaned_scripts[title_idx]
    for word in query_script.split():
        if word in words:
            query[words.index(word)] += 1
    query /= np.sum(bow.toarray(), axis = 0)


    similarity = np.zeros(len(vector_space))
    for i, row in enumerate(vector_space):
        similarity[i] = cosine_similarity(row.reshape(1,-1), query.reshape(1,-1))
        similarity[i] += cosine_similarity(movie_sentiment[i].reshape(1, -1), movie_sentiment[title_idx].reshape(1, -1))

    weighted_similarity = similarity / np.sort(similarity)[::-1][0]
    top_n_movies = [movie_titles[idx] for idx in np.argsort(weighted_similarity)[::-1][1:n+1]]

    return top_n_movies

Load Movies

In [8]:
movie_scripts = []
movie_titles = []
for file_name in os.listdir():
  with open(file_name, "r") as movie_script:
    movie_scripts.append(movie_script.read())
    movie_titles.append(file_name[:-4].lower())

In [9]:
cleaned_scripts = clean_text(movie_scripts)


100%|██████████| 95/95 [2:02:20<00:00, 77.27s/it]


Sentiment Analysis

In [10]:
sentiment_analyzer = SentimentIntensityAnalyzer()
movie_sentiment = []
for title, script in tqdm(zip(movie_titles, cleaned_scripts)):
  sentiment_dict = sentiment_analyzer.polarity_scores(script)
  sentiment_vector = np.array([sentiment_dict['neg'], sentiment_dict['neu'], sentiment_dict['pos']])  # negative, neutral, positive
  movie_sentiment.append(sentiment_vector)

95it [01:33,  1.02it/s]


Get recommendations and save

In [11]:
rec_dict = {}
for title in tqdm(movie_titles):
  rec_dict[title] = get_relevant_documents(title, 5)

with open('rec_dict_final.pickle', 'wb') as handle:
    pickle.dump(rec_dict, handle)

100%|██████████| 95/95 [26:50<00:00, 16.96s/it]


In [7]:
with open('rec_dict_final.pickle', 'rb') as handle:
    rec_dict = pickle.load(handle)
    print(rec_dict)

{'honeydripper': ['sweet-smell-of-success', 'passion-of-joan-of-arc,-the', 'christ-complex', "all-the-king's-men", 'wag-the-dog'], 'colombiana': ['mud', 'sweet-hereafter,-the', 'avengers,-the', 'syriana', 'law-abiding-citizen'], 'addams-family,-the': ['life-as-a-house', 'syriana', 'law-abiding-citizen', 'man-on-the-moon', 'funny-people'], 'mighty-morphin-power-rangers:-the-movie': ['elephant-man,-the', 'guardians-of-the-galaxy-vol-2', 'innerspace', 'american-milkshake', 'man-on-the-moon'], 'above-the-law': ['life-as-a-house', 'deer-hunter,-the', 'law-abiding-citizen', 'colombiana', 'syriana'], 'law-abiding-citizen': ['spider-man', 'life-as-a-house', 'above-the-law', 'indiana-jones-and-the-raiders-of-the-lost-ark', 'syriana'], 'sweet-smell-of-success': ["all-the-king's-men", 'backdraft', 'drive', 'christ-complex', 'bringing-out-the-dead'], 'omega-man': ['elephant-man,-the', 'guardians-of-the-galaxy-vol-2', 'innerspace', 'american-milkshake', 'man-on-the-moon'], 'star-wars:-the-phantom-m