In [91]:
import pandas as pd
import numpy as np
import string

import re

import nltk
from nltk.corpus import stopwords, util
from nltk.tokenize import word_tokenize

#import matplotlib.pyplot as plt
#import seaborn as sns

#import dask.dataframe as dd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity


In [92]:
def open_critic_reviews_csv(critic_reviews_url):
    critic_reviews = pd.read_csv(critic_reviews_url)
    return critic_reviews

def open_movies_csv(movies_url):
    movies = pd.read_csv(movies_url)
    return movies
movies = open_movies_csv(movies_url)

In [93]:
def print_orig_columns_na(movies, critic_reviews):
    print("critic_reviews.csv NA ccount: ", critic_reviews.isna().sum())
    print("critic_reviews.csv length: ", len(critic_reviews))
    print(" ")
    print("movies.csv NA count: ", movies.isna().sum())
    print("movies.csv length: ", len(movies))
#print_orig_columns_na(movies, critic_reviews)

# Functions for Text Cleaning

In [94]:
def comma_to_space(text_df):
    for col in text_df:
        text_df[col].apply(lambda x: str(x).replace(',', ' '))
    return text_df

In [95]:
def remove_spaces(text_df):
    for col in text_df:
        text_df[col].apply(lambda x: str(x).replace(' ', ''))
    return text_df

In [96]:
def drop_movie_dups(df):
    df.drop_duplicates('movie_title').reset_index(drop=True, inplace=True)
    return df

In [97]:
movies=drop_movie_dups(movies)

# Creating Count Vector

In [98]:
def define_count_features(df, feature_list):
    count_feature_list = ['genres', 'directors', 'authors', 'actors', 'production_company']
    count_features = df[count_feature_list]
    return count_features

In [99]:
#count_features = define_count_features(movies)
#count_features

In [100]:
def clean_count_features(count_features):
    clean_features = count_features.fillna(' ', inplace=False)
    for col in clean_features:
        clean_features[col] = clean_features[col].apply(lambda x: str(x).replace(' ', ''))
        clean_features[col] = clean_features[col].apply(lambda x: str(x).replace(',', ' '))
        clean_features[col] = clean_features[col].apply(lambda x: str(x).lower())
        clean_features[col] = clean_features[col].apply(lambda x: str(x).replace('[^\w\s]+',''))
        clean_features[col] = clean_features[col].apply(lambda x: str(x).replace('&', ' '))
        clean_features[col] = clean_features[col].apply(lambda x: str(x).replace('.', ''))
        clean_features[col] = clean_features[col].apply(lambda x: str(x).replace('<', ''))
        clean_features[col] = clean_features[col].apply(lambda x: str(x).replace('>', ''))
    return clean_features

In [101]:
#clean_features=clean_count_features(count_features)
#clean_features

In [102]:
def create_count_bow(clean_features):
    count_features = ['genres', 'directors', 'authors', 'actors', 'production_company']
    count_bow = clean_features[count_features].agg(' '.join, axis=1)
    return count_bow

In [103]:
#count_bow = create_count_bow(clean_features)
#count_bow

In [104]:
def tokenize(bow):
    token_array=[]
    for text in bow:
        tokens=word_tokenize(text)
        token_array.append(tokens)
    token_df = pd.DataFrame(token_array)
    token_df.replace(to_replace=[None], value=' ', inplace=True)
    return token_df

In [105]:
#tokenized=tokenize(count_bow)
#tokenized

In [106]:
def count_vectorize(count_bow):
    count_vectorizer = CountVectorizer()
    count_vectorizer.fit(count_bow)
    count_transform = count_vectorizer.transform(count_bow)
    soup_count_array = count_transform.toarray()
    count_vector = pd.DataFrame(soup_count_array)
    return count_vector

In [107]:
count_vector = count_vectorize(count_bow)
count_vector

NameError: name 'count_bow' is not defined

In [108]:
def count_vect_compact(count_vect):
    count_vect_compact = soup_count_df.loc[:,(soup_count_df.sum(axis=0) > 1)]
    return count_vect_compact

In [109]:
#count_vect_compact=count_vect_compact(soup_count_df)
#count_vect_compact

### By only selecting terms (columns) that are included in more than one movie, we reduce a lot of unnecessary computations

# Creating TF-IDF Vector

In [111]:
def define_tfidf_matrix(df):
    tfidf_feature_list = ['movie_info']
    tfidf_features = df[tfidf_feature_list]
    tfidf_features.fillna(' ', inplace=True, axis=0)
    tfidf_matrix = df[tfidf_features]
    return tfidf_matrix

In [112]:
#tfidf_matrix = define_tfidf_matrix(movies)
#tfidf_matrix

In [113]:
def clean_tfidf_matrix(tfidf_matrix):
    cleaned_tfidf_matrix = tfidf_matrix.apply(lambda x: remove_punctuation(x))
    cleaned_tfidf_matrix = pd.Series(cleaned_tfidf_matrix)
    cleaned_tfidf_matrix = lowercase_text(cleaned_tfidf_matrix)
    return clean_tfidf_matrix

In [114]:
#clean_tfidf_matrix = clean_tfidf_matrix(movies['movie_info'])
#clean_tfidf_matrix

In [115]:
def create_tfidf_bow(clean_tfidf_matrix):
    tfidf_bow = pd.Series([y for x in clean_tfidf_matrix.values.flatten() for y in x.split()]).value_counts()
    return tfidf_bow

In [116]:
#tfidf_bow = create_tfidf_bow(clean_tfidf_matrix)
#tfidf_bow

In [117]:
def tfidf_vectorize(tfidf_bow):
    tfidf_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 3))
    tfidf_vectorizer.fit(tfidf_bow)
    tfidf_transform = tfidf_vectorizer.transform(tfidf_bow)
    soup_tfidf_array = tfidf_transform.toarray()
    soup_tfidf_df = pd.DataFrame(soup_tfidf_array)
    return soup_tfidf_df

In [118]:
#soup_tfidf_df = tfidf_vectorize(tfidf_bow)

# Recommendation Function

In [119]:
def movie_recommendation(title, cosine_sim):
    title=title.replace(' ', '',regex=True).lower()
    index=movies_no_dups.index
    titles=pd.Series(titles.replace(' ','',regex=True).lower())
    indices=pd.Series(index, index=titles)
    idx = indices[title]
    sim_list = list(enumerate(cosine_sim[idx]))
    sim_list = sorted(sim_list, key=lambda x: x[1], reverse=True)
    top_10_list = sim_list[1:11]
    movie_indices = [i[0] for i in top_10_list]
    return titles.iloc[movie_indices]

# Set Features for Model


In [122]:
def main():
    favorite_movie='Pulp Fiction'
    movies_url = 'https://raw.githubusercontent.com/robertrindos/Recommendation-System/main/rotten_tomatoes_movies.csv'
    #critic_reviews_url = 'poop'
    
    # Opens and reads CSV files
    movies = open_movies_csv(movies_url)
    
    
    # Defining features and title
    count_vec_features = ['genres', 'directors', 'authors', 'actors', 'production_company']
    titles = movies['movie_title']
    
    # Drops duplicate values
    movies_no_dups = drop_movie_dups(movies)
    
    # Defines features for creating word-count matrix
    count_features = define_count_features(movies_no_dups, count_vec_features)
    
    # Text processing/cleaning for count matrix
    clean_features = clean_count_features(count_features)
    
    # Creates a 'Bag of Words' for each movie, this combines all inlcuded terms
    count_bow = create_count_bow(clean_features)
    
    # Creates the count matrix
    count_vector = count_vectorize(count_bow)
    
    compact_vector=count_vect_compact(count_vector)
    
    cosine_sim = cosine_similarity(compact_vector, compact_vector)
    
    recommendation_list = movie_recommendation(favorite_movie, cosine_sim)
    print(recommendation_list)

if __name__ == "__main__":
    main()

MemoryError: Unable to allocate 29.5 GiB for an array with shape (17712, 223206) and data type int64