In [2]:
import pandas as pd
import numpy as np
import string

import re

import nltk
from nltk.corpus import stopwords, util
from nltk.tokenize import word_tokenize

#import matplotlib.pyplot as plt
#import seaborn as sns

import dask.dataframe as dd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
favorite_movie1='Pulp Fiction'
favorite_movie2='Django Unchained'
favorite_movie3='Dude Wheres my car?'
movies_url = 'https://raw.githubusercontent.com/robertrindos/Recommendation-System/main/rotten_tomatoes_movies.csv'
   

In [4]:
def open_critic_reviews_csv(critic_reviews_url):
    critic_reviews = pd.read_csv(critic_reviews_url)
    return critic_reviews

def open_movies_csv(movies_url):
    movies = pd.read_csv(movies_url)
    return movies
movies = open_movies_csv(movies_url)

In [5]:
def print_orig_columns_na(movies, critic_reviews):
    print("critic_reviews.csv NA ccount: ", critic_reviews.isna().sum())
    print("critic_reviews.csv length: ", len(critic_reviews))
    print(" ")
    print("movies.csv NA count: ", movies.isna().sum())
    print("movies.csv length: ", len(movies))
#print_orig_columns_na(movies, critic_reviews)

# Functions for Text Cleaning

In [6]:
def comma_to_space(text_df):
    for col in text_df:
        text_df[col].apply(lambda x: str(x).replace(',', ' '))
    return text_df

In [7]:
def remove_spaces(text_df):
    for col in text_df:
        text_df[col].apply(lambda x: str(x).replace(' ', ''))
    return text_df

In [8]:
def drop_movie_dups(df):
    df.drop_duplicates('movie_title').reset_index(drop=True, inplace=True)
    return df

In [9]:
movies=drop_movie_dups(movies)

# Creating Count Vector

In [10]:
def define_count_features(df, count_vec_features):
    count_vec_features = ['genres', 'directors', 'authors', 'actors', 'production_company']
    count_features = df[count_vec_features]
    return count_features

In [11]:
count_vec_features = ['genres', 'directors', 'authors', 'actors', 'production_company']
count_features = define_count_features(movies,count_vec_features)
#count_features

In [12]:
def clean_count_features(count_features):
    clean_features = count_features.fillna(' ', inplace=False)
    for col in clean_features:
        clean_features[col] = clean_features[col].apply(lambda x: str(x).replace(' ', ''))
        clean_features[col] = clean_features[col].apply(lambda x: str(x).replace(',', ' '))
        clean_features[col] = clean_features[col].apply(lambda x: str(x).lower())
        clean_features[col] = clean_features[col].apply(lambda x: str(x).replace('[^\w\s]+',''))
        clean_features[col] = clean_features[col].apply(lambda x: str(x).replace('&', ' '))
        clean_features[col] = clean_features[col].apply(lambda x: str(x).replace('.', ''))
        clean_features[col] = clean_features[col].apply(lambda x: str(x).replace('<', ''))
        clean_features[col] = clean_features[col].apply(lambda x: str(x).replace('>', ''))
    return clean_features

In [13]:
clean_features=clean_count_features(count_features)
#clean_features

In [14]:
def create_count_bow(clean_features):
    count_vec_features = ['genres', 'directors', 'authors', 'actors', 'production_company']
    count_bow = clean_features[count_vec_features].agg(' '.join, axis=1)
    return count_bow

In [15]:
count_bow = create_count_bow(clean_features)
#count_bow

In [16]:
def tokenize(bow):
    token_array=[]
    for text in bow:
        tokens=word_tokenize(text)
        token_array.append(tokens)
    token_df = pd.DataFrame(token_array)
    token_df.replace(to_replace=[None], value=' ', inplace=True)
    return token_df

In [17]:
#tokenized=tokenize(count_bow)
#tokenized

In [18]:
def count_vectorize(count_bow):
    count_vectorizer = CountVectorizer()
    count_vectorizer.fit(count_bow)
    count_transform = count_vectorizer.transform(count_bow)
    soup_count_array = count_transform.toarray()
    count_vector = pd.DataFrame(soup_count_array)
    return count_vector

In [19]:
count_vector = count_vectorize(count_bow)
#count_vector

In [25]:
def count_vect_compact(count_vect):
    compact_vector = count_vect.loc[:,(count_vect.sum(axis=0) > 1)]
    compact_vector = pd.DataFrame(compact_vector)
    return compact_vector

In [26]:
compact_vector=count_vect_compact(count_vector)


In [27]:
print(compact_vector)

       2       4       10      14      15      16      18      19      20      \
0           0       0       0       0       1       0       0       0       0   
1           0       0       0       0       0       0       0       0       0   
2           0       0       0       0       0       0       0       0       0   
3           0       0       0       0       0       0       0       0       0   
4           0       0       0       0       0       0       0       0       0   
...       ...     ...     ...     ...     ...     ...     ...     ...     ...   
17707       0       0       0       0       0       0       0       0       0   
17708       0       0       0       0       0       0       0       0       0   
17709       0       0       0       0       0       0       0       0       0   
17710       0       0       0       0       0       0       0       0       0   
17711       0       0       0       0       0       0       0       0       0   

       29      ...  223172 

### By only selecting terms (columns) that are included in more than one movie, we reduce a lot of unnecessary computations

# Recommendation Function

In [28]:
def movie_recommendation(title, cosine_sim):
    title=title.replace(' ', '',regex=True).lower()
    index=movies_no_dups.index
    titles=pd.Series(titles.replace(' ','',regex=True).lower())
    indices=pd.Series(index, index=titles)
    idx = indices[title]
    sim_list = list(enumerate(cosine_sim[idx]))
    sim_list = sorted(sim_list, key=lambda x: x[1], reverse=True)
    top_10_list = sim_list[1:11]
    movie_indices = [i[0] for i in top_10_list]
    return titles.iloc[movie_indices]

In [29]:
vector_ddf =dd.from_pandas(compact_vector, chunksize=10000)

In [None]:
cosine_sim = cosine_similarity(vector_ddf, vector_ddf)

In [None]:
favorite_movie1='Pulp Fiction'
recommendation_list1 = movie_recommendation(favorite_movie1, cosine_sim)
    print(recommendation_list1)

# Set Features for Model


In [29]:
def main():
    favorite_movie1='Pulp Fiction'
    favorite_movie2='Django Unchained'
    favorite_movie3='Dude Wheres my car?'
    movies_url = 'https://raw.githubusercontent.com/robertrindos/Recommendation-System/main/rotten_tomatoes_movies.csv'
    #critic_reviews_url = 'poop'
    
    # Opens and reads CSV files
    movies = open_movies_csv(movies_url)
    
    
    # Defining features and title
    count_vec_features = ['genres', 'directors', 'authors', 'actors', 'production_company']
    titles = movies['movie_title']
    
    # Drops duplicate values
    movies_no_dups = drop_movie_dups(movies)
    
    # Defines features for creating word-count matrix
    count_features = define_count_features(movies_no_dups, count_vec_features)
    
    # Text processing/cleaning for count matrix
    clean_features = clean_count_features(count_features)
    
    # Creates a 'Bag of Words' for each movie, this combines all inlcuded terms
    count_bow = create_count_bow(clean_features)
    
    # Creates the count matrix
    count_vector = count_vectorize(count_bow)
    
    compact_vector=count_vect_compact(count_vector)
    
    cosine_sim = cosine_similarity(compact_vector, compact_vector)
    
    recommendation_list1 = movie_recommendation(favorite_movie1, cosine_sim)
    print(recommendation_list1)
    
    recommendation_list2 = movie_recommendation(favorite_movie2, cosine_sim)
    print(recommendation_list2)
    
    recommendation_list3 = movie_recommendation(favorite_movie3, cosine_sim)
    print(recommendation_list3)

if __name__ == "__main__":
    main()

TypeError: 'DataFrame' object is not callable