In [34]:
import pandas as pd
import numpy as np
import string

import re

import nltk
from nltk.corpus import stopwords, util
from nltk.tokenize import word_tokenize

#import matplotlib.pyplot as plt
#import seaborn as sns

#import dask.dataframe as dd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity


In [35]:
def open_csv_files():
    """Load the critic-review table and the movie table from CSV.

    Both locations come from the module-level names ``critic_reviews_url`` and
    ``movies_url``; they are not defined inside this function and must exist
    before it is called.

    Returns:
        tuple: ``(critic_reviews, movies)``, each the result of ``pd.read_csv``.
    """
    # Tuple elements are evaluated left to right: reviews are read first, then movies.
    return pd.read_csv(critic_reviews_url), pd.read_csv(movies_url)
#critic_reviews, movies = open_csv_files()

In [36]:
def print_orig_columns_na(movies, critic_reviews):
    """Print per-column missing-value counts and row counts for both tables.

    Args:
        movies: DataFrame loaded from the movies CSV.
        critic_reviews: DataFrame loaded from the critic-reviews CSV.

    Returns:
        None. The summary is written to standard output only.
    """
    print("critic_reviews.csv NA count: ", critic_reviews.isna().sum())
    print("critic_reviews.csv length: ", len(critic_reviews))
    print(" ")
    print("movies.csv NA count: ", movies.isna().sum())
    print("movies.csv length: ", len(movies))
#print_orig_columns_na(movies, critic_reviews)

# Functions for Text Cleaning

In [37]:
def comma_to_space(text_df):
    """Return a copy of ``text_df`` with every comma replaced by a space.

    Each cell is converted with ``str()`` first, so non-string values
    (numbers, missing values) end up as their string representation.

    Args:
        text_df: DataFrame whose columns should be cleaned.

    Returns:
        A new DataFrame with the same shape and labels; the input is not modified.
    """
    cleaned = text_df.copy()
    for col in cleaned.columns:
        # ``Series.apply`` returns a new Series; it must be assigned back to take effect.
        cleaned[col] = cleaned[col].apply(lambda x: str(x).replace(',', ' '))
    return cleaned

In [38]:
def remove_spaces(text_df):
    """Return a copy of ``text_df`` with every space character removed.

    Each cell is converted with ``str()`` first, so non-string values
    (numbers, missing values) end up as their string representation.

    Args:
        text_df: DataFrame whose columns should be cleaned.

    Returns:
        A new DataFrame with the same shape and labels; the input is not modified.
    """
    cleaned = text_df.copy()
    for col in cleaned.columns:
        # ``Series.apply`` returns a new Series; it must be assigned back to take effect.
        cleaned[col] = cleaned[col].apply(lambda x: str(x).replace(' ', ''))
    return cleaned

In [39]:
def drop_movie_dups(df, movie_titles):
    """Return ``df`` without rows that repeat an earlier title.

    Args:
        df: DataFrame of movies.
        movie_titles: Column label, or list of labels, used to decide whether
            two rows are duplicates (passed as ``subset`` to ``drop_duplicates``).

    Returns:
        A new DataFrame keeping the first row of each duplicate group, with a
        fresh 0..n-1 index. The input frame is left untouched.
    """
    # Both calls return new objects, so the chained result is what must be returned.
    return df.drop_duplicates(subset=movie_titles).reset_index(drop=True)

In [40]:
#movies=drop_movie_dups(movies)

# Creating Count Vector

In [41]:
def define_count_features(df, feature_list=None):
    """Select the columns that feed the word-count matrix.

    Args:
        df: DataFrame of movies.
        feature_list: Column labels to keep. When omitted (``None``), the
            columns ``genres``, ``directors``, ``authors``, ``actors`` and
            ``production_company`` are used.

    Returns:
        A new DataFrame containing only the requested columns.
    """
    if feature_list is None:
        feature_list = ['genres', 'directors', 'authors', 'actors', 'production_company']
    # Copy so later cleaning steps never write into a view of the source frame.
    return df[list(feature_list)].copy()

In [42]:
#count_features = define_count_features(movies)
#count_features

In [43]:
def clean_count_features(count_features):
    """Normalise the text of every count-feature column.

    Steps applied to each column, in order:
      1. Missing values become a single space (removed again by step 2).
      2. Spaces are removed, so a multi-word entry becomes a single token.
      3. Commas become spaces, so the entries of a comma-separated list
         become space-separated tokens.
      4. Text is lower-cased.
      5. Every remaining character that is neither a word character nor
         whitespace is removed (this covers ``&``, ``.``, ``<`` and ``>``).

    Args:
        count_features: DataFrame of raw text columns.

    Returns:
        A new DataFrame of cleaned strings; the input is not modified.
    """
    clean_features = count_features.fillna(' ')
    for col in clean_features.columns:
        # Built-in ``str.replace`` takes no ``regex`` keyword; the pandas ``.str``
        # accessor does, and lets literal and pattern replacements be stated explicitly.
        clean_features[col] = (
            clean_features[col]
            .astype(str)
            .str.replace(' ', '', regex=False)
            .str.replace(',', ' ', regex=False)
            .str.lower()
            .str.replace(r'[^\w\s]+', '', regex=True)
        )
    return clean_features

In [44]:
#clean_features=clean_count_features(count_features)
#clean_features

In [45]:
def create_count_bow(clean_features):
    """Build one space-separated text string per row from the count-feature columns.

    Args:
        clean_features: DataFrame holding string columns ``genres``,
            ``directors``, ``authors``, ``actors`` and ``production_company``.

    Returns:
        A Series with one joined string per input row.
    """
    bow_columns = [
        'genres',
        'directors',
        'authors',
        'actors',
        'production_company',
    ]
    selected = clean_features[bow_columns]
    # Join the values of each row, in column order, with single spaces.
    return selected.agg(' '.join, axis='columns')

In [46]:
#count_bow = create_count_bow(clean_features)
#count_bow

In [47]:
def tokenize(bow):
    """Split each text in ``bow`` into tokens and lay them out as a table.

    Args:
        bow: Iterable of strings, each passed to ``word_tokenize``.

    Returns:
        A DataFrame with one row per text and one column per token position.
        ``None`` entries in the frame (positions past the end of shorter
        rows) are replaced by a single space.
    """
    rows_of_tokens = [word_tokenize(document) for document in bow]
    padded = pd.DataFrame(rows_of_tokens)
    # Replace the None padding so every cell holds a string.
    padded.replace(to_replace=[None], value=' ', inplace=True)
    return padded

In [48]:
#tokenized=tokenize(count_bow)
#tokenized

In [49]:
def count_vectorize(count_bow):
    """Turn the per-movie text into a dense term-count table.

    A ``CountVectorizer`` with default settings is fitted on ``count_bow`` and
    used to transform it.

    Args:
        count_bow: Iterable of strings, one per movie.

    Returns:
        A DataFrame with one row per input string and one column per
        vocabulary term (columns are labelled by integer position).
    """
    count_vectorizer = CountVectorizer()
    count_matrix = count_vectorizer.fit_transform(count_bow)
    # ``toarray`` densifies the sparse matrix so it can be wrapped in a DataFrame.
    soup_count_df = pd.DataFrame(count_matrix.toarray())
    return soup_count_df

In [50]:
#count_vector = count_vectorize(count_bow)
#count_vector

In [None]:
def count_vect_compact(soup_count_df):
    """Keep only the term columns that occur in more than one row (movie).

    Args:
        soup_count_df: DataFrame of term counts, one row per movie and one
            column per term.

    Returns:
        A DataFrame with the same rows, restricted to columns that have a
        positive count in at least two rows.
    """
    # Count rows containing the term, not total occurrences: a term repeated
    # inside a single movie must not qualify as shared between movies.
    movies_per_term = (soup_count_df > 0).sum(axis=0)
    return soup_count_df.loc[:, movies_per_term > 1]

In [None]:
#count_vect_compact=count_vect_compact(soup_count_df)
#count_vect_compact

### By keeping only the terms (columns) that appear in more than one movie, we avoid a lot of unnecessary computation.

In [None]:
# Disabled like the other exploratory cells: `soup_count_df` and
# `compact_count_vect` are never assigned at module level, so running these
# lines on a fresh kernel raises NameError before main() is reached.
# Assign both names first, e.g.
#   soup_count_df = count_vectorize(count_bow)
#   compact_count_vect = count_vect_compact(soup_count_df)
#print(soup_count_df.shape)
#print(compact_count_vect.shape)
#print(compact_count_vect.sum())

# Creating TF-IDF Vector

In [None]:
def define_tfidf_matrix(df):
    """Select the free-text column used for TF-IDF and fill missing values.

    Args:
        df: DataFrame of movies containing a ``movie_info`` column.

    Returns:
        A new single-column DataFrame (``movie_info``) in which missing values
        are replaced by a single space. The input frame is not modified.
    """
    tfidf_feature_list = ['movie_info']
    # ``fillna`` without ``inplace`` returns a new frame, so nothing is written
    # into a slice of ``df``.
    tfidf_matrix = df[tfidf_feature_list].fillna(' ')
    return tfidf_matrix

In [None]:
#tfidf_matrix = define_tfidf_matrix(movies)
#tfidf_matrix

In [None]:
def clean_tfidf_matrix(tfidf_matrix):
    """Strip punctuation from, and lower-case, the TF-IDF text.

    Relies on ``remove_punctuation`` and ``lowercase_text``, which are not
    defined in this function and must be available in the enclosing scope.
    NOTE(review): their exact behaviour is not visible here - confirm that
    ``remove_punctuation`` accepts what ``tfidf_matrix.apply`` passes to it and
    that ``lowercase_text`` accepts and returns a Series.

    Args:
        tfidf_matrix: pandas object holding the text to clean.

    Returns:
        The cleaned text as returned by ``lowercase_text``.
    """
    cleaned_tfidf_matrix = tfidf_matrix.apply(lambda x: remove_punctuation(x))
    cleaned_tfidf_matrix = pd.Series(cleaned_tfidf_matrix)
    cleaned_tfidf_matrix = lowercase_text(cleaned_tfidf_matrix)
    # Return the cleaned data, not this function object.
    return cleaned_tfidf_matrix

In [None]:
#clean_tfidf_matrix = clean_tfidf_matrix(movies['movie_info'])
#clean_tfidf_matrix

In [None]:
def create_tfidf_bow(clean_tfidf_matrix):
    """Count how often each whitespace-separated word occurs across all texts.

    Args:
        clean_tfidf_matrix: pandas object whose ``.values`` are strings.

    Returns:
        A Series indexed by word holding its number of occurrences, as
        produced by ``Series.value_counts``.
    """
    words = []
    for document in clean_tfidf_matrix.values.flatten():
        # ``split()`` with no argument splits on any run of whitespace.
        words.extend(document.split())
    return pd.Series(words).value_counts()

In [None]:
#tfidf_bow = create_tfidf_bow(clean_tfidf_matrix)
#tfidf_bow

In [None]:
def tfidf_vectorize(tfidf_bow):
    """Turn text documents into a dense TF-IDF table.

    A ``TfidfVectorizer`` (English stop words removed, 1- to 3-word n-grams)
    is fitted on ``tfidf_bow`` and used to transform it.

    Args:
        tfidf_bow: Iterable of text documents (strings), one per movie.

    Returns:
        A DataFrame with one row per document and one column per n-gram
        (columns are labelled by integer position), holding TF-IDF weights.
    """
    # Use the TF-IDF vectorizer; a plain CountVectorizer yields raw counts only.
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))
    tfidf_transform = tfidf_vectorizer.fit_transform(tfidf_bow)
    # ``toarray`` densifies the sparse matrix so it can be wrapped in a DataFrame.
    soup_tfidf_df = pd.DataFrame(tfidf_transform.toarray())
    return soup_tfidf_df

In [None]:
#soup_tfidf_df = tfidf_vectorize(tfidf_bow)

# Set Features for Model


In [None]:
# Columns combined into the word-count representation.
count_vec_features = [
    'genres',
    'directors',
    'authors',
    'actors',
    'production_company',
]
# Column used for the TF-IDF representation.
tfidf_vec_features = ['movie_info']
# Column(s) identifying a movie; used when dropping duplicate rows.
movie_titles = ['movie_title']

In [None]:
def main():
    """Run the count-vector pipeline from raw CSV files to the count matrix.

    Uses the module-level ``movie_titles`` and ``count_vec_features`` lists.

    Returns:
        The value returned by ``count_vectorize`` for the de-duplicated,
        cleaned movie features.
    """
    # Open and read the CSV files
    critic_reviews, movies = open_csv_files()

    # Print an overview of both files. Keyword arguments keep each table
    # matched to its own label regardless of parameter order.
    print_orig_columns_na(movies=movies, critic_reviews=critic_reviews)

    # Drop duplicate movies
    movies_no_dups = drop_movie_dups(movies, movie_titles)

    # Select the features used for the word-count matrix
    count_features = define_count_features(movies_no_dups, count_vec_features)

    # Text processing/cleaning for the count matrix
    clean_features = clean_count_features(count_features)

    # Create a 'Bag of Words' for each movie; this combines all included terms
    count_bow = create_count_bow(clean_features)

    # Create the count matrix
    count_vector = count_vectorize(count_bow)

    return count_vector


if __name__ == "__main__":
    main()