In [None]:
#import libraries
import re
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# read in cleaned CSV
filepath = Path('../massive.csv')
massive = pd.read_csv(filepath)
# Work on a reproducible random subset: the fixed seed keeps every later split,
# fit and score identical across Restart & Run All, and capping n at the row
# count avoids the ValueError sample() raises when asked for more rows than exist.
massive = massive.sample(n=min(15000, len(massive)), random_state=42)

In [None]:
# Show the first row of the sampled frame to check which columns were loaded
massive.head(1)

In [None]:
# audienceScore goes out together with the identifier/metadata columns because it
# could unfairly help the model predict the delta. tomatoMeter is kept to give the
# model a base of where the delta could be.
massive = massive.drop(
    ['id', 'reviewId', 'creationDate', 'isTopCritic', 'reviewState', 'audienceScore'],
    axis='columns',
)

In [None]:
# Count the missing values in each remaining column
massive.isnull().sum()

In [None]:
# Keep only the first row for each distinct review text
massive = massive.drop_duplicates(subset=['reviewText'], keep='first')
# For every column, count the values that repeat an earlier value in that column
dup_df = massive.apply(pd.Series.duplicated).sum()
dup_df

In [None]:
# Histogram of review counts per publication on a 6x3 figure
sns.set(rc={'figure.figsize': (6, 3)})
sns.histplot(massive, x='publicatioName')

In [None]:
# Histogram of review counts per critic on a 6x3 figure
sns.set(rc={'figure.figsize': (6, 3)})
sns.histplot(massive, x='criticName')

In [None]:
# Histogram of the delta column using 100 bins
massive.hist(column='delta', bins=100)

In [None]:
# Text-cleaning helpers for the review text.
# The lemmatizer, stop-word set and pattern are created once here rather than on
# every call, because process_text is applied to each review individually.
lemmatizer = WordNetLemmatizer()
_STOP_WORDS = frozenset(stopwords.words('english'))
_NON_LETTERS = re.compile("[^a-zA-Z ]")


def process_text(text):
    """Turn one raw review into a space-separated string of lower-case lemmas.

    The text is stripped of every character that is not an ASCII letter or a
    space, lower-cased, tokenized, filtered against the English stop-word list
    and lemmatized.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example the float NaN that
        pandas uses for a missing review) is treated as an empty review.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; '' when nothing is left.
    """
    # A missing review would otherwise make the regex substitution raise TypeError.
    if not isinstance(text, str):
        return ''
    # Lower-case before lemmatizing: the WordNet lemmatizer only recognises
    # lower-case forms, so capitalised words would pass through unchanged.
    re_clean = _NON_LETTERS.sub('', text).lower()
    # Drop stop words before lemmatizing so that forms such as "was" are removed
    # instead of being reduced to a non-stop-word stem first.
    words = [word for word in word_tokenize(re_clean) if word not in _STOP_WORDS]
    lem = [lemmatizer.lemmatize(word) for word in words]
    # Filter once more in case a lemma itself is a stop word.
    output = ' '.join(word for word in lem if word not in _STOP_WORDS)
    return output

In [None]:
# Apply process_text to every entry of the reviewText column.
massive['reviewText'] = massive['reviewText'].apply(process_text)

In [None]:
# Inputs for the sentiment model: scoreSentiment labels as the target and TF-IDF
# features (vocabulary capped at 5000 terms) built from the cleaned review text.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
y = massive['scoreSentiment']
X = tfidf_vectorizer.fit_transform(massive['reviewText'])

In [None]:
# Split the data into training and testing sets for sentiment model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Fit one gradient-boosting classifier per candidate learning rate and print its
# accuracy on the training split and on the held-out split.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    GBmodel = GradientBoostingClassifier(n_estimators=100,
                                         learning_rate=learning_rate,
                                         max_features=2,
                                         max_depth=3,
                                         random_state=0)
    # The label Series is passed as-is: it is already one-dimensional, and
    # Series.ravel() is deprecated in recent pandas releases.
    GBmodel.fit(X_train, y_train)
    print("Learning rate: ", learning_rate)
    # Score the model
    print("Accuracy score (training): {0:.3f}".format(
        GBmodel.score(X_train, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
        GBmodel.score(X_test, y_test)))
    print()

# Keep the classifier fitted last (learning rate 1) available as `model` for
# scoring after the loop.
model = GBmodel

In [None]:
# Training accuracy of the classifier fitted last in the loop above (bound to GBmodel)
GBmodel.score(X_train, y_train)

In [None]:
# Held-out accuracy of the classifier fitted last in the loop above (bound to GBmodel)
GBmodel.score(X_test, y_test)

In [None]:
# Creating a dataframe made of the vectorized review text for the linear model.
# get_feature_names_out() is used because get_feature_names() was removed in
# scikit-learn 1.2; the rows reuse massive's index so they stay aligned with it.
massive_features = pd.DataFrame(X.toarray(),
                                columns=tfidf_vectorizer.get_feature_names_out(),
                                index=massive.index)

In [None]:
# Show the first two rows of the vectorized-word frame
massive_features.head(2)

In [None]:
# 'title' is also one of the word columns in massive_features, so the movie-title
# column is renamed with a trailing underscore.
massive = massive.rename({'title': 'title_'}, axis='columns')

In [None]:
# Vectorizing original df to a dense array for the linear model. The vectorizer
# was already fitted on this column, so transform() is enough; toarray() returns
# a plain ndarray rather than the np.matrix that todense() produces.
tfidf_dense = tfidf_vectorizer.transform(massive['reviewText']).toarray()

# Naming columns for each vectorized word and combining with original massive dataframe.
# DataFrame.join aligns rows on the index, and massive still carries the row labels
# it had in the sampled, de-duplicated file. The word frame therefore has to reuse
# massive.index; with a default 0..n-1 index most rows would not match and their
# word columns would come back as NaN.
new_cols = tfidf_vectorizer.get_feature_names_out()
combined = massive.join(pd.DataFrame(tfidf_dense, columns=new_cols, index=massive.index))

In [None]:
# The raw text is no longer needed once each vectorized word has its own column
combined = combined.drop('reviewText', axis='columns')

In [None]:
# Replace every remaining null in the combined frame with 0
# (this applies to all columns, not only the word columns)
combined = combined.fillna(value=0)

In [None]:
# Null counts for the first nine columns of the combined frame
combined.isnull().sum().head(9)

In [None]:
# Critics with fewer than 16 reviews are relabelled 'Other' so that they are
# encoded as the same critic later
counts = combined['criticName'].value_counts()
threshold = combined['criticName'].isin(counts[counts < 16].index)
combined.loc[threshold, 'criticName'] = 'Other'
combined['criticName'].value_counts()

In [None]:
# Same pooling for publications: those with fewer than 12 reviews become 'Other'
counts = combined['publicatioName'].value_counts()
threshold = combined['publicatioName'].isin(counts[counts < 12].index)
combined.loc[threshold, 'publicatioName'] = 'Other'
combined['publicatioName'].value_counts()

In [None]:
# Show the first row of the combined frame after pooling rare critics and publications
combined.head(1)

In [None]:
# Categorical columns to one-hot encode
categorical_cols = ['title_', 'criticName', 'publicatioName', 'scoreSentiment']
# Replace each of them with its indicator (dummy) columns
combined = pd.get_dummies(data=combined, columns=categorical_cols)

In [None]:
# The target is the delta column; every other column is a feature
y = combined['delta']
X = combined.drop(columns='delta')

In [None]:
# Total number of nulls across the whole feature matrix
X.isnull().sum().sum()

In [None]:
# Number of nulls in the target
y.isnull().sum()

In [None]:
# Hold out 25% of the rows for testing, using a fixed seed for the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Total number of nulls in the training features
X_train.isnull().sum().sum()

In [None]:
# Standardise X: the scaler is fitted on the training rows only and the same
# fitted scaler is then applied to both splits
sc = StandardScaler()
sc.fit(X_train)
X_train_sc = sc.transform(X_train)
X_test_sc = sc.transform(X_test)

In [None]:
# Project the scaled features onto 500 principal components. The projection is
# fitted on the training split and reused for the test split. PCA can select a
# randomized SVD solver depending on the input size, so the seed is fixed to keep
# the components reproducible between runs.
pca = PCA(n_components=500, random_state=42)
X_train_p = pca.fit_transform(X_train_sc)
X_test_p = pca.transform(X_test_sc)

In [None]:
# Train a gradient-boosting classifier on the PCA features for each candidate
# learning rate and print its accuracy on the training and held-out splits.
# NOTE(review): the target here is the delta column, so the classifier treats
# each distinct delta value as its own class and score() reports exact-match
# accuracy. Confirm that a classifier rather than a regressor is intended.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    GBmodel = GradientBoostingClassifier(n_estimators=100,
                                         learning_rate=learning_rate,
                                         max_features=2,
                                         max_depth=3,
                                         random_state=69)
    # The target Series is passed as-is: it is already one-dimensional, and
    # Series.ravel() is deprecated in recent pandas releases.
    GBmodel.fit(X_train_p, y_train)
    print("Learning rate: ", learning_rate)
    # Score the model
    print("Accuracy score (training): {0:.3f}".format(
        GBmodel.score(X_train_p, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
        GBmodel.score(X_test_p, y_test)))
    print()

# Keep the classifier fitted last (learning rate 1) available as `model` for
# scoring after the loop.
model = GBmodel

In [None]:
# Training accuracy of the classifier fitted last on the PCA features (bound to GBmodel)
GBmodel.score(X_train_p, y_train)

In [None]:
# Held-out accuracy of the classifier fitted last on the PCA features (bound to GBmodel)
GBmodel.score(X_test_p, y_test)

In [None]:
# Re-read the complete CSV (no sampling) from the same path used earlier
massive_full = pd.read_csv(filepath)

In [None]:
# Order the full dataset by ascending delta and keep its 20 lowest-delta rows
massive_full = massive_full.sort_values('delta')
massive_full_20 = massive_full.iloc[:20]

In [None]:
# Collect the 20 lowest-delta and the 20 highest-delta rows into one frame with a
# fresh 0..n-1 index
massive_full_sort = massive_full.sort_values('delta')
massive_full_asc_20 = massive_full_sort.iloc[:20]
massive_full_desc_20 = massive_full_sort.iloc[-20:]
massive_outliers = pd.concat(
    [massive_full_asc_20, massive_full_desc_20],
    ignore_index=True,
)

In [14]:
# Show the first row of massive to check its columns before comparing titles
massive.head(1)

Unnamed: 0,id,title,audienceScore,tomatoMeter,reviewId,creationDate,criticName,isTopCritic,reviewState,publicatioName,reviewText,scoreSentiment,delta
362289,duplicity_2009,Duplicity,37.0,65.0,1803847,2009-03-23,Peter Rainer,True,rotten,Christian Science Monitor,"For all the glam and swank, the film is essent...",NEGATIVE,28.0


In [15]:
# The title column of massive is named 'title_' once the rename cell has run and
# 'title' before that; use whichever is present so this cell works in either state
# instead of raising KeyError.
title_col = 'title_' if 'title_' in massive.columns else 'title'

# Create a list of titles from the massive DataFrame
existing_titles = massive[title_col].tolist()

# Use the isin() function to check if titles in massive_outliers exist in massive
mask = massive_outliers['title'].isin(existing_titles)

# Filter massive_outliers to keep only rows whose title does not exist in massive
massive_outliers = massive_outliers[~mask]

In [16]:
# Display the outlier rows that remain after removing titles present in massive
massive_outliers

Unnamed: 0,id,title,audienceScore,tomatoMeter,reviewId,creationDate,criticName,isTopCritic,reviewState,publicatioName,reviewText,scoreSentiment,delta
0,against_the_night,Against the Night,96.0,9.0,2467190,2018-03-25,Rob Hunter,False,fresh,Film School Rejects,A fun little horror movie making due on a clea...,POSITIVE,-87.0
1,finding_fatimah,Finding Fatimah,99.0,13.0,2389567,2017-04-19,Tim Coleman,False,fresh,Total Film,"There's real chemistry between the leads, and ...",POSITIVE,-86.0
2,bad_kids_of_crestview_academy_2017,Bad Kids of Crestview Academy,96.0,10.0,2633279,2019-10-07,Joey Keogh,False,fresh,Wicked Horror,"Bad Kids Of Crestview Academy is bright, obnox...",POSITIVE,-86.0
3,the_tutor_2023,The Tutor,92.0,7.0,102791069,2023-03-22,Julian Roman,False,fresh,MovieWeb,The Tutor suffers from major logic flaws&#46; ...,POSITIVE,-85.0
4,the_tutor_2023,The Tutor,92.0,7.0,102792039,2023-03-26,Jackie K. Cooper,False,fresh,jackiekcooper.com,Twisty enough to keep you interested till the ...,POSITIVE,-85.0
5,the_tutor_2023,The Tutor,92.0,7.0,102792473,2023-03-28,Joe Lipsett,False,fresh,Bloody Disgusting,It&#8217;s not the evocative queer suspense fi...,POSITIVE,-85.0
6,the_tutor_2023,The Tutor,92.0,7.0,102793721,2023-03-31,Tony Medley,False,fresh,The Larchmont Chronicle,This is a terrific little thriller that is inv...,POSITIVE,-85.0
7,the_tutor_2023,The Tutor,92.0,7.0,102792424,2023-03-28,Keith Garlington,False,fresh,Arkansas Democrat-Gazette,For those able to avoid the traps of over-crit...,POSITIVE,-85.0
8,mrs_chatterjee_vs_norway,Mrs. Chatterjee vs Norway,93.0,9.0,102789131,2023-03-16,Renuka Vyavahare,False,fresh,The Times of India,Amit Trivedi’s music embodies the spirit of a ...,POSITIVE,-84.0
9,redeeming_love,Redeeming Love,95.0,11.0,2853307,2022-01-20,Randy Myers,True,fresh,San Jose Mercury News,"""Love"" might be best embraced by the faithful,...",POSITIVE,-84.0
