In [1]:
#import libraries
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from gensim.models import Word2Vec
from gensim.test.utils import datapath
from gensim import utils
import gensim
import multiprocessing
import os
from time import time
# CPU count reported by the multiprocessing module.
# NOTE(review): `cores` is not referenced by any visible cell — presumably meant
# for a `workers=` argument (e.g. Word2Vec); confirm it is still needed.
cores = multiprocessing.cpu_count()

In [2]:
# To notify when cell is complete (may need to comment out first line)
# !pip uninstall jupyternotify -y
!pip install git+https://github.com/cphyc/jupyter-notify.git
%reload_ext jupyternotify

Collecting git+https://github.com/cphyc/jupyter-notify.git
  Cloning https://github.com/cphyc/jupyter-notify.git to c:\users\athen\appdata\local\temp\pip-req-build-gt8j7ame
  Resolved https://github.com/cphyc/jupyter-notify.git to commit 8cff958cbd3f00f7e4eb59b457f9f915e2ddff37
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'


  Running command git clone --filter=blob:none --quiet https://github.com/cphyc/jupyter-notify.git 'C:\Users\Athen\AppData\Local\Temp\pip-req-build-gt8j7ame'


<IPython.core.display.Javascript object>

In [16]:
# Read the cleaned CSV and keep a random subset of at most SAMPLE_SIZE rows.
# RANDOM_STATE fixes which rows are drawn: without it every run samples a
# different subset, so none of the outputs below could be reproduced.
SAMPLE_SIZE = 15000
RANDOM_STATE = 42  # same seed value the train/test split uses

filepath = Path('../massive.csv')
massive = pd.read_csv(filepath)
# sample() raises ValueError when n exceeds the number of rows, so cap it.
massive = massive.sample(n=min(SAMPLE_SIZE, len(massive)), random_state=RANDOM_STATE)

In [17]:
# Show a single sampled row to check which columns were loaded
massive.head(1)

Unnamed: 0,id,title,audienceScore,tomatoMeter,reviewId,creationDate,criticName,isTopCritic,reviewState,publicatioName,reviewText,scoreSentiment,delta
747149,the_rocket,The Rocket,83.0,95.0,2160564,2013-08-25,Ed Gibbs,False,fresh,The Sunday Age,"A hugely enjoyable, thoroughly infectious feat...",POSITIVE,12.0


In [18]:
# Remove identifier / metadata columns that are not used as features.
# audienceScore is removed as well because it could unfairly help the model
# predict the delta; tomatoMeter stays so the model has a baseline for where
# the delta could be.
massive = massive.drop(
    labels=['id', 'reviewId', 'creationDate', 'isTopCritic', 'reviewState', 'audienceScore'],
    axis='columns',
)

In [19]:
# Missing-value count for every remaining column
massive.isna().sum()

title             0
tomatoMeter       0
criticName        0
publicatioName    0
reviewText        0
scoreSentiment    0
delta             0
dtype: int64

In [20]:
# Keep only the first row for each distinct review text
massive = massive.drop_duplicates(subset=['reviewText'], keep='first')
# Per column: how many values repeat an earlier value in that same column
# (reviewText should now report 0)
dup_df = massive.apply(pd.Series.duplicated).sum()
dup_df

title              7266
tomatoMeter       14889
criticName        11876
publicatioName    13530
reviewText            0
scoreSentiment    14987
delta             14868
dtype: int64

In [None]:
# Tokenize and encode the text data using BERT tokenizer
# Each review becomes a list of token ids, with the tokenizer's special tokens added.
# NOTE(review): `tokenizer` is not created in any visible cell and `torch` is not
# among the imports in the first cell; the recorded run of this cell stopped with
# a NameError on `tokenizer`. Both must be set up before this cell can run.
encoded_data = [tokenizer.encode(text, add_special_tokens=True) for text in massive['reviewText']]

# Pad sequences to the same length: every list is right-padded with id 0 up to
# the length of the longest one, so the nested list is rectangular.
# NOTE(review): assumes 0 is the tokenizer's padding id and that no attention
# mask is needed downstream — TODO confirm.
max_len = max([len(seq) for seq in encoded_data])
padded_data = [seq + [0] * (max_len - len(seq)) for seq in encoded_data]

# Convert to PyTorch tensors
input_ids = torch.tensor(padded_data)

NameError: name 'tokenizer' is not defined

In [None]:
%%notify
# Forward pass through the BERT model to get embeddings
# Runs under torch.no_grad() so no gradients are tracked for this call.
# NOTE(review): `model` is expected to be a BERT model here, but no visible cell
# creates it, and a later cell rebinds `model` to a LinearRegression — re-running
# this cell after that one would call the wrong object.
# NOTE(review): `embeddings` is not used by any visible cell.
with torch.no_grad():
    embeddings = model(input_ids)[0]  # The [0] index gets the embeddings from the model output

In [None]:
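# Creating a dataframe made of the vectorized review text for the linear model
# NOTE: this step is commented out, and `X` / `tfidf_vectorizer` are not defined in any visible cell,
# so nothing in the notebook as shown builds the `combined` frame that the following cells use.
# (`get_feature_names()` was removed in scikit-learn 1.2; use `get_feature_names_out()` when restoring this line.)
# massive_features = pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names())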

In [None]:
# The review text is represented by one column per vectorized word, so the raw
# string column is removed.
# NOTE(review): the recorded run of this cell failed with NameError because
# `combined` had not been created.
combined = combined.drop('reviewText', axis=1)

NameError: name 'combined' is not defined

In [238]:
# Replace every missing value with 0. Intended for the word columns, but note
# that fillna is applied to the whole frame, not only to those columns.
combined = combined.fillna(value=0)

In [None]:
# Missing-value count per column, limited to the first nine columns
combined.isna().sum().iloc[:9]

title_            0
tomatoMeter       0
criticName        0
publicatioName    0
scoreSentiment    0
delta             0
abandon           0
ability           0
able              0
dtype: int64

In [None]:
# Critics with fewer than 16 reviews are relabelled 'Other' so that they are
# encoded as one shared category later.
counts = combined['criticName'].value_counts()
# Boolean mask: True for rows whose critic falls below the cut-off
threshold = combined['criticName'].isin(counts[counts < 16].index)
combined.loc[threshold, 'criticName'] = 'Other'
# Review the resulting label distribution
combined.criticName.value_counts()

Other              8954
Dennis Schwartz     111
David Nusair         94
Roger Ebert          89
Roger Moore          81
                   ... 
Ella Taylor          16
Scott Foundas        16
Dann Gire            16
David Noh            16
Kim Newman           16
Name: criticName, Length: 214, dtype: int64

In [None]:
# Same treatment for publications: any publication with fewer than 12 reviews
# is relabelled 'Other'.
counts = combined['publicatioName'].value_counts()
# Boolean mask: True for rows whose publication falls below the cut-off
threshold = combined['publicatioName'].isin(counts[counts < 12].index)
combined.loc[threshold, 'publicatioName'] = 'Other'
# Review the resulting label distribution
combined.publicatioName.value_counts()

Other                 4148
New York Times         180
Variety                166
Los Angeles Times      138
Slant Magazine         134
                      ... 
The Ringer              12
Three Movie Buffs       12
Film Companion          12
Baltimore Sun           12
culturevulture.net      12
Name: publicatioName, Length: 333, dtype: int64

In [271]:
# Preview one row of the modelling frame.
# NOTE(review): the recorded output already contains one-hot columns although
# the get_dummies cell appears further down — the cells were executed out of order.
combined.head(1)

Unnamed: 0,tomatoMeter,delta,abandon,ability,able,abrams,absence,absolute,absolutely,absorbing,...,publicatioName_culturevulture.net,publicatioName_eFilmCritic.com,publicatioName_eye WEEKLY,publicatioName_film-authority.com,publicatioName_indieWire,publicatioName_jackiekcooper.com,publicatioName_rachelsreviews.net,publicatioName_rec.arts.movies.reviews,scoreSentiment_NEGATIVE,scoreSentiment_POSITIVE
328556,22.0,-10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# Categorical columns to expand into one indicator column per distinct value
categorical_cols = [
    'title_',
    'criticName',
    'publicatioName',
    'scoreSentiment',
]
# Replace those columns in `combined` with their one-hot encoded form
combined = pd.get_dummies(data=combined, columns=categorical_cols)

In [273]:
# Target is the `delta` column; every other column is a feature
y = combined['delta']
X = combined.drop('delta', axis=1)

In [274]:
# Total number of missing values across the whole feature matrix
X.isna().sum().sum()

0

In [284]:
# First five target values
y.head(5)

328556   -10.0
477873    19.0
709010     9.0
340594     8.0
720402   -31.0
Name: delta, dtype: float64

In [292]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [293]:
# Total number of missing values in the training features
X_train.isna().sum().sum()

0

In [283]:
# Standardise the features. The scaler is fitted on the training rows only and
# the same fitted scaler is then applied to both splits.
# NOTE(review): this cell's recorded execution count (283) is lower than the
# split cell's (292), i.e. it was last run before the split shown above.
sc = StandardScaler().fit(X_train)
X_train_sc = sc.transform(X_train)
X_test_sc = sc.transform(X_test)

In [339]:
# Project the scaled features onto their leading principal components.
# The projection is fitted on the training rows only and then applied unchanged
# to the test rows.
N_COMPONENTS = 2000
pca = PCA(
    # PCA rejects n_components larger than min(n_samples, n_features), so cap
    # the request at what the training matrix can actually support.
    n_components=min(N_COMPONENTS, *X_train_sc.shape),
    # Fixes the outcome when scikit-learn selects its randomized SVD solver;
    # without a seed the components (and the scores below) can vary per run.
    random_state=42,
)
X_train_p = pca.fit_transform(X_train_sc)
X_test_p = pca.transform(X_test_sc)

In [340]:
# Fit a linear regression on the PCA-reduced training features to predict delta
# (this is a regression task: LinearRegression, not logistic regression).
# NOTE(review): this rebinds `model`, the same name the embedding cell above
# calls as a BERT model.
model = LinearRegression()
model.fit(X_train_p, y_train)

LinearRegression()

In [341]:
# R^2 (coefficient of determination) of the fitted model on the training split
model.score(X_train_p, y_train)

0.5509262446941581

In [342]:
# R^2 on the held-out test split; in the recorded run this is well below the
# training score, i.e. the model fits the training rows better than unseen rows.
model.score(X_test_p, y_test)

0.3442473356373843