In [13]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from pathlib import Path
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from sklearn.linear_model import LogisticRegression

In [48]:
# read in cleaned CSV
filepath = Path('massive.csv')
massive = pd.read_csv(filepath)
massive = massive.sample(n=1000)

In [49]:
# define a function to process text on the data
lemmatizer = WordNetLemmatizer()
import re
def process_text(text): 
    sw = set(stopwords.words('english')) 
    regex = re.compile("[^a-zA-Z ]") 
    re_clean = regex.sub('', text) 
    words = word_tokenize(re_clean) 
    lem = [lemmatizer.lemmatize(word) for word in words] 
    output = ' '.join([word.lower() for word in lem if word.lower() not in sw]) 
    return output

In [50]:
# use a lambda x function to apple process text on the whole column.
massive['reviewText'] = massive['reviewText'].apply(lambda x: process_text(x))

In [51]:
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X = tfidf_vectorizer.fit_transform(massive['reviewText'])
y = massive['scoreSentiment']

In [52]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Choose a machine learning model (e.g., Logistic Regression) and train it
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [53]:
model.score(X_train, y_train)

0.744

In [54]:
model.score(X_test, y_test)

0.692

In [55]:
massive_features = pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names())



In [59]:
combined = pd.concat([massive, massive_features])

In [60]:
combined.head()

Unnamed: 0,id,title,audienceScore,tomatoMeter,reviewId,creationDate,criticName,isTopCritic,reviewState,publicatioName,...,year,yes,yet,york,youd,youll,young,younger,youre,youve
139125,vaxxed_from_cover_up_to_catastrophe_2016,Vaxxed: From Cover-Up to Catastrophe,84.0,38.0,2324052.0,2016-04-29,Mick LaSalle,True,fresh,San Francisco Chronicle,...,,,,,,,,,,
520634,harts_war,Hart's War,48.0,60.0,809945.0,2002-11-07,James Rocchi,False,fresh,Netflix,...,,,,,,,,,,
548346,million_dollar_arm,Million Dollar Arm,68.0,64.0,2203414.0,2014-05-16,Travis Hopson,False,rotten,Examiner.com,...,,,,,,,,,,
227293,there_will_be_blood,There Will Be Blood,86.0,91.0,1692485.0,2007-11-28,Joshua Tyler,False,fresh,CinemaBlend,...,,,,,,,,,,
97683,hobo_with_a_shotgun,Hobo With a Shotgun,57.0,66.0,1988815.0,2011-05-30,Kelly Vance,False,fresh,East Bay Express,...,,,,,,,,,,
