In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import sklearn
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

# Load the labelled movie-review data from the CSV next to the notebook.
df = pd.read_csv(filepath_or_buffer="MovieReviewTrainingDatabase.csv")

# Preview the first rows (head() defaults to 5).
df.head()

Unnamed: 0,sentiment,review
0,Positive,With all this stuff going down at the moment w...
1,Positive,'The Classic War of the Worlds' by Timothy Hin...
2,Negative,The film starts with a manager (Nicholas Bell)...
3,Negative,It must be assumed that those who praised this...
4,Positive,Superbly trashy and wondrously unpretentious 8...


In [2]:
#Here I am converting Positive and Negative sentiment into 1 and 0 respectively. I thought this would be easier to interpret.

def conv_sentiment(sentiment):
    """Map a sentiment label to a binary integer.

    Returns 1 when ``sentiment`` is exactly the string "Positive" and 0 for
    every other value (the comparison is case-sensitive).
    """
    return 1 if sentiment == "Positive" else 0

# Encode the text label as an integer column, one conv_sentiment call per row.
df["sentiment_binary"] = df["sentiment"].map(conv_sentiment)

# Inspect the frame while both the text and the integer label are present.
df.info()

# The text label is redundant once the binary column exists.
df = df.drop(columns="sentiment")

df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   sentiment         25000 non-null  object
 1   review            25000 non-null  object
 2   sentiment_binary  25000 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 586.1+ KB


Unnamed: 0,review,sentiment_binary
0,With all this stuff going down at the moment w...,1
1,'The Classic War of the Worlds' by Timothy Hin...,1
2,The film starts with a manager (Nicholas Bell)...,0
3,It must be assumed that those who praised this...,0
4,Superbly trashy and wondrously unpretentious 8...,1


In [3]:
# Lower-case every review; assign() builds a new frame, which is rebound to df.
df = df.assign(review=df["review"].str.lower())

df.head()

Unnamed: 0,review,sentiment_binary
0,with all this stuff going down at the moment w...,1
1,'the classic war of the worlds' by timothy hin...,1
2,the film starts with a manager (nicholas bell)...,0
3,it must be assumed that those who praised this...,0
4,superbly trashy and wondrously unpretentious 8...,1


In [4]:
#Remove punctuation and numbers, as well as leading and trailing whitespace. I also tokenize and remove stopwords.

def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loading it only once."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        # Reading the corpus and building the set is loop-invariant work, so
        # do it on first use and reuse it for every later review.
        cached = frozenset(stopwords.words("english"))
        _english_stopwords._cache = cached
    return cached


def clean(review):
    """Normalise one review string for vectorisation.

    Steps, in order:
      1. drop every character for which ``str.isdigit()`` is true;
      2. strip leading/trailing whitespace and tokenize with ``word_tokenize``;
      3. drop tokens that exactly match an NLTK English stopword
         (matching is case-sensitive; this notebook lower-cases reviews first);
      4. delete every character that is not a word character, whitespace,
         ``!`` or ``-``, then turn each ``-`` into a space;
      5. collapse runs of whitespace to single spaces and trim the ends.

    Step 5 removes the gaps left behind when a punctuation-only token is
    deleted or a hyphen is replaced, so the result never contains irregular
    spacing between words.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The cleaned review, words separated by single spaces.
    """
    no_digits = ''.join(ch for ch in review if not ch.isdigit())
    tokens = word_tokenize(no_digits.strip())
    stop_words = _english_stopwords()
    kept = ' '.join(tok for tok in tokens if tok not in stop_words)
    no_punct = re.sub(r'[^\w\s\!-]', "", kept)
    no_punct = re.sub(r'[-]', " ", no_punct)
    # split()/join collapses the empty slots left by removed tokens and hyphens.
    return ' '.join(no_punct.split())

# clean() is deliberately applied twice: per the original author's note, one
# pass could leave irregular whitespace between words, and a second pass was
# chosen as the workaround.
for _pass in range(2):
    df['review'] = df['review'].apply(clean)

# Spot-check a single cleaned review (row label 31).
print(df.loc[31, 'review'])

simon pegg plays rude crude often control celebrity journalist brought england work big american magazine course winning ways create sorts complications amusing fact based comedy co stars kristen dunst looking rather grown danny huston jeff bridges works primarily like simon pegg despite bad behavior completely understand kristen dunst continues talk despite frequent screw ups liked film end nice way cap evening sitting couch watching movies


In [5]:
#Vectorizing data for tfidf, splitting data

from sklearn.feature_extraction.text import TfidfVectorizer

# Features are the cleaned review strings (still plain text at this point);
# labels are the 0/1 sentiment codes.
X = df["review"]
y = df["sentiment_binary"]

# Hold out 20% of the rows for testing. The split is stratified on y and uses
# a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

# Fit the TF-IDF vectorizer on the training reviews only, then reuse the
# fitted vectorizer to encode the test reviews.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [7]:
from sklearn.linear_model import LogisticRegression #logistic reg model
from sklearn.metrics import accuracy_score

# L1-penalised logistic regression on the TF-IDF features.
# random_state is fixed (same seed as the train/test split) because the
# liblinear solver shuffles the data, so an unseeded fit is not reproducible.
model = LogisticRegression(penalty='l1', solver='liblinear', C=1.0, random_state=2)
model.fit(X_train_vectorized, y_train)

# Score on the held-out test split.
y_pred = model.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy:.4f}")

Logistic Regressionm Accuracy: 0.8712


In [8]:
!pip install flask joblib

Collecting flask
  Downloading flask-3.0.3-py3-none-any.whl.metadata (3.2 kB)
Collecting Werkzeug>=3.0.0 (from flask)
  Downloading werkzeug-3.0.4-py3-none-any.whl.metadata (3.7 kB)
Collecting itsdangerous>=2.1.2 (from flask)
  Downloading itsdangerous-2.2.0-py3-none-any.whl.metadata (1.9 kB)
Collecting blinker>=1.6.2 (from flask)
  Downloading blinker-1.8.2-py3-none-any.whl.metadata (1.6 kB)
Downloading flask-3.0.3-py3-none-any.whl (101 kB)
Downloading blinker-1.8.2-py3-none-any.whl (9.5 kB)
Downloading itsdangerous-2.2.0-py3-none-any.whl (16 kB)
Downloading werkzeug-3.0.4-py3-none-any.whl (227 kB)
Installing collected packages: Werkzeug, itsdangerous, blinker, flask
Successfully installed Werkzeug-3.0.4 blinker-1.8.2 flask-3.0.3 itsdangerous-2.2.0


In [9]:
import joblib

# Persist the fitted classifier to disk; dump() returns the written filename(s).
joblib.dump(value=model, filename='logistic_regression_model_sentimentPrediction.pkl')

['logistic_regression_model_sentimentPrediction.pkl']

In [10]:
# Persist the fitted TF-IDF vectorizer to disk as well.
joblib.dump(value=vectorizer, filename='tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']