In [1]:
import spacy
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Read the small IMDB review dataset from a CSV located relative to the
# current working directory.
# NOTE(review): the loaded frame carries an extra 'Unnamed: 0' column
# (presumably a row index that was saved into the CSV); only the 'review'
# and 'sentiment' columns are used further down.
df = pd.read_csv('imdb_dataset_small.csv')

In [3]:
# Preview the first rows to check which columns were loaded
df.head()

Unnamed: 0,review,sentiment
0,I really liked this Summerslam due to the look...,positive
1,Not many television shows appeal to quite as m...,positive
2,The film quickly gets to a major chase scene w...,negative
3,Jane Austen would definitely approve of this o...,positive
4,Expectations were somewhat high for me when I ...,negative


In [4]:
# Number of rows (one review per row) in the loaded dataset
len(df)

5000

In [5]:
# Initialize the spaCy English model.
# The preprocessing in this notebook only reads each token's lemma, stop-word
# flag and alphabetic flag, so the dependency parser and the named-entity
# recognizer are switched off at load time: their output is never used and
# skipping them makes every nlp(text) call cheaper. The components the
# lemmatizer depends on are left enabled, so the produced lemmas are unchanged.
nlp = spacy.load("en_core_web_lg", disable=["parser", "ner"])

In [16]:
# Function to preprocess text using spaCy
def preprocess(text):
    """Reduce a raw text to a string of lowercase lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Stop
    words and tokens that are not alphabetic are dropped; every remaining
    token contributes its lowercased lemma.

    Args:
        text: The raw text to clean.

    Returns:
        The kept lemmas joined by single spaces (an empty string when no
        token survives the filter).
    """
    kept_lemmas = []
    for tok in nlp(text):
        # Skip stop words and anything that is not an alphabetic token.
        if tok.is_stop or not tok.is_alpha:
            continue
        kept_lemmas.append(tok.lemma_.lower())
    return " ".join(kept_lemmas)

In [12]:
# Hold out 20% of the reviews for evaluation; the fixed random_state keeps
# the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

Note: the next step runs every review through the spaCy pipeline, so it may take a while to complete on the full dataset.

In [13]:
# Build two bag-of-words encoders for the same split: one that sees the raw
# reviews and one that first cleans each review with preprocess().
# Passing a custom preprocessor replaces CountVectorizer's built-in
# preprocessing step; tokenization is still done by the vectorizer.
vectorizer_unprocessed = CountVectorizer()
vectorizer_processed = CountVectorizer(preprocessor=preprocess)

# Learn each vocabulary from the training reviews only ...
X_train_unprocessed = vectorizer_unprocessed.fit_transform(X_train)
X_train_processed = vectorizer_processed.fit_transform(X_train)

# ... then reuse the fitted vocabularies to encode the held-out reviews.
X_test_unprocessed = vectorizer_unprocessed.transform(X_test)
X_test_processed = vectorizer_processed.transform(X_test)

In [14]:
# Baseline: multinomial Naive Bayes fitted on counts from the raw reviews
# (fit() returns the estimator itself, so construction and training chain).
clf_unprocessed = MultinomialNB().fit(X_train_unprocessed, y_train)
y_pred_unprocessed = clf_unprocessed.predict(X_test_unprocessed)

# Overall accuracy first, then the per-class precision/recall/F1 table
print("Results without preprocessing:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_unprocessed)}")
print(classification_report(y_test, y_pred_unprocessed))

Results without preprocessing:
Accuracy: 0.82
              precision    recall  f1-score   support

    negative       0.81      0.85      0.83       506
    positive       0.84      0.79      0.81       494

    accuracy                           0.82      1000
   macro avg       0.82      0.82      0.82      1000
weighted avg       0.82      0.82      0.82      1000



In [15]:
# Same classifier, this time fitted on counts from the spaCy-cleaned reviews
# (fit() returns the estimator itself, so construction and training chain).
clf_processed = MultinomialNB().fit(X_train_processed, y_train)
y_pred_processed = clf_processed.predict(X_test_processed)

# Overall accuracy first, then the per-class precision/recall/F1 table
print("\nResults with preprocessing:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_processed)}")
print(classification_report(y_test, y_pred_processed))


Results with preprocessing:
Accuracy: 0.819
              precision    recall  f1-score   support

    negative       0.80      0.85      0.83       506
    positive       0.84      0.79      0.81       494

    accuracy                           0.82      1000
   macro avg       0.82      0.82      0.82      1000
weighted avg       0.82      0.82      0.82      1000

