In [141]:
# Imports

import numpy as np
import pandas as pd

## Load a dataset

In [142]:
# Fetch the tab-separated movie-review corpus from GitHub into a DataFrame,
# then show it as the cell's result.
df = pd.read_csv(
    'https://raw.githubusercontent.com/renatomaaliw3/public_files/refs/heads/master/Data%20Sets/NLP/moviereviews.tsv',
    sep='\t',
)
df

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...
...,...,...
1995,pos,"i like movies with albert brooks , and i reall..."
1996,pos,it might surprise some to know that joel and e...
1997,pos,the verdict : spine-chilling drama from horror...
1998,pos,i want to correct what i wrote in a former ret...


In [143]:
# Row count of the freshly loaded frame (one row per review)

df.shape[0]

2000

In [144]:
# Peek at the full text of the review stored under index label 0

print(df.loc[0, 'review'])

how do films like mouse hunt get into theatres ? 
isn't there a law or something ? 
this diabolical load of claptrap from steven speilberg's dreamworks studio is hollywood family fare at its deadly worst . 
mouse hunt takes the bare threads of a plot and tries to prop it up with overacting and flat-out stupid slapstick that makes comedies like jingle all the way look decent by comparison . 
writer adam rifkin and director gore verbinski are the names chiefly responsible for this swill . 
the plot , for what its worth , concerns two brothers ( nathan lane and an appalling lee evens ) who inherit a poorly run string factory and a seemingly worthless house from their eccentric father . 
deciding to check out the long-abandoned house , they soon learn that it's worth a fortune and set about selling it in auction to the highest bidder . 
but battling them at every turn is a very smart mouse , happy with his run-down little abode and wanting it to stay that way . 
the story alternates betwee

In [145]:
# Peek at the full text of the review stored under index label 1

print(df.loc[1, 'review'])

some talented actresses are blessed with a demonstrated wide acting range while others , almost as gifted , have more limited types of parts for which they are suitable . 
as was amply evident after basic instinct , sharon stone can play sensual roles with great abandon . 
rejecting her natural abilities , she has spent the rest of her entire career trying with little success to play against type . 
gloria is her latest disaster . 
babe ruth didn't quit baseball after one season to play football in a quixotic quest to prove his athletic dexterity , and neither should stone reject what she does best . 
janeane garofalo , for example , is no less wonderful an actress because she could have never pulled off stone's part in basic instinct ; neither is stone any less talented because she couldn't do garofalo's comedic roles . 
gloria , directed by respected director sidney lumet and adapted by steve antin from the 1980 screenplay by john cassavetes , was not screened in advance for critics 

In [146]:
# Count the missing (NaN) entries in each column

df.isna().sum()

Unnamed: 0,0
label,0
review,35


In [147]:
# Discard, in place, every row that has a NaN in any column
# (the check above shows the NaNs are all in `review`)

df.dropna(axis=0, how='any', inplace=True)

In [148]:
# Re-run the per-column NaN count to confirm nothing is missing any more

df.isna().sum()

Unnamed: 0,0
label,0
review,0


In [149]:
df.shape[0]  # rows left after dropping NaN reviews

1965

In [150]:
# How about blank reviews?
#
# Collect the index labels of every review that is empty or consists only of
# whitespace. The vectorized `.str` accessor replaces a row-by-row
# `itertuples()` loop, which had three weaknesses:
#   * it unpacked exactly (index, label, review) and therefore broke as soon
#     as the frame had any other number of columns;
#   * it relied on `str.isspace()`, which is False for the empty string '';
#   * it raised on any non-string (e.g. NaN) review.
# Here a missing review compares unequal to '' and is simply not flagged.

blanks = df.index[df['review'].str.strip().eq('')].tolist()

blanks

[57,
 71,
 147,
 151,
 283,
 307,
 313,
 323,
 343,
 351,
 427,
 501,
 633,
 675,
 815,
 851,
 977,
 1079,
 1299,
 1455,
 1493,
 1525,
 1531,
 1763,
 1851,
 1905,
 1993]

In [151]:
# Drop the blank reviews by their index labels, then report the remaining row count

df.drop(index=blanks, inplace=True)
df.shape[0]

1938

In [152]:
# Train-Test Split

from sklearn.model_selection import train_test_split

# Features are the review texts; targets are the 'neg' / 'pos' labels.
X, y = df['review'], df['label']

# Hold out 30% of the rows for evaluation, with a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [153]:
# Pipeline for Vectorization

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Chain TF-IDF vectorization and a linear SVM so that raw review strings can
# be handed straight to fit() / predict().
# LinearSVC's solver draws on a random number generator, so random_state is
# pinned (same seed as the train/test split) to make the fit repeatable
# across Restart & Run All.
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC(random_state=42)),
])
text_clf.fit(X_train, y_train)

In [154]:
# Predictions

# Run the fitted pipeline on the held-out reviews; the bare name on the last
# line displays the resulting array of predicted labels as the cell output.
predictions = text_clf.predict(X_test)
predictions

array(['neg', 'pos', 'pos', 'neg', 'neg', 'neg', 'neg', 'pos', 'neg',
       'neg', 'neg', 'neg', 'pos', 'neg', 'pos', 'neg', 'neg', 'pos',
       'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'pos', 'pos', 'neg',
       'pos', 'pos', 'pos', 'neg', 'pos', 'neg', 'pos', 'neg', 'pos',
       'pos', 'neg', 'pos', 'pos', 'neg', 'pos', 'neg', 'pos', 'neg',
       'neg', 'neg', 'pos', 'pos', 'pos', 'neg', 'neg', 'neg', 'pos',
       'neg', 'pos', 'pos', 'pos', 'pos', 'pos', 'neg', 'pos', 'neg',
       'pos', 'pos', 'neg', 'pos', 'pos', 'pos', 'neg', 'neg', 'pos',
       'neg', 'pos', 'pos', 'pos', 'pos', 'pos', 'neg', 'pos', 'pos',
       'pos', 'pos', 'neg', 'neg', 'pos', 'neg', 'neg', 'neg', 'neg',
       'neg', 'neg', 'neg', 'neg', 'neg', 'pos', 'pos', 'neg', 'pos',
       'neg', 'pos', 'pos', 'neg', 'pos', 'pos', 'pos', 'neg', 'pos',
       'pos', 'neg', 'pos', 'neg', 'neg', 'neg', 'neg', 'neg', 'pos',
       'neg', 'neg', 'neg', 'neg', 'neg', 'pos', 'pos', 'neg', 'pos',
       'pos', 'pos',

In [155]:
# Metrics

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Print the confusion matrix, the per-class report and the overall accuracy,
# each starting on its own line, through a single print call.
print(
    confusion_matrix(y_test, predictions),
    classification_report(y_test, predictions),
    accuracy_score(y_test, predictions),
    sep='\n',
)

[[235  47]
 [ 41 259]]
              precision    recall  f1-score   support

         neg       0.85      0.83      0.84       282
         pos       0.85      0.86      0.85       300

    accuracy                           0.85       582
   macro avg       0.85      0.85      0.85       582
weighted avg       0.85      0.85      0.85       582

0.8487972508591065


In [156]:
# Sample Prediction

# Classify one ad-hoc sentence with the fitted pipeline. The sentence is
# wrapped in a list because predict() is given a collection of texts, as it
# was for X_test above.
text_clf.predict(['The movie is bad as hell'])

array(['neg'], dtype=object)