In [1]:
import numpy as np
import nltk
import pandas as pd

In [2]:
df = pd.read_csv("IMDB Dataset.csv")
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [3]:
df["sentiment"] = df['sentiment'].map({'positive': True, 'negative': False})
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,True
1,A wonderful little production. <br /><br />The...,True
2,I thought this was a wonderful way to spend ti...,True
3,Basically there's a family where a little boy ...,False
4,"Petter Mattei's ""Love in the Time of Money"" is...",True
...,...,...
49995,I thought this movie did a down right good job...,True
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",False
49997,I am a Catholic taught in parochial elementary...,False
49998,I'm going to have to disagree with the previou...,False


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(max_df = 0.2, min_df = 30, stop_words = 'english')
# max_df=0.2: Ignores terms that appear in more than 20% of the documents. This helps in excluding very frequent words that might not be as informative.
# min_df=30: Ignores terms that appear in fewer than 30 documents. This helps in excluding very rare words that might be noise.
counts = vec.fit_transform(df['review'])
count_df = pd.DataFrame(counts.toarray(), columns = vec.get_feature_names())
df = pd.concat((df, count_df), axis = 1)
df

Unnamed: 0,review,sentiment,00,000,007,01,02,06,10,100,...,zhang,zip,zoe,zombie,zombies,zone,zoo,zoom,zooms,zucker
0,One of the other reviewers has mentioned that ...,True,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,A wonderful little production. <br /><br />The...,True,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,I thought this was a wonderful way to spend ti...,True,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Basically there's a family where a little boy ...,False,0,0,0,0,0,0,1,0,...,0,0,0,2,0,0,0,0,0,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",True,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,I thought this movie did a down right good job...,True,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49997,I am a Catholic taught in parochial elementary...,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49998,I'm going to have to disagree with the previou...,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Applying PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=10)  # Set the number of components as needed
X_pca = pca.fit_transform(count_df)
df_pca = pd.concat((df, pd.DataFrame(X_pca)), axis=1)
df_pca

Unnamed: 0,review,sentiment,00,000,007,01,02,06,10,100,...,0,1,2,3,4,5,6,7,8,9
0,One of the other reviewers has mentioned that ...,True,0,0,0,0,0,0,0,0,...,0.635421,-0.853424,-0.748399,-0.229311,-0.493574,-0.408238,-0.127043,-0.225557,0.204506,0.192959
1,A wonderful little production. <br /><br />The...,True,0,0,0,0,0,0,0,0,...,-0.380733,0.385188,0.129295,0.005605,0.265840,-0.222088,-0.115685,-0.210122,0.211254,-1.002894
2,I thought this was a wonderful way to spend ti...,True,0,0,0,0,0,0,0,0,...,-0.573722,0.109908,-0.379651,-0.100513,-0.323901,0.723994,0.565901,0.375432,-0.335119,0.136415
3,Basically there's a family where a little boy ...,False,0,0,0,0,0,0,1,0,...,-0.934552,0.479360,-0.408158,-0.406340,0.120504,-0.364700,-0.380880,-0.396589,-0.299879,-0.877897
4,"Petter Mattei's ""Love in the Time of Money"" is...",True,0,0,0,0,0,0,0,0,...,0.257463,0.859553,0.109625,0.206001,-0.580174,-0.017826,0.140489,0.007432,0.527191,-0.010838
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,I thought this movie did a down right good job...,True,0,0,0,0,0,0,1,0,...,-0.479892,-0.420533,-0.774998,-0.012405,-0.665176,-0.045039,-0.674149,0.243496,-0.224960,-0.200972
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",False,0,0,0,0,0,0,0,0,...,-1.156799,-0.414914,0.027790,0.216556,0.359469,0.118673,0.082458,-0.143856,0.148203,0.264552
49997,I am a Catholic taught in parochial elementary...,False,0,0,0,0,0,0,0,0,...,-0.428613,1.458569,-0.804100,-0.524476,-0.440408,0.317250,-0.752774,-0.402811,-0.405850,-0.792014
49998,I'm going to have to disagree with the previou...,False,0,0,0,0,0,0,0,0,...,-0.649118,0.100880,-0.065424,-0.092037,0.412183,-0.095006,-0.330232,-0.195344,0.670670,0.033390


In [6]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df_pca, test_size=0.4)
X_train = train.drop(['review', 'sentiment'], axis=1)
y_train = train['sentiment']
X_test = test.drop(['review', 'sentiment'], axis=1)
y_test = test['sentiment']
X_train

Unnamed: 0,00,000,007,01,02,06,10,100,1000,101,...,0,1,2,3,4,5,6,7,8,9
13598,0,0,0,0,0,0,0,0,0,0,...,0.385071,0.542953,1.512732,0.154129,-0.188268,0.439644,-1.507501,-0.498220,-0.846876,0.182477
40614,0,0,0,0,0,0,1,0,0,0,...,0.429836,0.341970,-0.261764,-0.735616,0.561079,-0.525254,0.511080,1.006736,0.041915,0.725436
33492,0,0,0,0,0,0,0,0,0,0,...,-0.864165,1.007668,0.497966,0.244962,-0.081936,-0.488449,-0.016798,0.153533,0.438772,-0.160405
45991,0,0,0,0,0,0,0,0,0,0,...,1.762471,1.908704,0.741262,2.357170,2.577413,1.834209,-1.238296,-0.798319,-1.442833,-1.067492
3690,0,0,0,0,0,0,0,0,0,0,...,-0.672461,-0.081764,-0.128456,0.338169,0.916939,0.224062,-0.364265,-0.786725,-0.525025,0.117749
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26291,0,1,0,0,0,0,0,0,0,0,...,-0.118540,0.838628,-0.538859,0.193936,-0.994691,2.132356,1.768296,0.825949,-0.488849,-0.516641
33707,0,0,0,0,0,0,0,0,0,0,...,-1.357251,-0.487542,0.426915,-0.571429,0.121564,-0.042870,0.220803,-0.682841,-0.472240,-0.511121
17846,0,0,0,0,0,0,0,0,0,0,...,0.133476,0.157054,-0.491273,-0.363755,0.697120,0.038657,-0.378792,-0.221312,-0.253756,-0.447680
42112,0,0,0,0,0,0,0,0,0,0,...,-0.933801,-0.287284,-0.405424,-0.033615,-0.486353,-0.268505,-0.234650,-0.095195,-0.078648,0.246875


In [7]:
y_train = y_train.iloc[:, 0]
y_test = y_test.iloc[:, 0]
y_train

13598     True
40614     True
33492     True
45991     True
3690     False
         ...  
26291     True
33707    False
17846    False
42112    False
47760    False
Name: sentiment, Length: 30000, dtype: bool

In [8]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(max_iter = 5000)

In [9]:
LR.fit(X_train, y_train)

LogisticRegression(max_iter=5000)

In [10]:
score = LR.score(X_test, y_test)
print(score)

0.8663
