In [1]:
import numpy as np
import nltk
import pandas as pd

In [2]:
# Read the review dataset from a CSV file in the working directory and
# display the resulting frame.
df = pd.read_csv(
    "IMDB Dataset.csv",
)
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [3]:
# Encode the text labels as booleans: 'positive' -> True, 'negative' -> False.
# The dtype guard makes the cell safe to re-run: mapping an already-boolean
# column through the string lookup would turn every label into NaN.
if not pd.api.types.is_bool_dtype(df["sentiment"]):
    encoded = df["sentiment"].map({'positive': True, 'negative': False})
    # Series.map() yields NaN for any value missing from the lookup, so fail
    # loudly instead of letting unknown labels leak into training.
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = df.loc[unmapped, "sentiment"].unique()
        raise ValueError(f"unexpected sentiment labels: {unexpected!r}")
    df["sentiment"] = encoded.astype(bool)
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,True
1,A wonderful little production. <br /><br />The...,True
2,I thought this was a wonderful way to spend ti...,True
3,Basically there's a family where a little boy ...,False
4,"Petter Mattei's ""Love in the Time of Money"" is...",True
...,...,...
49995,I thought this movie did a down right good job...,True
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",False
49997,I am a Catholic taught in parochial elementary...,False
49998,I'm going to have to disagree with the previou...,False


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Build a bag-of-words term-document matrix from the review text.
#   max_df=0.2: ignores terms that appear in more than 20% of the documents,
#               excluding very frequent words that may not be informative.
#   min_df=30:  ignores terms that appear in fewer than 30 documents,
#               excluding very rare words that may be noise.
vec = CountVectorizer(max_df = 0.2, min_df = 30, stop_words = 'english')
counts = vec.fit_transform(df['review'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on older installations.
if hasattr(vec, "get_feature_names_out"):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()

# NOTE: toarray() materialises the full dense documents-by-terms matrix.
# The index is copied from `df` so the column-wise concat below lines rows up
# by position even if `df` does not carry a default RangeIndex.
count_df = pd.DataFrame(counts.toarray(), columns = feature_names, index = df.index)

# NOTE: re-running this cell appends the count columns to `df` a second time.
# NOTE(review): a vocabulary term spelled 'review' or 'sentiment' would create
# a second column with that label; the later cells appear to rely on the
# original columns coming first - confirm before changing the column layout.
df = pd.concat((df, count_df), axis = 1)
df

Unnamed: 0,review,sentiment,00,000,007,01,02,06,10,100,...,zhang,zip,zoe,zombie,zombies,zone,zoo,zoom,zooms,zucker
0,One of the other reviewers has mentioned that ...,True,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,A wonderful little production. <br /><br />The...,True,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,I thought this was a wonderful way to spend ti...,True,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Basically there's a family where a little boy ...,False,0,0,0,0,0,0,1,0,...,0,0,0,2,0,0,0,0,0,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",True,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,I thought this movie did a down right good job...,True,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49997,I am a Catholic taught in parochial elementary...,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49998,I'm going to have to disagree with the previou...,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
from sklearn.model_selection import train_test_split

# Fixed seed so that a fresh kernel reproduces the same split (and therefore
# the same score) on every run.
SPLIT_SEED = 42

# Hold out 40% of the rows for evaluation.
train, test = train_test_split(df, test_size=0.4, random_state=SPLIT_SEED)

# Features: everything except the raw text and label columns, i.e. only the
# term-document matrix. Note that drop() removes every column carrying one of
# these labels.
X_train = train.drop(['review', 'sentiment'], axis=1)
y_train = train['sentiment']
X_test = test.drop(['review', 'sentiment'], axis=1)
y_test = test['sentiment']

X_train

Unnamed: 0,00,000,007,01,02,06,10,100,1000,101,...,zhang,zip,zoe,zombie,zombies,zone,zoo,zoom,zooms,zucker
11046,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44724,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38901,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42013,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49194,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16475,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13279,0,0,0,0,0,0,0,0,0,0,...,0,0,0,3,0,0,0,0,0,0
34295,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39480,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
def _first_column(labels):
    """Return the first column when `labels` is a DataFrame, else `labels` unchanged.

    Selecting a column label that occurs more than once yields a DataFrame
    rather than a Series; in that case only the first column is kept. A Series
    is passed through untouched, which keeps this cell safe to re-run.
    """
    if isinstance(labels, pd.DataFrame):
        return labels.iloc[:, 0]
    return labels


y_train = _first_column(y_train)
y_test = _first_column(y_test)
y_train

11046    False
44724     True
38901     True
42013     True
49194    False
         ...  
16475     True
13279     True
34295    False
39480    False
7839      True
Name: sentiment, Length: 30000, dtype: bool

In [7]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier; max_iter sets the solver's iteration cap
# to 5000.
LR = LogisticRegression(
    max_iter=5000,
)

In [8]:
# Train the classifier on the training features and labels; fit() returns the
# estimator itself, which is what the cell displays.
LR.fit(X=X_train, y=y_train)

LogisticRegression(max_iter=5000)

In [9]:
# Evaluate the fitted classifier on the held-out rows and print the result
# returned by score().
score = LR.score(X=X_test, y=y_test)
print(score)

0.8668
