In [2]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv('IMDB Dataset.csv')

In [4]:
# for practice this much should be enough 
df = df.iloc[:10000]

In [5]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
9995,"Fun, entertaining movie about WWII German spy ...",positive
9996,Give me a break. How can anyone say that this ...,negative
9997,This movie is a bad movie. But after watching ...,negative
9998,This is a movie that was probably made to ente...,negative


In [6]:
df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [7]:
def remove_punctuation(text):
    import string
    return text.translate(str.maketrans('', '', string.punctuation))

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    return ' '.join([word for word in text.split() if word not in stop_words])

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

def stem_text(text):
    return ' '.join([stemmer.stem(word) for word in text.split()])

df['review'] = df['review'].str.lower() # you do not need in cv or tfidf bcs scikit-learn will do it for you but for practice we will do it here
df['review'] = df['review'].apply(remove_punctuation)
df['review'] = df['review'].apply(remove_stopwords)# you do not need in cv or tfidf bcs scikit-learn will do it for you but for practice we will do it here
df['review'] = df['review'].apply(stem_text)

[nltk_data] Downloading package stopwords to /home/prem/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
df['sentiment'].value_counts()

sentiment
positive    5028
negative    4972
Name: count, dtype: int64

In [9]:
df['sentiment'] = df['sentiment'].map({'positive':1, 'negative':0})

In [10]:
df.head()

Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod youll hoo...,1
1,wonder littl product br br film techniqu unass...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic there famili littl boy jake think there ...,0
4,petter mattei love time money visual stun film...,1


In [11]:
df.duplicated().sum()

np.int64(17)

In [12]:
df.drop_duplicates(inplace=True)

In [13]:
df

Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod youll hoo...,1
1,wonder littl product br br film techniqu unass...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic there famili littl boy jake think there ...,0
4,petter mattei love time money visual stun film...,1
...,...,...
9995,fun entertain movi wwii german spi juli andrew...,1
9996,give break anyon say good hockey movi know mov...,0
9997,movi bad movi watch endless seri bad horror mo...,0
9998,movi probabl made entertain middl school earli...,0


In [14]:
X = df['review']
y = df['sentiment']

In [15]:
X

0       one review mention watch 1 oz episod youll hoo...
1       wonder littl product br br film techniqu unass...
2       thought wonder way spend time hot summer weeke...
3       basic there famili littl boy jake think there ...
4       petter mattei love time money visual stun film...
                              ...                        
9995    fun entertain movi wwii german spi juli andrew...
9996    give break anyon say good hockey movi know mov...
9997    movi bad movi watch endless seri bad horror mo...
9998    movi probabl made entertain middl school earli...
9999    smash film filmmak show intens strang relation...
Name: review, Length: 9983, dtype: str

In [16]:
y

0       1
1       1
2       1
3       0
4       1
       ..
9995    1
9996    0
9997    0
9998    0
9999    1
Name: sentiment, Length: 9983, dtype: int64

In [17]:
# we have already done the label encoding but again for practice, i am gonna do it again

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [18]:
le.fit_transform(y)

array([1, 1, 1, ..., 0, 0, 1], shape=(9983,))

In [19]:
y

0       1
1       1
2       1
3       0
4       1
       ..
9995    1
9996    0
9997    0
9998    0
9999    1
Name: sentiment, Length: 9983, dtype: int64

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
X_train.shape

(7986,)

In [22]:
# BoW (Bag of Words)

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [23]:
X_train_bow = cv.fit_transform(X_train).toarray()
X_test_bow = cv.transform(X_test).toarray()

In [24]:
X_train_bow.shape

(7986, 48748)

In [25]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

In [26]:
gnb.fit(X_train_bow, y_train)

0,1,2
,"priors  priors: array-like of shape (n_classes,), default=None Prior probabilities of the classes. If specified, the priors are not adjusted according to the data.",
,"var_smoothing  var_smoothing: float, default=1e-9 Portion of the largest variance of all features that is added to variances for calculation stability. .. versionadded:: 0.20",1e-09


In [27]:
y_pred = gnb.predict(X_test_bow)

from sklearn.metrics import accuracy_score, confusion_matrix

In [28]:
accuracy_score(y_test, y_pred)

0.6464697045568353

In [29]:
confusion_matrix(y_test, y_pred)

array([[733, 252],
       [454, 558]])

In [30]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()

In [31]:
rfc.fit(X_train_bow, y_train)

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",100
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [34]:
y_pred = rfc.predict(X_test_bow)
accuracy_score(y_test, y_pred)

0.8497746619929895

In [None]:
X_train_bow.shape # so here the feature/vocabulary size is 7986

(7986, 48748)

In [40]:
cv = CountVectorizer(max_features=2500)
X_train_bow_new = cv.fit_transform(X_train)
X_test_bow_new = cv.transform(X_test)

In [42]:
rfcc = RandomForestClassifier()

rfcc.fit(X_train_bow_new, y_train)
y_pred = rfcc.predict(X_test_bow_new)

In [43]:
accuracy_score(y_test, y_pred)

0.8447671507260891

In [45]:
# n grams

cv = CountVectorizer(ngram_range=(1, 2), max_features=3000)

X_train_bow_ngram = cv.fit_transform(X_train)
X_test_bow_ngram  =  cv.transform(X_test)

rf = RandomForestClassifier()

rf.fit(X_train_bow_ngram, y_train)
y_pred = rf.predict(X_test_bow_ngram)
accuracy_score(y_test, y_pred)

0.8442663995993991

In [46]:
# Tf-Idf 

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

In [48]:
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [49]:
rf = RandomForestClassifier()

rf.fit(X_train_tfidf, y_train)
y_pred = rf.predict(X_test_tfidf)
accuracy_score(y_test, y_pred)

0.8517776664997496