# Experiment with full dataset

To run this notebook you will need the following Python libraries:

* numpy 
* scipy
* pandas
* scikit-learn (imported as `sklearn`)
* matplotlib (needed by `%pylab inline`)

In [1]:
%pylab inline
import time

import numpy as np
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Input CSV and the delimiter handed to pd.read_csv.
file_name = "IMDB Dataset.csv"
seperator = ","

# Column names: review text (model input) and sentiment label (target).
X_clm_name = "review"
y_clm_name = "sentiment"

In [3]:
# Settings shared by the TfidfVectorizer and CountVectorizer cells.
ngram_range = (1, 3)    # forwarded as ngram_range
max_features = 4000     # forwarded as max_features
stopwords = None        # forwarded as stop_words

# Seed reused by resample, KFold and LogisticRegression.
random_seed = 123

In [4]:
# Load the raw CSV, then keep only rows where both the text column and the
# label column are present, non-empty and not the literal string "none".
df = pd.read_csv(file_name, sep=seperator)
df = df[
    df[X_clm_name].ne("none")
    & df[X_clm_name].ne("")
    & df[X_clm_name].notna()
    & df[y_clm_name].ne("none")
    & df[y_clm_name].ne("")
    & df[y_clm_name].notna()
]
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
# Optional filter (currently disabled): keep only reviews longer than 200 characters.
#df = df[df[X_clm_name].str.len() > 200]

In [6]:
# Distinct values currently stored in the label column.
pd.unique(df[y_clm_name])

KeyError: 'Sentiment'

In [None]:
# Encode the label column as integers: 1 where the value is exactly
# "positive", 0 for every other value.
# A vectorised comparison replaces the row-wise DataFrame.apply(axis=1) call,
# which ran a Python lambda once per row and misbehaves on an empty frame.
df[y_clm_name] = df[y_clm_name].eq("positive").astype("int64")

In [None]:
# Distinct values in the label column at this point of the notebook.
pd.unique(df[y_clm_name])

In [None]:
# Rows whose encoded label equals 1.
positives = df[df[y_clm_name] == 1]
# Show the number of rows. len() is used instead of `count()[0]`, which did an
# integer positional lookup on a Series indexed by column names (deprecated in
# recent pandas) and counted non-NA values of the first column, not rows.
len(positives)

In [None]:
# Rows whose encoded label equals 0.
negatives = df[df[y_clm_name] == 0]
# Show the number of rows. len() is used instead of `count()[0]`, which did an
# integer positional lookup on a Series indexed by column names (deprecated in
# recent pandas) and counted non-NA values of the first column, not rows.
len(negatives)

In [None]:
# Size of the smaller class; both classes are later sampled down to this size.
# Row counts come from len() rather than `count()[0]`, which relied on a
# deprecated integer positional lookup on a label-indexed Series.
# The comparison is spelled out instead of calling min() because
# `%pylab inline` star-imports numpy, which may shadow builtins in this
# notebook's namespace.
n_samples = (
    len(negatives)
    if len(negatives) < len(positives)
    else len(positives)
)
n_samples

In [None]:
# Down-sample the positive class to n_samples rows.
# replace=False is passed explicitly: sklearn's resample defaults to
# replace=True (bootstrap), which would duplicate reviews, and duplicated rows
# can land in both the training and the test fold of the K-fold
# cross-validation run later in this notebook, inflating the accuracy.
# n_samples never exceeds the class size, so sampling without replacement is
# always possible.
positives = resample(
    positives,
    replace=False,
    n_samples=n_samples,
    random_state=random_seed,
)
# Row count after resampling (len() instead of the deprecated `count()[0]`).
len(positives)

In [None]:
# Down-sample the negative class to n_samples rows.
# replace=False is passed explicitly: sklearn's resample defaults to
# replace=True (bootstrap), which would duplicate reviews, and duplicated rows
# can land in both the training and the test fold of the K-fold
# cross-validation run later in this notebook, inflating the accuracy.
# n_samples never exceeds the class size, so sampling without replacement is
# always possible.
negatives = resample(
    negatives,
    replace=False,
    n_samples=n_samples,
    random_state=random_seed,
)
# Row count after resampling (len() instead of the deprecated `count()[0]`).
len(negatives)

In [None]:
# Stack the two resampled classes into one frame, negatives first.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; like append, it keeps the original row index.
df = pd.concat([negatives, positives])
# Total number of rows (len() instead of the deprecated `count()[0]`).
len(df)

In [None]:
# Model input (review text) and target (label) taken from the current `df`.
X_raw = df[X_clm_name]
y = df[y_clm_name]

# 10-fold splitter, shuffled with the fixed seed.
kf = KFold(
    n_splits=10,
    shuffle=True,
    random_state=random_seed,
)
def do_cross_val(model, X):
    """Cross-validate ``model`` on ``X`` and print the mean accuracy.

    Scores are computed by ``cross_val_score`` with ``scoring='accuracy'``,
    using the notebook-level target ``y`` and splitter ``kf``.

    Parameters
    ----------
    model : estimator passed straight to ``cross_val_score``.
    X : feature matrix passed straight to ``cross_val_score``.

    Returns
    -------
    None. The result is printed only.
    """
    fold_accuracies = cross_val_score(model, X, y, scoring='accuracy', cv=kf)
    print('Mean accuracy:')
    print(fold_accuracies.mean())

In [None]:
# TF-IDF vectoriser configured from the settings cell.
tfidf = TfidfVectorizer(
    max_features=max_features,
    ngram_range=ngram_range,
    stop_words=stopwords,
    lowercase=True,
)

# Time only the fit/transform step over the raw reviews.
start_time = time.time()
X_tfidf = tfidf.fit_transform(X_raw)
run_time = time.time() - start_time

print(f'time taken: {run_time} seconds')

In [None]:
# Raw term-count vectoriser with the same settings as the TF-IDF one.
cv = CountVectorizer(
    max_features=max_features,
    ngram_range=ngram_range,
    stop_words=stopwords,
    lowercase=True,
)

# Time only the fit/transform step over the raw reviews.
start_time = time.time()
X_cv = cv.fit_transform(X_raw)
run_time = time.time() - start_time

print(f'time taken: {run_time} seconds')

In [None]:
def do_cross_val_with_each_X(model):
    """Run ``do_cross_val`` for ``model`` on both feature matrices.

    The notebook-level ``X_cv`` is evaluated first, then ``X_tfidf``. For each
    one a label is printed ("cv" / "tfidf"), followed by whatever
    ``do_cross_val`` prints and the wall-clock time of that run. Nothing is
    returned.
    """
    def _timed_run(label, features, trailer=''):
        # One labelled, timed cross-validation pass.
        began = time.time()
        print(label)
        do_cross_val(model, features)
        elapsed = time.time() - began
        print(f'time taken: {elapsed} seconds{trailer}')

    _timed_run("cv", X_cv)
    # The trailing newline leaves a blank line after the second report.
    _timed_run("tfidf", X_tfidf, trailer='\n')

In [None]:
# Logistic regression behind a scaler. with_mean=False makes StandardScaler
# skip centring, so only scaling is applied to the vectoriser outputs.
lr = make_pipeline(
    StandardScaler(with_mean=False),
    LogisticRegression(
        C=1,
        solver='liblinear',
        random_state=random_seed,
    ),
)
do_cross_val_with_each_X(model=lr)

In [None]:
# Multinomial naive Bayes with default settings, evaluated on both feature sets.
mnb = MultinomialNB()
do_cross_val_with_each_X(model=mnb)