## 1. Imports

In [9]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import re
import string

# gradient boosting for classification in scikit-learn
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score


import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer  
stop_words = stopwords.words('english')

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report

import matplotlib.pyplot as plt
import seaborn as sns


import concurrent.futures
import time
import threading


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier
import xgboost
from sklearn import svm
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing


In [10]:
# load excel
data = pd.read_csv("/Users/hansangjun/Desktop/Projects/Capstone_Project_2/Data/IMDB_Dataset.csv")

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [12]:
data['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

Categorical variable has been equally distributed.

In [13]:
# replace 1 and 0
data.sentiment = data.sentiment.replace({'positive': 1, 'negative': 0})

In [14]:
data['sentiment'].value_counts()

1    25000
0    25000
Name: sentiment, dtype: int64

## 2. Subset Dataset

I am going to subset the dataset to efficiently run the machine learning model.

In [15]:
df = data.sample(n=10000, random_state=123)
df['sentiment'].value_counts()

1    5021
0    4979
Name: sentiment, dtype: int64

## 3. Before Cleaning Dataset

### 3-1. CountVectorizor

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['review'].values, df['sentiment'].values, test_size=0.25, random_state=1000)

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(X_train)

X_train = vectorizer.transform(X_train)
X_test  = vectorizer.transform(X_test)
X_train

<7500x46228 sparse matrix of type '<class 'numpy.int64'>'
	with 1016334 stored elements in Compressed Sparse Row format>

In [18]:
X_train.shape

(7500, 46228)

In [20]:
classifiers = [
    KNeighborsClassifier(3),
    RandomForestClassifier(),
    XGBClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    LogisticRegression(),
    naive_bayes.MultinomialNB()
    ]

In [21]:
for clf in classifiers:
    start_time = time.time()
    clf.fit(X_train, y_train)
    train_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, train_predictions)
    print("model name: {}".format(clf))
    print("Accuracy: {:.4%}".format(acc))

    train_predictions = clf.predict_proba(X_test)
    ll = log_loss(y_test, train_predictions)
    print("Log Loss: {}".format(ll))
    print("--- %s seconds ---" % (time.time() - start_time))
    print("="*40)

model name: KNeighborsClassifier(n_neighbors=3)
Accuracy: 60.7600%
Log Loss: 4.177093981246377
--- 3.2576897144317627 seconds ---
model name: RandomForestClassifier()
Accuracy: 83.8400%
Log Loss: 0.5038459155699745
--- 7.72878623008728 seconds ---
model name: XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0, ...)
Accuracy: 84.2400%
Log Los

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### 3-2 TFIDF

In [22]:
X_train, X_test, y_train, y_test = train_test_split(df['review'].values, df['sentiment'].values, test_size=0.25, random_state=1000)

In [23]:
tfidf_vectorizer = TfidfVectorizer() 
tfidf_train_vectors = tfidf_vectorizer.fit_transform(X_train)
tfidf_test_vectors = tfidf_vectorizer.transform(X_test)

In [24]:
for clf in classifiers:
    start_time = time.time()
    clf.fit(tfidf_train_vectors, y_train)
    train_predictions = clf.predict(tfidf_test_vectors)
    acc = accuracy_score(y_test, train_predictions)
    print("model name: {}".format(clf))
    print("Accuracy: {:.4%}".format(acc))

    train_predictions = clf.predict_proba(tfidf_test_vectors)
    ll = log_loss(y_test, train_predictions)
    print("Log Loss: {}".format(ll))
    print("--- %s seconds ---" % (time.time() - start_time))
    print("="*40)

model name: KNeighborsClassifier(n_neighbors=3)
Accuracy: 68.0000%
Log Loss: 3.1417675313828215
--- 3.2920122146606445 seconds ---
model name: RandomForestClassifier()
Accuracy: 83.0400%
Log Loss: 0.5166687142492233
--- 7.693850994110107 seconds ---
model name: XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0, ...)
Accuracy: 83.0000%
Log L

## 4. After Cleaning Dataset

In [25]:
sw = stopwords.words('english')
lemmatizer = WordNetLemmatizer() 

def clean_text(text):
    
    text = text.lower()
    
    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text) # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")

    text = re.sub(r"http\S+", "",text) # Removing URLs 
    #text = re.sub(r"http", "",text)
    
    html=re.compile(r'<.*?>') 
    
    text = html.sub(r'',text) # Removing html tags
    
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    for p in punctuations:
        text = text.replace(p,'') #Removing punctuations
        
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    
    text = [lemmatizer.lemmatize(word) for word in text]
    
    text = " ".join(text) #removing stopwords
        
    return text

In [26]:
import time
start_time = time.time()
df['review'] = df['review'].apply(lambda x: clean_text(x))
print("--- %s seconds ---" % (time.time() - start_time))
df.head()

--- 11.170544147491455 seconds ---


Unnamed: 0,review,sentiment
11872,"movie beyond awful, pimple movie industry know...",0
40828,writing john carpenter halloween nearing th an...,1
36400,must admit slight disappointment film read lot...,1
5166,oh dear bbc knocked pedestal absorbing period ...,0
30273,totally average film semi alright action seque...,0


### 4-1. CountVectorizer

In [27]:
X_train, X_test, y_train, y_test = train_test_split(df['review'].values, df['sentiment'].values, test_size=0.25, random_state=1000)

vectorizer = CountVectorizer()
vectorizer.fit(X_train)

X_train = vectorizer.transform(X_train)
X_test  = vectorizer.transform(X_test)

In [28]:
for clf in classifiers:
    start_time = time.time()
    clf.fit(X_train, y_train)
    train_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, train_predictions)
    print("model name: {}".format(clf))
    print("Accuracy: {:.4%}".format(acc))

    train_predictions = clf.predict_proba(X_test)
    ll = log_loss(y_test, train_predictions)
    print("Log Loss: {}".format(ll))
    print("--- %s seconds ---" % (time.time() - start_time))
    print("="*40)

model name: KNeighborsClassifier(n_neighbors=3)
Accuracy: 57.8800%
Log Loss: 5.465913433661351
--- 2.382977247238159 seconds ---
model name: RandomForestClassifier()
Accuracy: 84.8400%
Log Loss: 0.4819815269797356
--- 7.414118051528931 seconds ---
model name: XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0, ...)
Accuracy: 82.2400%
Log Los

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### 4-2 TFIDF

In [29]:
X_train, X_test, y_train, y_test = train_test_split(df['review'].values, df['sentiment'].values, test_size=0.25, random_state=1000)

tfidf_vectorizer = TfidfVectorizer() 
tfidf_train_vectors = tfidf_vectorizer.fit_transform(X_train)
tfidf_test_vectors = tfidf_vectorizer.transform(X_test)

In [30]:
for clf in classifiers:
    start_time = time.time()
    clf.fit(tfidf_train_vectors, y_train)
    train_predictions = clf.predict(tfidf_test_vectors)
    acc = accuracy_score(y_test, train_predictions)
    print("model name: {}".format(clf))
    print("Accuracy: {:.4%}".format(acc))

    train_predictions = clf.predict_proba(tfidf_test_vectors)
    ll = log_loss(y_test, train_predictions)
    print("Log Loss: {}".format(ll))
    print("--- %s seconds ---" % (time.time() - start_time))
    print("="*40)

model name: KNeighborsClassifier(n_neighbors=3)
Accuracy: 70.9200%
Log Loss: 3.1383864163576263
--- 2.303748846054077 seconds ---
model name: RandomForestClassifier()
Accuracy: 84.5600%
Log Loss: 0.4889608416785859
--- 6.98526406288147 seconds ---
model name: XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0, ...)
Accuracy: 82.6400%
Log Los