## Setup

In [None]:
import pandas as pd
import numpy as np
import re
import warnings
# Silence every warning category for the whole notebook to keep cell output
# compact. NOTE: this also hides deprecation and convergence warnings.
warnings.filterwarnings(action="ignore")

In [None]:
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron, LinearRegression, PassiveAggressiveClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

## Load Dataset
Train / test split: 80% / 20%

In [None]:
# Load the IMDB reviews and encode the sentiment label as an integer
# (positive -> 1, negative -> 0).
# A single map() is used instead of chained replace() calls so the column
# gets a numeric dtype directly, rather than relying on pandas' implicit
# downcasting of object columns (deprecated in recent pandas releases).
# Any label other than "positive"/"negative" would become NaN.
df = pd.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
df['sentiment'] = df['sentiment'].map({"positive": 1, "negative": 0})

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
df_train, df_test = train_test_split(df, random_state=42, test_size=0.2)

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only added a stray "None" line to the cell output.
df_train.info()
# Last expression of the cell: renders the training frame.
df_train

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only added a stray "None" line to the cell output.
df_test.info()
# Last expression of the cell: renders the test frame.
df_test

## Metrics

In [None]:
def get_metrics(y_test, y_pred_proba, threshold=0.5):
    """Print accuracy, F1, ROC AUC and the confusion matrix for binary scores.

    Args:
        y_test: True binary labels.
        y_pred_proba: Predicted scores for the positive class. Any array-like
            is accepted; it is converted with ``np.asarray``.
        threshold: Scores greater than or equal to this value count as a
            positive prediction. Defaults to 0.5.

    Returns:
        None. The scalar metrics are printed rounded to 4 decimals; the
        confusion matrix is printed as returned by scikit-learn.
    """
    # Convert once so plain Python lists support the element-wise comparison.
    y_pred_proba = np.asarray(y_pred_proba)
    # Threshold a single time and reuse the hard labels for every
    # label-based metric; ROC AUC still uses the raw scores.
    y_pred = y_pred_proba >= threshold

    print('ACCURACY_SCORE: ', round(accuracy_score(y_test, y_pred), 4))
    print('F1_SCORE: ', round(f1_score(y_test, y_pred), 4))
    print('ROC_AUC_SCORE: ', round(roc_auc_score(y_test, y_pred_proba), 4))
    print('CONFUSION_MATRIX:\n', confusion_matrix(y_test, y_pred), '\n')

## Create model

In [None]:
# TF-IDF over unigrams and bigrams, limited to at most 100,000 features.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=100_000)

In [None]:
# Targets: the integer-encoded sentiment column of each split.
y_train, y_test = df_train['sentiment'], df_test['sentiment']

# Fit the vectorizer on the training reviews only, then reuse the fitted
# vocabulary to transform the test reviews.
X_train = tfidf.fit_transform(df_train['review'])
X_test = tfidf.transform(df_test['review'])

# Report the shape of both feature matrices (train first, then test).
for feature_matrix in (X_train, X_test):
    print(feature_matrix.shape)

In [None]:
# Baseline: default logistic regression on the TF-IDF features, evaluated
# on the held-out split. fit() returns the estimator itself.
model = LogisticRegression().fit(X_train, y_train)
class_probabilities = model.predict_proba(X_test)
# Keep only column 1 of the probability matrix as the score.
y_pred_proba = class_probabilities[:, 1]
get_metrics(y_test, y_pred_proba)

## Score Table

In [None]:
# Candidate classifiers to compare, each constructed with its defaults.
model = [
    LogisticRegression(),
    MultinomialNB(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LGBMClassifier(),
    XGBClassifier(),
]

In [None]:
# Fit every candidate classifier on the training features and collect its
# hold-out accuracy and ROC AUC into one comparison table.
# Rows are gathered in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and the all-zero placeholder
# row previously used to seed the columns showed up as a bogus first entry.
rows = []
for clf in model:
    name = clf.__class__.__name__
    print(name, ".....")
    clf.fit(X_train, y_train)
    # Column 1 of predict_proba is used as the score for the positive class.
    y_pred_proba = clf.predict_proba(X_test)[:, 1]
    rows.append({
        'MODEL': name,
        'ACCURACY_SCORE': round(accuracy_score(y_test, y_pred_proba >= 0.5), 4),
        'ROC_AUC_SCORE': round(roc_auc_score(y_test, y_pred_proba), 4),
    })

# Explicit columns keep the table layout stable even if `model` is empty.
score = pd.DataFrame(rows, columns=['MODEL', 'ACCURACY_SCORE', 'ROC_AUC_SCORE'])

In [None]:
# Display the comparison table (at most its first 10 rows).
score.head(n=10)