# Metrics

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
import nltk
import string
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import roc_auc_score, roc_curve
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing import sequence 
from sklearn.ensemble import RandomForestClassifier
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.sequence import pad_sequences
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

Using TensorFlow backend.


In [2]:
train_df = pd.read_csv('/Users/pan/Desktop/train.csv')
test_df = pd.read_csv('/Users/pan/Desktop/test.csv')

In [3]:
train_X = train_df['Reviews'].tolist()
test_X = test_df['Reviews'].tolist()
train_labels = train_df['Label']
test_labels = test_df['Label']

In [4]:
def strip_html_tag(sample):
    cleanr = re.compile(r"\<[^\>]*\>|\\")
    return re.sub(cleanr, '', sample)

In [5]:
train_X = list(map(strip_html_tag, train_X))
test_X = list(map(strip_html_tag, test_X))

In [6]:
def tokenize_text(sample):
    tokenizer = TweetTokenizer()
    return tokenizer.tokenize(sample)

In [7]:
train_X = list(map(tokenize_text,train_X))
test_X = list(map(tokenize_text,test_X))

In [8]:
def punctuation_remover(sample):
    sample_wostopword_wopunc = []
    for i in sample:
        if i not in string.punctuation:
            sample_wostopword_wopunc.append(i)
    return sample_wostopword_wopunc

In [9]:
train_X = list(map(punctuation_remover,train_X))
test_X = list(map(punctuation_remover,test_X))

In [14]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words= 10000)
tokenizer.fit_on_texts(train_X)

In [15]:
x_train = tokenizer.texts_to_matrix(train_X, mode='tfidf')
x_test = tokenizer.texts_to_matrix(test_X, mode='tfidf')

## ExtraTree

In [16]:
from sklearn.ensemble import ExtraTreesClassifier

In [18]:
et_tf = ExtraTreesClassifier(n_estimators=100, random_state=0)
et_tf.fit(x_train, train_labels)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=0, verbose=0,
                     warm_start=False)

In [19]:
y_pred_et_tf = et_tf.predict(x_test)

In [20]:
def conf_matrix(y_test,pred):
    conf_mat = confusion_matrix(y_test, pred)
    print(conf_mat)
    TN = conf_mat[0][0]
    FP = conf_mat[0][1]
    FN = conf_mat[1][0]
    TP = conf_mat[1][1]
    Accuracy = (TP+TN)/(TP+FP+TN+FN)
    Precision = TP/(TP+FP)
    Recall = TP/(TP+FN)
    f1_score = (2*Precision*Recall)/(Precision + Recall)
    result = pd.DataFrame({'Accuracy':[Accuracy], 'Precision':[Precision], 'Recall':[Recall], 'f1_score':[f1_score]})
    print(result)

In [21]:
conf_matrix(test_labels, y_pred_et_tf)

[[10800  1700]
 [ 1839 10661]]
   Accuracy  Precision   Recall  f1_score
0   0.85844   0.862471  0.85288  0.857649


## Adaboost with DT

In [26]:
ad_dt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=100)
ad_dt.fit(x_train, train_labels)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(class_weight=None,
                                                         criterion='gini',
                                                         max_depth=2,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort=False,
                                                         random_state=None,
                             

In [27]:
y_pred_ad_dt = ad_dt.predict(x_test)

In [28]:
conf_matrix(test_labels, y_pred_ad_dt)

[[10444  2056]
 [ 1691 10809]]
   Accuracy  Precision   Recall  f1_score
0   0.85012   0.840187  0.86472  0.852277


## Gradient boosting 

In [29]:
from sklearn.ensemble import GradientBoostingClassifier

In [30]:
gb = GradientBoostingClassifier(max_depth=3, n_estimators=50)
gb.fit(x_train, train_labels)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=50,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [31]:
y_pred_gb = gb.predict(x_test)

In [32]:
conf_matrix(test_labels, y_pred_gb)

[[ 8804  3696]
 [ 1784 10716]]
   Accuracy  Precision   Recall  f1_score
0    0.7808   0.743547  0.85728  0.796373


## XGboost

In [33]:
import xgboost as xgb

In [34]:
xgb = xgb.XGBClassifier(n_estimators=500, max_depth=6, learning_rate=0.1,
                                  subsample=.7, colsample_bytree=0.6, gamma=0.05)
xgb.fit(x_train, train_labels)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=0.05,
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=None, n_estimators=500, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.7, verbosity=1)

In [35]:
y_pred_xgb = xgb.predict(x_test)

In [36]:
conf_matrix(test_labels, y_pred_xgb)

[[10805  1695]
 [ 1380 11120]]
   Accuracy  Precision  Recall  f1_score
0     0.877   0.867733  0.8896  0.878531


## Voting Classifier

In [38]:
from sklearn.ensemble import VotingClassifier

In [39]:
vc = VotingClassifier(estimators=[
    ("mnb_clf", MNB_tfidf_Classifier()),
    ("log_clf", LogisticRegression_c()),
    ("etc_clf", ETC_c()),
    ("xgb_clf", XGB_t()),
], voting='soft')

NameError: name 'MNB_tfidf_Classifier' is not defined