In [297]:
import numpy as np
import sklearn
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, recall_score, accuracy_score, f1_score, precision_score
import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

In [298]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

## Following code can be used to test any algorithm


## Algorithms:

1. Logistic Regression
2. Naive Bayes
3. Random Forest
4. Adaboost
5. XGboost
6. FC
7. SVM

In [332]:
class model:
    def __init__(self, train, test):
        ## Train and test should be a dictionary, where train["X"] should contain the features in numpy matrix format 
        ## mXn, where m is samples and n is features. the train["y"] should contain the labels in (m,) shape
        self.train = train
        self.test = test
        
    def performance_evaluation(self, GT, pred, modelname="Empty"):
        acc = accuracy_score(GT, pred)
        f1 = f1_score(GT, pred)
        precision = precision_score(GT, pred)
        recall = recall_score(GT, pred)
        print("----- modelname is {}------".format(modelname))
        print("accuracy is {:.4f} and f1 score is {:.4f}".format(acc, f1))
        print("precision is {:.4f} and recall is {:.4f}".format(precision, recall))
        
    def feature_importance(self, model):
        ## feat importance
        imp_acc = []
        for i in range(self.train["X"].shape[1]):
            feat = self.train["X"]
            feat = feat.transpose()
            np.random.shuffle(feat[i])
            feat = feat.transpose()
            train_pred = model.predict(feat)
            acc = accuracy_score(self.train["y"], train_pred)
            imp_acc.append(acc)
            if i%10==0:
                print("we are on ", str(i))
        df = pd.DataFrame()
        df["featname"] = ['f' + str(i) for i in range(self.train["X"].shape[1])]
        df["acc"] = imp_acc
        df = df.sort_values(by=['acc'], ascending=True)
        return df
        
    def naive_bayes(self, compute_feat_importance=False):
        df = pd.DataFrame()
        gnb = GaussianNB()
        y_pred = gnb.fit(self.train["X"], self.train["y"]).predict(self.test["X"])
        self.performance_evaluation(self.test["y"], y_pred, "Gaussian NB")
        if compute_feat_importance:
            df = self.feature_importance(Optimized_model)
        return y_pred, df
    
    def SVM(self, cval_range=[-2,2,4], gammaval_range=[-2, 2, 4], tune=False, nfolds=1, 
            compute_feat_importance=False):
        df = pd.DataFrame()
        if tune==True:
            C = np.logspace(cval_range[0], cval_range[1], cval_range[2])
            gamma = np.logspace(gammaval_range[0], gammaval_range[1], gammaval_range[2])
            Param_tunable = {'C': C, 'gamma': gamma}
            Optimized_model = GridSearchCV(svm.SVC(kernel='rbf'), 
                                           Param_tunable, cv=nfolds, verbose = True, 
                                           n_jobs = -1)
        else:
            Optimized_model = svm.SVC(kernel='rbf')
        y_pred = Optimized_model.fit(self.train["X"], self.train["y"]).predict(self.test["X"])
        self.performance_evaluation(self.test["y"], y_pred, "SVM")
        if compute_feat_importance:
            df = self.feature_importance(Optimized_model)
        return y_pred, df
    
    def random_forest(self, Estimators=[80, 100, 120], tune=False, nfolds=1, compute_feat_importance=False):
        df = pd.DataFrame()
        if tune==True:
            Param_tunable = {'n_estimators': Estimators}
            Optimized_model = GridSearchCV(RandomForestClassifier(), Param_tunable, 
                                           cv=nfolds, verbose = 1, n_jobs = -1)
            
        else:
            Optimized_model = RandomForestClassifier()
        y_pred = Optimized_model.fit(self.train["X"], self.train["y"]).predict(self.test["X"])
        self.performance_evaluation(self.test["y"], y_pred, "Random Forest")
        if compute_feat_importance:
            df = self.feature_importance(Optimized_model)
        return y_pred, df
    
    def xgboost(self):
        df = pd.DataFrame()
        model = XGBClassifier()
        y_pred = model.fit(self.train["X"], self.train["y"]).predict(self.test["X"])
        self.performance_evaluation(self.test["y"], y_pred, "XGBoost")
        if compute_feat_importance:
            df = self.feature_importance(model)
        return y_pred, df
    
    def adaboost(self, Estimators=[80, 100, 120], tune=False, nfolds=1, compute_feat_importance=False):
        df = pd.DataFrame()
        if tune==True:
            Param_tunable = {'n_estimators': Estimators}
            Optimized_model = GridSearchCV(AdaBoostClassifier(random_state=42), Param_tunable, 
                                           cv=nfolds, verbose = 1, n_jobs = -1)
            
        else:
            Optimized_model = AdaBoostClassifier(random_state=42)
        y_pred = Optimized_model.fit(self.train["X"], self.train["y"]).predict(self.test["X"])
        self.performance_evaluation(self.test["y"], y_pred, "Ada Boost")
        if compute_feat_importance:
            df = self.feature_importance(Optimized_model)
        return y_pred, df
    
    def logistic_regression(self, max_iter=100, compute_feat_importance=False):
        df = pd.DataFrame()
        model = LogisticRegression(random_state=0, max_iter=max_iter)
        y_pred = model.fit(self.train["X"], self.train["y"]).predict(self.test["X"])
        self.performance_evaluation(self.test["y"], y_pred, "loisitic regression")
        if compute_feat_importance:
            df = self.feature_importance(model)
        return y_pred, df
        
    
    

In [315]:
path = "/home/sadat/Documents/ml2_server/Han_Experiments/nlp-stuff-master/text_classification_HAN/"
train = dict()
test = dict()
train["X"] = rev_train_X = np.load(path + "results2/Review/Emb_han_Review_train.npy")
train["y"] = rev_train_y = pd.read_pickle(path + "data2/Review/train.pkl").is_deception.values

test["X"] = rev_test_X = np.load(path + "results2/Review/Emb_han_Review_test2.npy")
test["y"] = rev_test_y = pd.read_pickle(path + "data2/Review/test2.pkl").is_deception.values


In [316]:
ml = model(train, test)

In [317]:
y = ml.logistic_regression(max_iter=1000)

----- modelname is loisitic regression------
accuracy is 0.6326 and f1 score is 0.6580
precision is 0.6154 and recall is 0.7068


In [296]:
y = ml.adaboost(Estimators=[80, 100, 120], tune=True, nfolds=5, compute_feat_importance=False)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
----- modelname is Ada Boost------
accuracy is 0.6314 and f1 score is 0.6576
precision is 0.6140 and recall is 0.7080


In [259]:
y = ml.SVM()

----- modelname is SVM------
accuracy is 0.6387 and f1 score is 0.6689
precision is 0.6174 and recall is 0.7298


In [271]:
y, i = ml.naive_bayes()

----- modelname is Gaussian NB------
accuracy is 0.6373 and f1 score is 0.6732
precision is 0.6126 and recall is 0.7472


In [333]:
path = "/home/sadat/Documents/ml2_server/Han_Experiments/nlp-stuff-master/text_classification_HAN/"
train = dict()
test = dict()
train["X"] = rev_train_X = np.concatenate([np.load(path + "results2/Review/Emb_han_Review_train.npy"),
                                           np.load(path + "results2/Tweet/Emb_han_Review_train.npy"),
                                          np.load(path + "results2/News/Emb_han_Review_train.npy")], axis=1)
                                           
train["y"] = rev_train_y = pd.read_pickle(path + "data2/Review/train.pkl").is_deception.values

test["X"] = rev_test_X = np.concatenate([np.load(path + "results2/Review/Emb_han_Review_test2.npy"),
                                           np.load(path + "results2/Tweet/Emb_han_Review_test2.npy"),
                                        np.load(path + "results2/News/Emb_han_Review_test2.npy")], axis=1)
test["y"] = rev_test_y = pd.read_pickle(path + "data2/Review/test2.pkl").is_deception.values

In [334]:
ml = model(train, test)
y, df = ml.logistic_regression(max_iter=1000, compute_feat_importance=True)

----- modelname is loisitic regression------
accuracy is 0.6354 and f1 score is 0.6600
precision is 0.6182 and recall is 0.7080
we are on  0
we are on  10
we are on  20
we are on  30
we are on  40
we are on  50
we are on  60
we are on  70
we are on  80
we are on  90
we are on  100
we are on  110
we are on  120
we are on  130
we are on  140
we are on  150
we are on  160
we are on  170
we are on  180
we are on  190


In [335]:
df

Unnamed: 0,featname,acc
162,f162,0.500561
163,f163,0.500841
164,f164,0.500911
165,f165,0.501121
153,f153,0.501542
...,...,...
4,f4,0.735109
2,f2,0.735319
1,f1,0.735389
3,f3,0.735950


In [277]:
ml = model(train, test)
y, i = ml.naive_bayes()

----- modelname is Gaussian NB------
accuracy is 0.6348 and f1 score is 0.6692
precision is 0.6116 and recall is 0.7388


In [263]:
y = ml.xgboost()

----- modelname is XGBoost------
accuracy is 0.6312 and f1 score is 0.6528
precision is 0.6167 and recall is 0.6934


In [278]:
y = ml.xgboost()

----- modelname is XGBoost------
accuracy is 0.6194 and f1 score is 0.6402
precision is 0.6070 and recall is 0.6771


In [None]:
y = ml.SVM(cval_range=[-2,2,2], gammaval_range=[-2, 2, 2], tune=True, nfolds=2)

In [211]:
y , i= ml.random_forest()

----- modelname is Random Forest------
accuracy is 0.6300 and f1 score is 0.6510
precision is 0.6161 and recall is 0.6900


In [120]:
feature_names = ['f' + str(i) for i in range(64)]

In [228]:
pip install --upgrade xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-1.6.1-py3-none-manylinux2014_x86_64.whl (192.9 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.9/192.9 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-1.6.1
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.
