In [None]:
import pandas as pd
import numpy as np
from random import randrange
from collections import Counter
from scipy.stats.stats import pearsonr

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

import sklearn
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC


In [None]:
# Input: pre-computed feature file; it must contain the label column named "state".
# Missing values are replaced with 0 immediately after loading.
df = pd.read_csv('./data/testing_input.csv').fillna(0)

In [None]:
# Split the loaded frame into the label list and the feature matrix.
# Labels are the values of the "state" column; features are every column from
# 'QueryOrder' through the last column of the frame (label-based slice).
# NOTE(review): if "state" is positioned at or after 'QueryOrder' it would also
# end up among the features -- confirm the column order of the input file.
labels = df['state'].to_list()
new_df = df.loc[:, 'QueryOrder':]

In [None]:
# 3 classes classifier

def _build_classifier(clf_name, rand_num):
    """Instantiate the estimator selected by ``clf_name``.

    ``rand_num`` is passed as ``random_state`` to the estimators that are
    seeded below (decision tree, random forest, logistic regression).

    Raises:
        ValueError: if ``clf_name`` is not one of the supported short names.
    """
    if clf_name == 'nb':
        return GaussianNB()
    if clf_name == 'dt':
        return tree.DecisionTreeClassifier(random_state=rand_num)
    if clf_name == 'knn':
        return KNeighborsClassifier(n_neighbors=5)
    if clf_name == 'svm':
        return svm.SVC()
    if clf_name == 'rf':
        return RandomForestClassifier(random_state=rand_num)
    if clf_name == 'lr':
        return LogisticRegression(random_state=rand_num)
    raise ValueError(
        "Unknown clf_name %r; expected one of 'nb', 'dt', 'knn', 'svm', 'rf', 'lr'"
        % (clf_name,))


def _three_class_scores(y_true, y_pred, class_labels):
    """Return the metric values of one evaluation in performance-column order.

    The list holds precision, recall and F1 for each entry of ``class_labels``
    (in that order), followed by macro precision, macro recall, macro F1 and
    accuracy.
    """
    # Passing ``labels`` pins the number and order of the per-class scores, so
    # a fold that happens to lack one class still yields one value per class
    # (scikit-learn reports 0 with a warning for an undefined score).
    per_class_p = precision_score(y_true, y_pred, average=None, labels=class_labels)
    per_class_r = recall_score(y_true, y_pred, average=None, labels=class_labels)
    per_class_f1 = f1_score(y_true, y_pred, average=None, labels=class_labels)

    row = []
    for p, r, f1 in zip(per_class_p, per_class_r, per_class_f1):
        row.extend([p, r, f1])
    row.extend([
        precision_score(y_true, y_pred, average='macro', labels=class_labels),
        recall_score(y_true, y_pred, average='macro', labels=class_labels),
        f1_score(y_true, y_pred, average='macro', labels=class_labels),
        accuracy_score(y_true, y_pred),
    ])
    return row


def classify3(features, labels, clf_name, rand = 0, n_splits = 10):
    """Cross-validate one classifier on a three-class problem.

    The data is split with an unshuffled ``KFold``; the chosen classifier is
    refit on every training split and its predictions on the held-out split
    are collected so that overall scores can be computed over all samples.

    Args:
        features: DataFrame of feature columns (one row per sample).
        labels: sequence of class labels, aligned row-by-row with ``features``.
            It must contain exactly three distinct values. The per-class
            result columns are filled in sorted label order, i.e. the smallest
            label goes to ``exploration_*``, the next to ``exploitation_*``
            and the largest to ``knowitem_*``.
            NOTE(review): confirm that the sorted label order matches these
            column names for the dataset in use.
        clf_name: one of 'nb', 'dt', 'knn', 'svm', 'rf', 'lr'.
        rand: seed for the seeded classifiers; ``0`` means "draw a random seed
            with ``randrange(1000)``".
        n_splits: number of cross-validation folds (default 10).

    Returns:
        A 4-tuple ``(performance, feature_importances, fold_performances,
        confusion)``:
          * ``performance``: one-row DataFrame (index 1) with the seed, the
            classifier name and the scores over all out-of-fold predictions.
          * ``feature_importances``: one row per fold with the random forest's
            ``feature_importances_``; empty for every other classifier.
          * ``fold_performances``: one row per fold (index 1..n_splits) with
            the classifier name and that fold's scores.
          * ``confusion``: confusion matrix of true labels vs. out-of-fold
            predictions.

    Raises:
        ValueError: if ``clf_name`` is unknown or ``labels`` does not contain
            exactly three distinct values.
    """
    performance_columns = ['random_state','clf_name', 'exploration_p', 'exploration_r', 'exploration_f1',
                           'exploitation_p', 'exploitation_r', 'exploitation_f1',
                           'knowitem_p', 'knowitem_r', 'knowitem_f1',
                           'avg_p', 'avg_r', 'avg_f1', 'accu']

    rand_num = randrange(1000) if rand == 0 else rand

    # Fail early on an unsupported name instead of hitting an unbound `clf` later.
    clf = _build_classifier(clf_name, rand_num)

    titles = list(features.columns)
    X, y = features, np.array(labels)

    class_labels = np.unique(y)  # sorted distinct labels
    if len(class_labels) != 3:
        raise ValueError(
            "classify3 expects exactly 3 distinct labels, got %d" % len(class_labels))

    kf = KFold(n_splits=n_splits)

    # Out-of-fold predictions are written at their own sample positions so they
    # stay aligned with `y` regardless of the order in which folds are visited.
    y_pred_all = np.empty_like(y)
    importance_rows = []
    fold_rows = []

    for train, test in kf.split(X):
        # KFold yields positional indices, so select rows by position (.iloc);
        # label-based .loc only works when the index is exactly 0..n-1.
        X_train, X_test = X.iloc[train, :], X.iloc[test, :]
        y_train, y_test = y[train], y[test]

        clf.fit(X_train, y_train)

        # importance (only recorded for the random forest)
        if clf_name == 'rf':
            importance_rows.append(clf.feature_importances_)

        # predict
        y_pred = clf.predict(X_test)
        y_pred_all[test] = y_pred

        fold_rows.append([clf_name] + _three_class_scores(y_test, y_pred, class_labels))

    # Build each result frame once from the collected rows; DataFrame.append
    # no longer exists in pandas 2.x and was quadratic when used in a loop.
    feature_importances = pd.DataFrame(importance_rows, columns=titles)
    fold_performances = pd.DataFrame(fold_rows,
                                     index=range(1, len(fold_rows) + 1),
                                     columns=performance_columns[1:])

    overall_row = [rand_num, clf_name] + _three_class_scores(y, y_pred_all, class_labels)
    performance = pd.DataFrame([overall_row], index = [1], columns=performance_columns)

    return performance, feature_importances, fold_performances, confusion_matrix(y, y_pred_all)

In [None]:
# Evaluate the random forest first: its per-fold feature importances are the
# ones kept in `feature_importances3`; the importances returned for the other
# classifiers go to a throwaway name so they do not overwrite it.
performances3, feature_importances3, fold_performances3, matrix = classify3(new_df, labels, 'rf')
print('rf')
print(matrix)

# Collect the one-row summaries and concatenate once at the end
# (DataFrame.append was removed in pandas 2.0).
performance_frames = [performances3]

# 'mlp' is not included: classify3 has no branch for it.
for clf in ['nb','lr','svm','knn','dt']:
    performance3, feature_importances3_tmp, fold_performances3, matrix = classify3(new_df, labels, clf)
    # Print after the call so each confusion matrix is shown under the
    # classifier that produced it, including the last one.
    print(clf)
    print(matrix)
    performance_frames.append(performance3)

performances3 = pd.concat(performance_frames, ignore_index=True)

In [None]:
# Display the combined summary table of overall scores (one row per evaluated classifier).
performances3

In [None]:
# Average each feature's importance over the rows of `feature_importances3`
# (column-wise mean), order the features from most to least important, and
# draw the result as a wide bar chart.
avg_imp = feature_importances3.mean(axis=0).sort_values(ascending=False)
avg_imp.plot(kind='bar', figsize=(20, 3))

In [None]:
# Write the sorted mean feature importances to a CSV file in the current working directory.
avg_imp.to_csv('./test_importance_output.csv')