In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,auc,roc_curve,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [2]:
## Read the two headerless CSV files: the training set (its last column is later used as
## the label) and the testing set (used entirely as features further down).
train_bin, test_bin = (
    pd.read_csv(csv_name, header=None)
    for csv_name in ('TrainingDataMulti.csv', 'TestingDataMulti.csv')
)
train_bin.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,119,120,121,122,123,124,125,126,127,128
0,70.399324,127673.0908,-49.572308,127648.0176,-169.578319,127723.2374,65.689611,605.91099,-57.003571,626.78553,...,0,0,0,0,0,0,0,0,0,0
1,73.688102,130280.7109,-46.300719,130255.6377,-166.278082,130355.9307,71.831719,483.59351,-50.947407,500.98896,...,0,0,0,0,0,0,0,0,0,0
2,73.733939,130305.7842,-46.254883,130280.7109,-166.232245,130381.004,71.8088,483.59351,-50.91303,500.98896,...,0,0,0,0,0,0,0,0,0,0
3,74.083443,130581.5902,-45.899649,130556.5169,-165.882741,130656.81,72.152575,482.86107,-50.437475,499.15786,...,0,0,0,0,0,0,0,0,0,0
4,74.553268,131083.0556,-45.424094,131057.9823,-165.424375,131158.2754,72.118198,484.50906,-50.013486,497.69298,...,0,0,0,0,0,0,0,0,0,0


In [3]:
## preview the first rows of the testing dataset
test_bin.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,-100.141563,132336.7191,139.881916,132311.6458,19.875906,132411.9389,-90.590357,231.45104,142.431578,254.33979,...,0,0,0,0,0,0,0,0,0,0
1,-100.267614,132311.6458,139.732947,132311.6458,19.738396,132411.9389,-92.280582,236.94434,141.944564,255.43845,...,0,0,0,0,0,0,0,0,0,0
2,-100.301992,132336.7191,139.710029,132286.5725,19.726937,132411.9389,-92.58998,237.67678,141.881539,255.43845,...,0,0,0,0,0,0,0,0,0,0
3,-100.468149,132336.7191,139.55533,132286.5725,19.560779,132411.9389,-94.331771,242.98697,141.222637,256.53711,...,0,0,0,0,0,0,0,0,0,0
4,-100.479609,132336.7191,139.538141,132311.6458,19.537861,132411.9389,-94.417715,243.35319,141.171071,256.354,...,0,0,0,0,0,0,0,0,0,0


In [6]:
## Split the training frame by position: every column except the last is a feature,
## the last column is the label. The testing frame is taken as a plain array of features.

n_features = train_bin.shape[1] - 1
x_full = train_bin.iloc[:, :n_features]
y_full = train_bin.iloc[:, n_features]

test_full = test_bin.to_numpy()

In [7]:
## Standardize the features: learn the per-column mean/std from the labeled data and
## apply the same transformation to both the labeled data and the testing data.
## NOTE(review): the scaler is fitted on all labeled rows before the hold-out split is
## made, so the rows later used for evaluation also contribute to the scaling statistics.

scaler = StandardScaler().fit(x_full)
x_full, test_full = scaler.transform(x_full), scaler.transform(test_full)

In [8]:
## check the dimensions (rows, columns) of the standardized training features
x_full.shape

(6000, 128)

In [9]:
## Hold out 20% of the labeled data (fixed seed for repeatability):
## x_train/y_train are used to fit and tune the models,
## x_test/y_test are used to evaluate the models' performance.
holdout_split = train_test_split(x_full, y_full, test_size=0.2, random_state=1)
x_train, x_test, y_train, y_test = holdout_split

In [10]:
## Containers filled in by each model cell below:
##   res      - one [accuracy, precision, recall, f1] row per model
##   cm_list  - confusion matrix per model, keyed by model name
##   predprob - predicted class probabilities per model (for drawing ROC curves)
##   clapse   - elapsed running time per model, in seconds

res, predprob, clapse = [], [], []
cm_list = {}

In [12]:
## logistic regression
## Tune the penalty type and C with 5-fold cross-validation on the training split,
## then evaluate the best model on the hold-out split and record the results.
start = time.time()
lr = LogisticRegression(random_state=1,solver='liblinear')
## liblinear only implements the 'l1' and 'l2' penalties. 'elasticnet' requires
## solver='saga', so listing it here only produced failed fits (scored as NaN and
## hidden by the global warnings filter) and wasted search time.
lr_param = {'penalty':['l1','l2'],'C':[0.001,0.01,0.1,1,10,100,1000]}
clf_lr = GridSearchCV(lr,lr_param,cv=5,n_jobs=-1)

clf_lr.fit(x_train,y_train)
print(f'Best logistic regression parameters are:{clf_lr.best_params_}')

## GridSearchCV refits the best parameter combination on all of x_train by default
## (refit=True), so best_estimator_ is already trained and needs no second fit.
best_lr = clf_lr.best_estimator_

lr_pred = best_lr.predict(x_test)
lr_predprob = best_lr.predict_proba(x_test)

## macro averaging: every class contributes equally, regardless of its frequency
lr_acc = accuracy_score(y_test,lr_pred)
lr_precision = precision_score(y_test,lr_pred,average='macro')
lr_recall = recall_score(y_test,lr_pred,average='macro')
lr_f1 = f1_score(y_test,lr_pred,average='macro')
lr_cm = confusion_matrix(y_test,lr_pred)

## NOTE: these appends are not idempotent - re-running this cell adds a duplicate entry
res.append([lr_acc,lr_precision,lr_recall,lr_f1])
cm_list['logistic regression'] = lr_cm
predprob.append(lr_predprob)
clapse.append(time.time()-start)

Best logistic regression parameters are:{'C': 100, 'penalty': 'l1'}


In [13]:
## hold-out accuracy and macro-averaged precision, recall, f1 for logistic regression
lr_acc,lr_precision,lr_recall,lr_f1

(0.7075, 0.6437507378420982, 0.628955181820495, 0.6318819668606325)

In [14]:
## decision tree
## Tune the tree's split criterion, depth and split/leaf/feature limits with 5-fold
## cross-validation on the training split, then evaluate the best tree on the hold-out
## split and record the results.
start = time.time()
dt = DecisionTreeClassifier(random_state=1)
## Grid fixes (the invalid values only produced failed fits hidden by the warnings filter):
##  - 'entropy' had a trailing space, so the entropy criterion was never actually evaluated
##  - min_samples_split must be at least 2 when given as an integer, so 1 is dropped
##  - max_features='auto' was a deprecated alias of 'sqrt' (removed in recent scikit-learn),
##    so it is dropped as a duplicate
dt_param = {'criterion':["gini","entropy"],
          'max_depth':range(4,30),
          'min_samples_split':range(2,4),
          'min_samples_leaf':range(1,4),
          'max_features':['sqrt', 'log2']
         }

clf_dt = GridSearchCV(dt,dt_param,cv=5,n_jobs=-1)
clf_dt.fit(x_train,y_train)
print(f'Best decision tree parameters are:\n {clf_dt.best_params_}')

## GridSearchCV refits the best parameter combination on all of x_train by default
## (refit=True), so best_estimator_ is already trained and needs no second fit.
best_dt = clf_dt.best_estimator_

dt_pred = best_dt.predict(x_test)
dt_predprob = best_dt.predict_proba(x_test)

## macro averaging: every class contributes equally, regardless of its frequency
dt_acc = accuracy_score(y_test,dt_pred)
dt_precision = precision_score(y_test,dt_pred,average='macro')
dt_recall = recall_score(y_test,dt_pred,average='macro')
dt_f1 = f1_score(y_test,dt_pred,average='macro')
dt_cm = confusion_matrix(y_test,dt_pred)

## NOTE: these appends are not idempotent - re-running this cell adds a duplicate entry
res.append([dt_acc,dt_precision,dt_recall,dt_f1])
cm_list['decision tree'] = dt_cm
predprob.append(dt_predprob)
clapse.append(time.time()-start)

Best decision tree parameters are:
 {'criterion': 'gini', 'max_depth': 22, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2}


In [15]:
## hold-out accuracy and macro-averaged precision, recall, f1 for the decision tree
dt_acc,dt_precision,dt_recall,dt_f1

(0.895, 0.8791342909914777, 0.8794677680320815, 0.8791155331545522)

In [16]:
## svm
## Tune the kernel, C and (for the polynomial kernel) the degree with 5-fold
## cross-validation on the training split, then evaluate the best model on the hold-out
## split and record the results.
start = time.time()
## probability=True is required so that predict_proba is available below
svm = SVC(random_state=1,probability=True)
## two sub-grids so that 'degree' is only searched for the polynomial kernel
svm_param = [{'C':[0.001,0.01,0.1,1,10,100,1000],'kernel':['linear','rbf']},
             {'C':[0.001,0.01,0.1,1,10,100,1000],'kernel':['poly'],'degree':[2,3,4]}
            ]
clf_svm = GridSearchCV(svm,svm_param,cv=5,n_jobs=-1)
clf_svm.fit(x_train,y_train)
print(f'SVM best parameters are:\n {clf_svm.best_params_}')

## GridSearchCV refits the best parameter combination on all of x_train by default
## (refit=True), so best_estimator_ is already trained. The former second fit only
## repeated that expensive training and inflated the recorded running time.
best_svm = clf_svm.best_estimator_

svm_pred = best_svm.predict(x_test)
svm_predprob = best_svm.predict_proba(x_test)

## macro averaging: every class contributes equally, regardless of its frequency
svm_acc = accuracy_score(y_test,svm_pred)
svm_precision = precision_score(y_test,svm_pred,average='macro')
svm_recall = recall_score(y_test,svm_pred,average='macro')
svm_f1 = f1_score(y_test,svm_pred,average='macro')
svm_cm = confusion_matrix(y_test,svm_pred)

## NOTE: these appends are not idempotent - re-running this cell adds a duplicate entry
res.append([svm_acc,svm_precision,svm_recall,svm_f1])
cm_list['svm'] = svm_cm
predprob.append(svm_predprob)
clapse.append(time.time()-start)

SVM best parameters are:
 {'C': 1000, 'kernel': 'rbf'}


In [17]:
## hold-out accuracy and macro-averaged precision, recall, f1 for the svm
svm_acc,svm_precision,svm_recall,svm_f1

(0.8625, 0.85030024382652, 0.8282645532001641, 0.8359264743091153)

In [18]:
## Tabulate the hold-out metrics: one row per model, one column per metric.
## The row labels assume the three model cells above were each run once, in this order.
metric_names = ['accuracy','precision','recall','f1 score']
model_names = ['logistic regression','decision tree','svm']
res = pd.DataFrame(res, index=model_names, columns=metric_names)
res

Unnamed: 0,accuracy,precision,recall,f1 score
logistic regression,0.7075,0.643751,0.628955,0.631882
decision tree,0.895,0.879134,0.879468,0.879116
svm,0.8625,0.8503,0.828265,0.835926


In [29]:
## finally train the selected model (the decision tree) on the full labeled dataset and
## compute the labels for the testing dataset.
best_dt.fit(x_full,y_full)
pred = best_dt.predict(test_full)
out = test_bin.copy()
## Append the predictions as a new last column. The frame was read with header=None, so
## its columns are numbered 0..n-1 and the next free column label equals the column count
## (this replaces the hard-coded 128).
out[out.shape[1]] = pred
## The input CSV files have no header row (they are read with header=None), so none is
## written either; otherwise the result file would start with a row of column numbers
## that the same read_csv call would treat as data.
out.to_csv("TestingResultsMulti.csv", index=False, header=False)

In [30]:
## inspect the predicted labels for the testing dataset
pred

array([2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 0,
       0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1,
       2, 2, 2, 2, 2, 1, 1, 1, 1, 0, 2, 2, 2, 2, 0, 2, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 2, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1], dtype=int64)