In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the labelled training file and the test file in one pass. Both are
# parsed with header=None, so pandas assigns integer column labels.
train_bin, test_bin = (
    pd.read_csv(csv_name, header=None)
    for csv_name in ('TrainingDataMulti.csv', 'TestingDataMulti.csv')
)
train_bin.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,119,120,121,122,123,124,125,126,127,128
0,70.399324,127673.0908,-49.572308,127648.0176,-169.578319,127723.2374,65.689611,605.91099,-57.003571,626.78553,...,0,0,0,0,0,0,0,0,0,0
1,73.688102,130280.7109,-46.300719,130255.6377,-166.278082,130355.9307,71.831719,483.59351,-50.947407,500.98896,...,0,0,0,0,0,0,0,0,0,0
2,73.733939,130305.7842,-46.254883,130280.7109,-166.232245,130381.004,71.8088,483.59351,-50.91303,500.98896,...,0,0,0,0,0,0,0,0,0,0
3,74.083443,130581.5902,-45.899649,130556.5169,-165.882741,130656.81,72.152575,482.86107,-50.437475,499.15786,...,0,0,0,0,0,0,0,0,0,0
4,74.553268,131083.0556,-45.424094,131057.9823,-165.424375,131158.2754,72.118198,484.50906,-50.013486,497.69298,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Preview the first five rows of the test frame.
test_bin.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,-100.141563,132336.7191,139.881916,132311.6458,19.875906,132411.9389,-90.590357,231.45104,142.431578,254.33979,...,0,0,0,0,0,0,0,0,0,0
1,-100.267614,132311.6458,139.732947,132311.6458,19.738396,132411.9389,-92.280582,236.94434,141.944564,255.43845,...,0,0,0,0,0,0,0,0,0,0
2,-100.301992,132336.7191,139.710029,132286.5725,19.726937,132411.9389,-92.58998,237.67678,141.881539,255.43845,...,0,0,0,0,0,0,0,0,0,0
3,-100.468149,132336.7191,139.55533,132286.5725,19.560779,132411.9389,-94.331771,242.98697,141.222637,256.53711,...,0,0,0,0,0,0,0,0,0,0
4,-100.479609,132336.7191,139.538141,132311.6458,19.537861,132411.9389,-94.417715,243.35319,141.171071,256.354,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Every column except the last becomes the feature matrix; the last column of
# the training frame is used as the target vector.
x_full, y_full = train_bin.iloc[:, :-1], train_bin.iloc[:, -1]

# The test frame is taken whole (all rows, all columns) as the feature matrix.
test_full = test_bin.iloc[:, :]

## Part A

In [5]:
# Learn the scaling statistics from the labelled features, then apply the same
# fitted scaler to both the labelled features and the test features.
# NOTE(review): the scaler is fit on all labelled rows, including the rows the
# next cell holds out for validation, so the hold-out scores are computed on
# data that influenced the scaling statistics.
scaler = StandardScaler().fit(x_full)  # fit() returns the fitted scaler itself
x_full, test_full = scaler.transform(x_full), scaler.transform(test_full)

In [6]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_full,
    y_full,
    test_size=0.2,
    random_state=1,
)

In [7]:
# One independent, initially empty list per metric; each model cell below
# appends its validation score to these.
acc, precision, recall, f1 = [], [], [], []

In [9]:
# Logistic regression: fit on the training split, score on the hold-out split.
lr = LogisticRegression(random_state=1).fit(x_train, y_train)  # fit() returns the estimator

lr_pred, lr_predprob = lr.predict(x_test), lr.predict_proba(x_test)

# Accuracy is computed directly; precision, recall and F1 are macro-averaged
# across classes.
lr_acc = accuracy_score(y_test, lr_pred)
lr_precision, lr_recall, lr_f1 = (
    macro_metric(y_test, lr_pred, average='macro')
    for macro_metric in (precision_score, recall_score, f1_score)
)

# Record this model's scores in the shared metric lists.
acc += [lr_acc]
precision += [lr_precision]
recall += [lr_recall]
f1 += [lr_f1]

In [10]:
# Decision tree: fit on the training split, score on the hold-out split.
dt = DecisionTreeClassifier(random_state=1).fit(x_train, y_train)  # fit() returns the estimator

dt_pred, dt_predprob = dt.predict(x_test), dt.predict_proba(x_test)

# Accuracy is computed directly; precision, recall and F1 are macro-averaged
# across classes.
dt_acc = accuracy_score(y_test, dt_pred)
dt_precision, dt_recall, dt_f1 = (
    macro_metric(y_test, dt_pred, average='macro')
    for macro_metric in (precision_score, recall_score, f1_score)
)

# Record this model's scores in the shared metric lists.
acc += [dt_acc]
precision += [dt_precision]
recall += [dt_recall]
f1 += [dt_f1]

In [11]:
# Support vector classifier: fit on the training split, score on the hold-out
# split. probability=True is set so that predict_proba can be called below.
svm = SVC(random_state=1, probability=True).fit(x_train, y_train)  # fit() returns the estimator

svm_pred, svm_predprob = svm.predict(x_test), svm.predict_proba(x_test)

# Accuracy is computed directly; precision, recall and F1 are macro-averaged
# across classes.
svm_acc = accuracy_score(y_test, svm_pred)
svm_precision, svm_recall, svm_f1 = (
    macro_metric(y_test, svm_pred, average='macro')
    for macro_metric in (precision_score, recall_score, f1_score)
)

# Record this model's scores in the shared metric lists.
acc += [svm_acc]
precision += [svm_precision]
recall += [svm_recall]
f1 += [svm_f1]

In [12]:
# Validation-set summary: one row per model, one column per metric, rounded
# to four decimals.
#
# The table is built from each model's own score variables rather than from
# the shared `acc` / `precision` / `recall` / `f1` lists. Those lists are
# append-only, so re-running any model cell makes them longer than three
# entries and assigning them to this three-row frame then fails with a
# length-mismatch error. Running the model cells in a different order would
# instead attach scores to the wrong row label. Naming the scores explicitly
# keeps this cell idempotent and independent of execution order.
res = pd.DataFrame(
    {
        'accuracy': [lr_acc, dt_acc, svm_acc],
        'precision': [lr_precision, dt_precision, svm_precision],
        'recall': [lr_recall, dt_recall, svm_recall],
        'f1': [lr_f1, dt_f1, svm_f1],
    },
    index=['logistic regression', 'decision tree', 'svm'],
).round(4)
res

Unnamed: 0,accuracy,precision,recall,f1
logistic regression,0.7033,0.6501,0.6294,0.634
decision tree,0.8933,0.8748,0.8799,0.8772
svm,0.7383,0.71,0.6591,0.6701


In [14]:
# Refit the decision tree on every labelled (scaled) row and predict the
# scaled test rows in one chained call; fit() returns the estimator itself.
# NOTE(review): this refits the same `dt` object that was evaluated on the
# hold-out split earlier, so after this cell `dt` has been trained on the
# hold-out rows too.
test_pred = dt.fit(x_full, y_full).predict(test_full)
print(test_pred)

# Write the predictions as a single-column CSV without the row index.
pd.DataFrame(test_pred).to_csv('TestingResultsMulti.csv', index=False)

[2 2 2 2 2 2 1 1 2 2 2 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 0 0 0 0 0 0 1 1 1 1
 1 2 2 2 2 2 0 2 1 2 2 2 2 2 2 2 1 2 2 2 1 1 2 1 1 1 1 1 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 2 0 0 0 0 0 0 0 0]
