In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [2]:
train_bin = pd.read_csv('TrainingDataBinary.csv',header=None)
test_bin = pd.read_csv('TestingDataBinary.csv',header=None)
train_bin.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,119,120,121,122,123,124,125,126,127,128
0,70.399324,127673.0908,-49.572308,127648.0176,-169.578319,127723.2374,65.689611,605.91099,-57.003571,626.78553,...,0,0,0,0,0,0,0,0,0,0
1,73.688102,130280.7109,-46.300719,130255.6377,-166.278082,130355.9307,71.831719,483.59351,-50.947407,500.98896,...,0,0,0,0,0,0,0,0,0,0
2,73.733939,130305.7842,-46.254883,130280.7109,-166.232245,130381.004,71.8088,483.59351,-50.91303,500.98896,...,0,0,0,0,0,0,0,0,0,0
3,74.083443,130581.5902,-45.899649,130556.5169,-165.882741,130656.81,72.152575,482.86107,-50.437475,499.15786,...,0,0,0,0,0,0,0,0,0,0
4,74.553268,131083.0556,-45.424094,131057.9823,-165.424375,131158.2754,72.118198,484.50906,-50.013486,497.69298,...,0,0,0,0,0,0,0,0,0,0


In [3]:
test_bin.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,12.118057,131810.1804,-107.847846,131183.3486,132.146986,131860.3269,10.089787,370.79775,-109.664122,369.8822,...,0,0,0,0,0,0,0,0,0,0
1,12.049302,131810.1804,-107.92233,131183.3486,132.066772,131860.3269,9.986654,370.79775,-109.899035,370.61464,...,0,0,0,0,0,0,0,0,0,0
2,12.043573,131810.1804,-107.92806,131183.3486,132.061042,131860.3269,9.958006,370.79775,-109.887576,370.61464,...,0,0,0,0,0,0,0,0,0,0
3,12.037843,131810.1804,-107.939519,131183.3486,132.061042,131860.3269,9.980925,370.79775,-109.870387,370.61464,...,0,0,0,0,0,0,0,0,0,0
4,11.94617,131810.1804,-108.031192,131183.3486,131.963639,131860.3269,9.860604,370.79775,-110.059463,370.79775,...,0,0,0,0,0,0,0,0,0,0


In [4]:
x_full = train_bin.iloc[:,:-1]
y_full = train_bin.iloc[:,-1]

test_full = test_bin.iloc[:,:]

## Part A

In [29]:
scaler = StandardScaler()
#scaler = MinMaxScaler()
scaler.fit(x_full)
x_full = scaler.transform(x_full)
test_full = scaler.transform(test_full)

In [30]:
x_train,x_test,y_train,y_test = train_test_split(x_full,y_full,test_size=0.2,random_state=1)

In [31]:
acc = []
precision = []
recall = []
f1 = []

In [32]:
lr = LogisticRegression(random_state=1)
lr.fit(x_train,y_train)
lr_pred = lr.predict(x_test)
lr_predprob = lr.predict_proba(x_test)
lr_acc = accuracy_score(y_test,lr_pred)
lr_precision = precision_score(y_test,lr_pred)
lr_recall = recall_score(y_test,lr_pred)
lr_f1 = f1_score(y_test,lr_pred)

acc.append(lr_acc)
precision.append(lr_precision)
recall.append(lr_recall)
f1.append(lr_f1)

In [33]:
dt = DecisionTreeClassifier(random_state=1)
dt.fit(x_train,y_train)
dt_pred = dt.predict(x_test)
dt_predprob = dt.predict_proba(x_test)
dt_acc = accuracy_score(y_test,dt_pred)
dt_precision = precision_score(y_test,dt_pred)
dt_recall = recall_score(y_test,dt_pred)
dt_f1 = f1_score(y_test,dt_pred)

acc.append(dt_acc)
precision.append(dt_precision)
recall.append(dt_recall)
f1.append(dt_f1)

In [34]:
svm = SVC(random_state=1,probability=True)
svm.fit(x_train,y_train)
svm_pred = svm.predict(x_test)
svm_predprob = svm.predict_proba(x_test)
svm_acc = accuracy_score(y_test,svm_pred)
svm_precision = precision_score(y_test,svm_pred)
svm_recall = recall_score(y_test,svm_pred)
svm_f1 = f1_score(y_test,svm_pred)

acc.append(svm_acc)
precision.append(svm_precision)
recall.append(svm_recall)
f1.append(svm_f1)

In [35]:
res = pd.DataFrame(data=None,columns=['accuracy','precision','recall','f1'],index=['logistic regression','decision tree','svm'])
res['accuracy'] = np.round(acc,4)
res['precision'] = np.round(precision,4)
res['recall'] = np.round(recall,4)
res['f1'] = np.round(f1,4)
res

Unnamed: 0,accuracy,precision,recall,f1
logistic regression,0.8817,0.9173,0.8458,0.8801
decision tree,0.9417,0.949,0.9367,0.9428
svm,0.8958,0.924,0.8685,0.8954


In [36]:
dt.fit(x_full,y_full)
test_pred = dt.predict(test_full)
print(test_pred)
pd.DataFrame(data=test_pred).to_csv('TestingResultsBinary.csv',index=False)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 0 0 0 1 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1
 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
