# Import Libraries

In [2]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from performance_metrics import *
from sklearn.model_selection import train_test_split
import pandas as pd
from mlxtend.evaluate import bias_variance_decomp
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings('ignore')

# Load and Split Dataset

In [3]:
file_location_1 = 'Data/project3_dataset1.txt'
file_location_2 = 'Data/project3_dataset2.txt'
    

dataset_1 = pd.read_csv(file_location_1, header=None, sep='\t')
dataset_2 = pd.read_csv(file_location_2, header=None, sep='\t')
    
dataset_2[4] = dataset_2[4].replace(['Present','Absent'], [0,1]) 
    
dataset_1 = dataset_1.to_numpy()
dataset_2 = dataset_2.to_numpy()
#print(len(dataset_1))    
data_1 = dataset_1[:,:-1]
label_1 = np.array(dataset_1[:,-1], dtype='int')
    
    
data_2 = dataset_2[:,:-1]
label_2 = np.array(dataset_2[:,-1], dtype='int')
    
# We consider 80% data from training and remaining 20% for testing    
X1_train, X1_test, y1_train, y1_test = train_test_split(data_1, label_1, test_size=0.20, random_state=42, stratify=label_1)
data_2 = dataset_2[:,:-1]
label_2 = np.array(dataset_2[:,-1], dtype='int')

X2_train, X2_test, y2_train, y2_test = train_test_split(data_2, label_2, test_size=0.20, random_state=42, stratify=label_2)

# Logistic Regression

In [126]:
params = {
    'penalty' : ['l1','l2'], 
    'C'       : np.logspace(-3,3,7),
    'solver'  : ['newton-cg', 'lbfgs', 'liblinear'],
    'max_iter': [10, 50, 100]
}

log_reg = LogisticRegression(random_state=43)
GS = GridSearchCV(estimator=log_reg,param_grid=params,cv=10)

### For dataset 1
GS.fit(X1_train, y1_train)

print('Best Parameters:',GS.best_params_,end='\n\n')

Best Parameters: {'C': 100.0, 'max_iter': 50, 'penalty': 'l1', 'solver': 'liblinear'}



In [127]:
#training the Logistic Regression classifier with best parameters and whole training set 
log_reg = LogisticRegression(random_state=43, penalty='l1', C=100, solver='liblinear', max_iter = 50)
log_reg.fit(X1_train, y1_train)
y_pred = log_reg.predict(X1_test)
performance = calculate_performance(y_pred, y1_test)

print(performance)

{'accuracy': 0.956140350877193, 'precision': 0.9560729421281235, 'recall': 0.956140350877193, 'f1': 0.9560273762928301, 'roc_auc': 0.9503968253968254}


In [128]:
# bias and varience calculation for dataset 1
mse, bias, var = bias_variance_decomp(log_reg, X1_train, y1_train, X1_test, y1_test, loss='mse', num_rounds=200, random_seed=43)
# summarize results
print('MSE: %.3f' % mse)
print('Bias: %.3f' % bias)
print('Variance: %.3f' % var)

MSE: 0.055
Bias: 0.033
Variance: 0.022


In [129]:
### For dataset 2
GS.fit(X2_train, y2_train)

print('Best Parameters:',GS.best_params_,end='\n\n')
print('Best Score:',GS.best_score_)

Best Parameters: {'C': 10.0, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}

Best Score: 0.7288288288288288


In [142]:
#training the Logistic Regression classifier with best parameters and whole training set 
log_reg = LogisticRegression(random_state=43, penalty='l2', C=10, solver='lbfgs', max_iter = 100)
log_reg.fit(X2_train, y2_train)
y_pred = log_reg.predict(X2_test)
performance = calculate_performance(y_pred, y2_test)

print(performance)

{'accuracy': 0.6989247311827957, 'precision': 0.6855228299828581, 'recall': 0.6989247311827957, 'f1': 0.6866832092638544, 'roc_auc': 0.6367827868852459}


In [143]:
# bias and varience calculation for dataset 2
mse, bias, var = bias_variance_decomp(log_reg, X2_train, y2_train, X2_test, y2_test, loss='mse', num_rounds=200, random_seed=43)
# summarize results
print('MSE: %.3f' % mse)
print('Bias: %.3f' % bias)
print('Variance: %.3f' % var)

MSE: 0.275
Bias: 0.215
Variance: 0.060


# KNN

In [162]:
knn = KNeighborsClassifier()

k_range = list(range(1, 31))
params = dict(n_neighbors=k_range)

  
GS = GridSearchCV(estimator=knn,param_grid=params,cv=10)

### For dataset 1
GS.fit(X1_train, y1_train)

print('Best Parameters:',GS.best_params_,end='\n\n')


Best Parameters: {'n_neighbors': 7}



In [163]:
#training the Logistic Regression classifier with best parameters and whole training set 
knn = KNeighborsClassifier(n_neighbors = 7)
knn.fit(X1_train, y1_train)
y_pred = knn.predict(X1_test)
performance = calculate_performance(y_pred, y1_test)

print(performance)

{'accuracy': 0.9210526315789473, 'precision': 0.9234172387490467, 'recall': 0.9210526315789473, 'f1': 0.9215739274819476, 'roc_auc': 0.9226190476190476}


In [164]:
# bias and varience calculation for dataset 1
mse, bias, var = bias_variance_decomp(knn, X1_train, y1_train, X1_test, y1_test, loss='mse', num_rounds=200, random_seed=43)
# summarize results
print('MSE: %.3f' % mse)
print('Bias: %.3f' % bias)
print('Variance: %.3f' % var)

MSE: 0.083
Bias: 0.060
Variance: 0.023


In [165]:
### For dataset 2
GS.fit(X2_train, y2_train)

print('Best Parameters:',GS.best_params_,end='\n\n')
print('Best Score:',GS.best_score_)

Best Parameters: {'n_neighbors': 16}

Best Score: 0.6695945945945946


In [176]:
#training the Logistic Regression classifier with best parameters and whole training set 
knn = KNeighborsClassifier(n_neighbors = 20)
knn.fit(X2_train, y2_train)
y_pred = knn.predict(X2_test)
performance = calculate_performance(y_pred, y2_test)

print(performance)

{'accuracy': 0.7204301075268817, 'precision': 0.7277313155449356, 'recall': 0.7204301075268817, 'f1': 0.6765795161978715, 'roc_auc': 0.6160348360655737}


In [177]:
# bias and varience calculation for dataset 2
mse, bias, var = bias_variance_decomp(knn, X2_train, y2_train, X2_test, y2_test, loss='mse', num_rounds=200, random_seed=43)
# summarize results
print('MSE: %.3f' % mse)
print('Bias: %.3f' % bias)
print('Variance: %.3f' % var)

MSE: 0.289
Bias: 0.204
Variance: 0.085


# SVM

In [12]:
svc = SVC(random_state=43)

#params = {'C':[0.1,1,100,1000],
#         'kernel':['rbf','poly','sigmoid','linear'],
#         'degree':[1,2,3,4,5,6],
#         'gamma': [1, 0.1, 0.01, 0.001, 0.0001]}

params = {'C':[0.1,1,100], 
         'kernel':['rbf','sigmoid','linear'],
         'degree':[2,4,6],
         'gamma': [0.1, 0.001, 0.0001]}

GS = GridSearchCV(estimator=svc,param_grid=params,cv=10)

#### For dataset 1
GS.fit(X1_train, y1_train)

print('Best Parameters:',GS.best_params_,end='\n\n')
print('Best Score:',GS.best_score_)

Best Parameters: {'C': 1, 'degree': 2, 'gamma': 0.1, 'kernel': 'linear'}

Best Score: 0.9647342995169081


In [10]:
#training the decision tree with best parameters and whole training set 
svc = SVC(random_state=43, C = 1, kernel = 'linear' , degree = 2, gamma = 0.0001)
svc.fit(X1_train, y1_train)
y_pred = svc.predict(X1_test)
performance = calculate_performance(y_pred, y1_test)

print(performance)

{'accuracy': 0.956140350877193, 'precision': 0.9565052493664558, 'recall': 0.956140350877193, 'f1': 0.956244993396696, 'roc_auc': 0.9553571428571429}


In [11]:
# bias and varience calculation for dataset 1
mse, bias, var = bias_variance_decomp(svc, X1_train, y1_train, X1_test, y1_test, loss='mse', num_rounds=200, random_seed=43)
# summarize results
print('MSE: %.3f' % mse)
print('Bias: %.3f' % bias)
print('Variance: %.3f' % var)

MSE: 0.051
Bias: 0.033
Variance: 0.018


In [13]:
#### For dataset 2
GS.fit(X2_train, y2_train)

print('Best Parameters:',GS.best_params_,end='\n\n')
print('Best Score:',GS.best_score_)

Best Parameters: {'C': 0.1, 'degree': 2, 'gamma': 0.1, 'kernel': 'linear'}

Best Score: 0.7126126126126127


In [34]:
#training the decision tree with best parameters and whole training set 
svc = SVC(random_state=43, C = 0.1, kernel = 'linear', degree = 2 , gamma = 0.1 )
svc.fit(X2_train, y2_train)
y_pred = svc.predict(X2_test)
performance = calculate_performance(y_pred, y2_test)

print(performance)

{'accuracy': 0.7311827956989247, 'precision': 0.7210245107860817, 'recall': 0.7311827956989247, 'f1': 0.7184228395752651, 'roc_auc': 0.6688012295081966}


In [35]:
# bias and varience calculation for dataset 2
mse, bias, var = bias_variance_decomp(svc, X2_train, y2_train, X2_test, y2_test, loss='mse', num_rounds=200, random_seed=43)
# summarize results
print('MSE: %.3f' % mse)
print('Bias: %.3f' % bias)
print('Variance: %.3f' % var)

MSE: 0.259
Bias: 0.196
Variance: 0.064


# Decision Tree

In [40]:
decision_tree = DecisionTreeClassifier(random_state=43)

params = {'max_depth':[3,5,7,10,15],
          'min_samples_leaf':[3,5,10,15,20],
          'min_samples_split':[8,10,12,18,20,16],
          'criterion':['gini','entropy']}
GS = GridSearchCV(estimator=decision_tree,param_grid=params,cv=10)

#### For dataset 1
GS.fit(X1_train, y1_train)

print('Best Parameters:',GS.best_params_,end='\n\n')
print('Best Score:',GS.best_score_)

Best Parameters: {'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 5, 'min_samples_split': 8}

Best Score: 0.9471014492753623


In [69]:
#training the decision tree with best parameters and whole training set 
decision_tree = DecisionTreeClassifier(random_state=43, criterion='gini', max_depth=7, min_samples_leaf=5, min_samples_split=8)
decision_tree.fit(X1_train, y1_train)
y_pred = decision_tree.predict(X1_test)
performance = calculate_performance(y_pred, y1_test)

print(performance)

{'accuracy': 0.9210526315789473, 'precision': 0.9215784301894598, 'recall': 0.9210526315789473, 'f1': 0.9212409881140532, 'roc_auc': 0.9176587301587301}


In [70]:
# bias and varience calculation for dataset 1
mse, bias, var = bias_variance_decomp(decision_tree, X1_train, y1_train, X1_test, y1_test, loss='mse', num_rounds=200, random_seed=43)
# summarize results
print('MSE: %.3f' % mse)
print('Bias: %.3f' % bias)
print('Variance: %.3f' % var)

MSE: 0.080
Bias: 0.040
Variance: 0.040


In [43]:
### For dataset 2

GS.fit(X2_train, y2_train)

print('Best Parameters:',GS.best_params_,end='\n\n')
print('Best Score:',GS.best_score_)


Best Parameters: {'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 20, 'min_samples_split': 8}

Best Score: 0.7152402402402404


In [85]:
#training the decision tree with best parameters and whole training set 
decision_tree = DecisionTreeClassifier(random_state=43, criterion='gini', max_depth=7, min_samples_leaf=20, min_samples_split=8)
decision_tree.fit(X2_train, y2_train)
y_pred = decision_tree.predict(X2_test)
performance = calculate_performance(y_pred, y2_test)

print(performance)


{'accuracy': 0.6236559139784946, 'precision': 0.626474421635712, 'recall': 0.6236559139784946, 'f1': 0.6249957276350238, 'roc_auc': 0.5868340163934426}


In [86]:
# bias and varience calculation for dataset 2
mse, bias, var = bias_variance_decomp(decision_tree, X2_train, y2_train, X2_test, y2_test, loss='mse', num_rounds=200, random_seed=43)
# summarize results
print('MSE: %.3f' % mse)
print('Bias: %.3f' % bias)
print('Variance: %.3f' % var)

MSE: 0.339
Bias: 0.187
Variance: 0.152


# Random Forest

In [20]:
ran_forest = RandomForestClassifier(random_state=43)
params = {
          'bootstrap': [True, False],
          'max_depth':[7,10,15],
          'min_samples_leaf':[3, 4, 5],
          'min_samples_split':[8,10,12],
          'max_features': ['auto', 'sqrt'],
          'criterion':['gini','entropy'],
          'n_estimators': [100, 200, 300, 1000],
          }

GS = GridSearchCV(estimator=ran_forest,param_grid=params,cv=10)

#### For dataset 1
GS.fit(X1_train, y1_train)

print('Best Parameters:',GS.best_params_,end='\n\n')
print('Best Score:',GS.best_score_)

Best Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 7, 'max_features': 'auto', 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 300}

Best Score: 0.9713526570048309


In [99]:
#training the random forest with best parameters and whole training set 
ran_forest = RandomForestClassifier(random_state=43, bootstrap=False, max_depth=7, min_samples_leaf=3, min_samples_split=8, max_features='auto', criterion='entropy', n_estimators=300)
ran_forest.fit(X1_train, y1_train)
y_pred = ran_forest.predict(X1_test)
performance = calculate_performance(y_pred, y1_test)

print(performance)

{'accuracy': 0.9649122807017544, 'precision': 0.9658578263841422, 'recall': 0.9649122807017544, 'f1': 0.9650731808230041, 'roc_auc': 0.9672619047619049}


In [100]:
# bias and varience calculation for dataset 1
mse, bias, var = bias_variance_decomp(ran_forest, X1_train, y1_train, X1_test, y1_test, loss='mse', num_rounds=200, random_seed=43)
# summarize results
print('MSE: %.3f' % mse)
print('Bias: %.3f' % bias)
print('Variance: %.3f' % var)

MSE: 0.044
Bias: 0.029
Variance: 0.015


In [24]:
### For dataset 2

GS.fit(X2_train, y2_train)

print('Best Parameters:',GS.best_params_,end='\n\n')
print('Best Score:',GS.best_score_)

Best Parameters: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 7, 'max_features': 'auto', 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 100}

Best Score: 0.707057057057057


In [107]:
#training the decision tree with best parameters and whole training set 
ran_forest = RandomForestClassifier(random_state=43, bootstrap=True, max_depth=7, min_samples_leaf=3, min_samples_split=8, max_features='auto', criterion='entropy', n_estimators=100)
ran_forest.fit(X2_train, y2_train)
y_pred = ran_forest.predict(X2_test)
performance = calculate_performance(y_pred, y2_test)

print(performance)

{'accuracy': 0.6774193548387096, 'precision': 0.6588327619677006, 'recall': 0.6774193548387096, 'f1': 0.6597697404149017, 'roc_auc': 0.6055327868852459}


In [108]:
# bias and varience calculation for dataset 2
mse, bias, var = bias_variance_decomp(ran_forest, X2_train, y2_train, X2_test, y2_test, loss='mse', num_rounds=200, random_seed=43)
# summarize results
print('MSE: %.3f' % mse)
print('Bias: %.3f' % bias)
print('Variance: %.3f' % var)

MSE: 0.303
Bias: 0.227
Variance: 0.075


# AdaBoost

In [35]:
dtc = DecisionTreeClassifier(random_state = 43, max_depth=1)
abc = AdaBoostClassifier(base_estimator = dtc)

params = {
    'n_estimators': [100, 200, 300, 500, 1000],
    'learning_rate': [0.01, 0.1, 1],
}

GS = GridSearchCV(estimator=abc,param_grid=params,cv=10)

#### For dataset 1
GS.fit(X1_train, y1_train)

print('Best Parameters:',GS.best_params_,end='\n\n')
print('Best Score:',GS.best_score_)

Best Parameters: {'learning_rate': 1, 'n_estimators': 300}

Best Score: 0.9734299516908212


In [25]:
#training the adaboost with best parameters and whole training set 
abc = AdaBoostClassifier(learning_rate= 1, n_estimators= 300)
abc.fit(X1_train, y1_train)
y_pred = abc.predict(X1_test)
performance = calculate_performance(y_pred, y1_test)

print(performance)

{'accuracy': 0.9824561403508771, 'precision': 0.9824561403508771, 'recall': 0.9824561403508771, 'f1': 0.9824561403508771, 'roc_auc': 0.9811507936507938}


In [26]:
# bias and varience calculation for dataset 1
mse, bias, var = bias_variance_decomp(abc, X1_train, y1_train, X1_test, y1_test, loss='mse', num_rounds=200, random_seed=43)
# summarize results
print('MSE: %.3f' % mse)
print('Bias: %.3f' % bias)
print('Variance: %.3f' % var)

MSE: 0.037
Bias: 0.017
Variance: 0.020


In [37]:
### For dataset 2

GS.fit(X2_train, y2_train)

print('Best Parameters:',GS.best_params_,end='\n\n')
print('Best Score:',GS.best_score_)

Best Parameters: {'learning_rate': 0.01, 'n_estimators': 500}

Best Score: 0.7317567567567568


In [27]:
#training the decision tree with best parameters and whole training set 
abc = AdaBoostClassifier(learning_rate = 0.01, n_estimators= 500)
abc.fit(X2_train, y2_train)
y_pred = abc.predict(X2_test)
performance = calculate_performance(y_pred, y2_test)

print(performance)

{'accuracy': 0.7204301075268817, 'precision': 0.7103942652329749, 'recall': 0.7204301075268817, 'f1': 0.6956799493991145, 'roc_auc': 0.6383196721311475}


In [28]:
# bias and varience calculation for dataset 2
mse, bias, var = bias_variance_decomp(abc, X2_train, y2_train, X2_test, y2_test, loss='mse', num_rounds=200, random_seed=43)
# summarize results
print('MSE: %.3f' % mse)
print('Bias: %.3f' % bias)
print('Variance: %.3f' % var)

MSE: 0.283
Bias: 0.206
Variance: 0.077
