In [1]:
import pandas as pd
import numpy as np
import csv

from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
def get_train_features(file_name):
    data = pd.read_csv(file_name, header = None)
    data = data.drop(data.columns[0], axis=1)
    return data

def get_test_features(file_name):
    data = pd.read_csv(file_name, header = None)
    indexes  = data.iloc[:,0]
    data = data.drop(data.columns[0], axis=1)
    return indexes, data

def get_labels(file_name):
    data = pd.read_csv(file_name)
    data = data.drop(data.columns[0], axis=1)
    return data.values.ravel()

In [3]:
#Loading and Normalizing the data
train_features_file = "train_features.csv"
train_features = get_train_features(train_features_file)
train_features = preprocessing.normalize(train_features)

train_label_file = "train_labels.csv"
train_labels = get_labels(train_label_file)

test_features_file = "test_features.csv"
indexes, test_features = get_test_features(test_features_file)
test_features = preprocessing.normalize(test_features)

In [4]:
#Calculation of Confusion Matrix
def confusion_matrix(original, predicted):
    accuracy = precision = recall = f_measure = 0
    TP = FN = FP = TN = 0
    for i in range(len(original)):
        if original[i] == 1 and predicted[i] == 1:
            TP += 1
        elif original[i] == 1 and predicted[i] == 0:
            FN += 1
        elif original[i] == 0 and predicted[i] == 1:
            FP += 1
        else:
            TN += 1
            
    accuracy = (float(TP + TN)/(TP + FN + FP + TN))
    f_measure = (float(2 * TP) / ((2 * TP) + FN + FP))
            
    return accuracy, f_measure

In [224]:
#K nearest neighbors
string = "knn"
knn = KNeighborsClassifier(n_neighbors=13)

#Train and Test Validation Accuracy
X_train, X_val, y_train, y_val = train_test_split(train_features, train_labels, test_size=0.25, random_state=1)
Y_knn_pred = knn.fit(X_train,y_train).predict(X_val)
acc_knn,f1_knn = confusion_matrix(y_val,Y_knn_pred)
print('Accuracy for training and validation sets for Knn : '+ str(acc_knn * 100))
print('F1-score for training and validation sets for Knn : '+ str(f1_knn * 100))


#Test Submission
Y_knn = knn.fit(train_features,train_labels).predict(test_features)
final_df = pd.concat([pd.DataFrame(indexes), pd.DataFrame(Y_knn)], axis=1)
final_df.columns=["id","label"]
final_df.to_csv(string+"_accuracy.csv",index=False)

Accuracy for training and validation sets for Knn : 69.0
F1-score for training and validation sets for Knn : 80.98159509202453


In [231]:
#Bagging
string = "bagging"
bag = BaggingClassifier(n_estimators=8, max_samples=1.0, max_features=1.0)

#Train and Validation Accuracy
X_train, X_val, y_train, y_val = train_test_split(train_features, train_labels, test_size=0.25, random_state=1)
Y_bag_pred = bag.fit(X_train,y_train).predict(X_val)
acc_bag,f1_bag = confusion_matrix(y_val,Y_bag_pred)
print('Accuracy for training and validation sets for Bagging : '+ str(acc_bag * 100))
print('F1-score for training and validation sets for Bagging : '+ str(f1_bag * 100))

#Test Submission
Y_bag = bag.fit(train_features,train_labels).predict(test_features)
final_df = pd.concat([pd.DataFrame(indexes), pd.DataFrame(Y_bag)], axis=1)
final_df.columns=["id","label"]
final_df.to_csv(string+"_accuracy.csv",index=False)

Accuracy for training and validation sets for Bagging : 75.0
F1-score for training and validation sets for Bagging : 82.75862068965517


In [192]:
#Decision Tree
string = "decision_tree"

tree = DecisionTreeClassifier(criterion = 'gini')

#Train and Test Validation Accuracy
X_train, X_val, y_train, y_val = train_test_split(train_features, train_labels, test_size=0.25, random_state=1)
Y_dt_pred = tree.fit(X_train,y_train).predict(X_val)
acc_dt,f1_dt = confusion_matrix(y_val,Y_dt_pred)
print('Accuracy for training and validation sets for decision tree : '+ str(acc_dt * 100))
print('F1-score for training and validation sets for decision tree : '+ str(f1_dt * 100))

#Test Submission
Y_dt = tree.fit(train_features,train_labels).predict(test_features)
final_df = pd.concat([pd.DataFrame(indexes), pd.DataFrame(Y_dt)], axis=1)
final_df.columns=["id","label"]
final_df.to_csv(string+"_accuracy.csv",index=False)

Accuracy for training and validation sets for decision tree : 77.0
F1-score for training and validation sets for decision tree : 84.35374149659864


In [243]:
#Random Forest
string = "Random_Forest"

rfc = RandomForestClassifier(criterion='gini', n_estimators = 10) 

#Train and Test Validation Accuracy
X_train, X_val, y_train, y_val = train_test_split(train_features, train_labels, test_size=0.25, random_state=1)
Y_rf_pred = rfc.fit(X_train,y_train).predict(X_val)
acc_rf,f1_rf = confusion_matrix(y_val,Y_rf_pred)
print('Accuracy for training and validation sets for random forest : '+ str(acc_rf * 100))
print('F1-score for training and validation sets for random forest : '+ str(f1_rf * 100))

#Test Submission
Y_rf = rfc.fit(train_features,train_labels).predict(test_features)
final_df = pd.concat([pd.DataFrame(indexes), pd.DataFrame(Y_rf)], axis=1)
final_df.columns=["id","label"]
final_df.to_csv(string+"_accuracy.csv",index=False)

Accuracy for training and validation sets for random forest : 77.0
F1-score for training and validation sets for random forest : 85.16129032258064


In [233]:
#Adaboost Classifier
string = "AdaBoost"
AdBC= AdaBoostClassifier(n_estimators=5, learning_rate=0.2)
# Y_adbc = AdBC.fit(train_features,train_labels).predict(test_features)

X_train, X_val, y_train, y_val = train_test_split(train_features, train_labels, test_size=0.25, random_state=1)
Y_adbc_pred = AdBC.fit(X_train,y_train).predict(X_val)
acc_adbc,f1_adbc = confusion_matrix(y_val,Y_adbc_pred)
print('Accuracy for training and validation sets for Adaboost Classifier : '+ str(acc_adbc * 100))
print('F1-score for training and validation sets for Adaboost Classifier : '+ str(f1_adbc * 100))

Y_adbc = AdBC.fit(train_features,train_labels).predict(test_features)
final_df = pd.concat([pd.DataFrame(indexes), pd.DataFrame(Y_adbc)], axis=1)
final_df.columns=["id","label"]
final_df.to_csv(string+"_accuracy.csv",index=False)

Accuracy for training and validation sets for Adaboost Classifier : 71.0
F1-score for training and validation sets for Adaboost Classifier : 83.04093567251462


In [234]:
#GridSearchAdaboost with varied number of estimators and learning rates
string = 'GridSearch_Adaboost'
grid = dict()
grid['n_estimators'] = [10, 50, 100, 500]
grid['learning_rate'] = [0.0001, 0.001, 0.01, 0.1, 1.0]
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
model = AdaBoostClassifier()
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy')

#Train and Validation Accuracy
X_train, X_val, y_train, y_val = train_test_split(train_features, train_labels, test_size=0.25, random_state=1)
model = grid_search.fit(X_train, y_train)
y_pred_val = model.predict(X_val)
acc_grid,f1_grid = confusion_matrix(y_val,y_pred_val)
print('Accuracy for training and validation sets for Grid Search : '+ str(acc_grid * 100))
print('F1-score for training and validation sets for Grid Search : '+ str(f1_grid * 100))

#Test Submission
y_pred = grid_search.fit(train_features,train_labels).predict(test_features)
y_pred = pd.DataFrame(y_pred)
y_pred['id'] = y_pred.index
y_pred['label'] = y_pred[0]
del y_pred[0]
y_pred.to_csv(string+'_predicted_labels.csv',index = False)

Accuracy for training and validation sets for Grid Search : 75.0
F1-score for training and validation sets for Grid Search : 83.44370860927152


In [5]:
#Multiple Adaboost Regressors 
def get_value(predicted):
    res1 = []
    for i1 in Y1:
        _ = res1.append(1) if i1 >= 0.5 else res1.append(0)
    return res1

string = "AdaBoostRegressor"

X_train, X_val, y_train, y_val = train_test_split(train_features, train_labels, test_size=0.25, random_state=1)

AdbcR1  = AdaBoostRegressor(n_estimators =1300,loss='exponential',learning_rate=1)
# Y1 = AdbcR1.fit(train_features,train_labels).predict(test_features)
# Y1 = get_value(Y1)

Y1 = AdbcR1.fit(X_train,y_train).predict(X_val)
Y1 = get_value(Y1)
    
AdbcR2  = AdaBoostRegressor(n_estimators =1300,loss='square',learning_rate=1)
# Y2 = AdbcR2.fit(train_features,train_labels).predict(test_features)
# Y2 = get_value(Y2)

Y2 = AdbcR2.fit(X_train,y_train).predict(X_val)
Y2 = get_value(Y2)


y_ada_regress = []
for x in range(len(Y1)):
    zeroes = ones = 0
    if(Y1[x] == 0):
        zeroes += 1
    else:
        ones += 1

    if(Y2[x] == 0):
        zeroes += 1
    else:
        ones += 1
        
    _ = y_ada_regress.append(1) if ones > zeroes else y_ada_regress.append(0) 
    
acc_adbcR,f1_adbcR = confusion_matrix(y_val,y_ada_regress)

print('Accuracy for training and validation sets for Adaboost Regressor : '+ str(acc_adbcR * 100))
print('F1-score for training and validation sets for Adaboost Regressor : '+ str(f1_adbcR * 100))

final_df = pd.concat([pd.DataFrame(indexes), pd.DataFrame(y_ada_regress)], axis=1)
final_df.columns=["id","label"]
final_df.to_csv(string+"_accuracy.csv",index=False)

Accuracy for training and validation sets for Adaboost Regressor : 78.0
F1-score for training and validation sets for Adaboost Regressor : 84.93150684931507


In [255]:
#NaiveBayes_bernoulli
string = "naivebayes_bernoulli"

nb = BernoulliNB(alpha = 1e-02, binarize=0.0, fit_prior=True, class_prior=None)

#Train and Test Validation Accuracy
X_train, X_val, y_train, y_val = train_test_split(train_features, train_labels, test_size=0.25, random_state=1)
Y_nb_pred = nb.fit(X_train,y_train).predict(X_val)
acc_nb,f1_nb = confusion_matrix(y_val,Y_nb_pred)
print('Accuracy for training and validation sets for Naive Bayes Bernoulli : '+ str(acc_nb * 100))
print('F1-score for training and validation sets for Naive Bayes Bernoulli : '+ str(f1_nb * 100))

#Test Submission
Y_nb = nb.fit(train_features,train_labels).predict(test_features)
final_df = pd.concat([pd.DataFrame(indexes), pd.DataFrame(Y_nb)], axis=1)
final_df.columns=["id","label"]
final_df.to_csv(string+"_accuracy.csv",index=False)

Accuracy for training and validation sets for Naive Bayes Bernoulli : 81.0
F1-score for training and validation sets for Naive Bayes Bernoulli : 87.24832214765101
