In [1]:
import pandas as pd, numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix, precision_score
from scipy import stats

In [2]:
df_train = pd.read_csv('train_mode_file.csv', index_col = 0)
df_test = pd.read_csv('test_mode_file.csv', index_col = 0)

In [3]:
len(df_train.columns) == len(df_test.columns)

True

In [4]:
print(len(df_train), len(df_test))

32561 16281


In [5]:
X_train = df_train.iloc[:, :-1]
Y_train = df_train.iloc[:, -1]

In [6]:
X_test = df_test.iloc[:, :-1]
Y_test = df_test.iloc[:, -1]

In [7]:
def rforest(X_train, Y_train, X_test):
    
    rforest = BaggingClassifier(base_estimator = RandomForestClassifier(n_estimators = 90))   
    rforest.fit(X_train, Y_train)
    y_pred = rforest.predict(X_test)
    
    return y_pred

In [8]:
def knn(X_train, Y_train, X_test):
    
    knn = BaggingClassifier(base_estimator = KNeighborsClassifier(n_neighbors = 15))    
    knn.fit(X_train, Y_train)
    y_pred = knn.predict(X_test)
    
    return y_pred

In [9]:
def nb(X_train, Y_train, X_test):
    
    nb = BaggingClassifier(base_estimator = GaussianNB())
    nb.fit(X_train, Y_train)
    y_pred = nb.predict(X_test)
    
    return y_pred

In [10]:
def lr(X_train, Y_train, X_test):
    
    lr = BaggingClassifier(base_estimator = LogisticRegression(C = 10))
    lr.fit(X_train, Y_train)
    y_pred = lr.predict(X_test)
    
    return y_pred

In [11]:
def svm(X_train, Y_train, X_test):
    svm = BaggingClassifier(base_estimator = SVC(C = 10, gamma = 0.01, kernel = 'rbf'))
    svm.fit(X_train, Y_train)
    y_pred = svm.predict(X_test)
    
    return y_pred

In [12]:
def ensemble(X_train, Y_train, X_test):
    r_y = rforest(X_train, Y_train, X_test)
    k_y = knn(X_train, Y_train, X_test)
    l_y = lr(X_train, Y_train, X_test)
    n_y = nb(X_train, Y_train, X_test)
    s_y = svm(X_train, Y_train, X_test)
    
    final_y = []
    
    for i in range(len(X_test)):
         final_y.append(stats.mode([r_y[i], k_y[i], l_y[i], n_y[i], s_y[i]])[0][0])
        
    return final_y

In [17]:
trial_run = ensemble(X_train, Y_train, X_test)



In [25]:
for i in range(4):
    print(['tn', 'fp', 'fn', 'tp'][i],confusion_matrix(trial_run, Y_test).ravel()[i])

tn 11236
fp 1559
fn 1199
tp 2287


In [26]:
accuracy_score(trial_run, Y_test)

0.8306000859898041

In [27]:
precision_score(trial_run, Y_test)

0.5946437857514301

In [28]:
recall_score(trial_run, Y_test)

0.6560527825588066

In [29]:
f1_score(trial_run, Y_test)

0.6238406983087834