In [1]:
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from scipy import stats

In [6]:
# Read the train and test CSV files; column 0 of each file is used as the index.
df_train, df_test = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('train_mode_file.csv', 'test_mode_file.csv')
)

In [7]:
# Sanity check: both tables must have the same number of columns.
df_train.shape[1] == df_test.shape[1]

True

In [8]:
# Row counts of the train and test tables, in that order.
print(df_train.shape[0], df_test.shape[0])

32561 16281


In [9]:
# Features are every column except the last one; the last column is the target.
X_train, y_train = df_train.iloc[:, :-1], df_train.iloc[:, -1]

In [10]:
# Hold out part of the labelled table for evaluation.
# The split reads straight from df_train (features = every column but the last,
# target = last column) rather than from X_train / y_train, because this cell
# rebinds those two names: re-running it would otherwise split the already
# reduced training set a second time. random_state=42 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    df_train.iloc[:, :-1],
    df_train.iloc[:, -1],
    random_state=42,
)

In [11]:
def rforest(X_train, y_train, X_test, random_state=None):
    """Fit a random forest on the training data and predict labels for ``X_test``.

    Parameters
    ----------
    X_train : array-like
        Training features, passed to ``RandomForestClassifier.fit``.
    y_train : array-like
        Training labels, passed to ``RandomForestClassifier.fit``.
    X_test : array-like
        Features to predict on.
    random_state : int, RandomState instance or None, default=None
        Seed forwarded to ``RandomForestClassifier``. The default ``None``
        keeps the estimator unseeded (predictions may differ between runs);
        pass an integer to make the predictions reproducible.

    Returns
    -------
    y_pred
        The value returned by ``RandomForestClassifier.predict(X_test)``.
    """
    # Named ``forest`` so the local does not shadow this function's own name.
    forest = RandomForestClassifier(random_state=random_state)
    forest.fit(X_train, y_train)
    return forest.predict(X_test)

In [12]:
def knn(X_train, y_train, X_test):
    """Fit a ``KNeighborsClassifier`` built with no arguments and predict ``X_test``.

    The classifier is fitted on ``(X_train, y_train)`` and the result of its
    ``predict(X_test)`` call is returned unchanged.
    """
    neighbours_model = KNeighborsClassifier()
    return neighbours_model.fit(X_train, y_train).predict(X_test)

In [13]:
def nb(X_train, y_train, X_test):
    """Fit a ``GaussianNB`` built with no arguments and predict ``X_test``.

    The classifier is fitted on ``(X_train, y_train)`` and the result of its
    ``predict(X_test)`` call is returned unchanged.
    """
    bayes_model = GaussianNB()
    return bayes_model.fit(X_train, y_train).predict(X_test)

In [14]:
def lr(X_train, y_train, X_test):
    """Fit a ``LogisticRegression`` built with no arguments and predict ``X_test``.

    The classifier is fitted on ``(X_train, y_train)`` and the result of its
    ``predict(X_test)`` call is returned unchanged.
    """
    logistic_model = LogisticRegression()
    return logistic_model.fit(X_train, y_train).predict(X_test)

In [15]:
def svm(X_train, y_train, X_test):
    """Fit an ``SVC`` built with no arguments and predict ``X_test``.

    The classifier is fitted on ``(X_train, y_train)`` and the result of its
    ``predict(X_test)`` call is returned unchanged.
    """
    support_vector_model = SVC()
    return support_vector_model.fit(X_train, y_train).predict(X_test)

In [16]:
def pearson(x, y):
    """Return the Pearson correlation coefficient of two equal-length 1-D sequences.

    The coefficient is ``sum(dx * dy) / sqrt(sum(dx**2) * sum(dy**2))`` where
    ``dx`` and ``dy`` are the deviations of ``x`` and ``y`` from their means.
    Centring the data first avoids the catastrophic cancellation of the
    one-pass ``E[x^2] - E[x]^2`` form when the values are large relative to
    their spread.

    Parameters
    ----------
    x, y : array-like
        One-dimensional numeric sequences (lists, NumPy arrays, pandas
        Series, ...). Values are paired by position, so a Series with a
        non-default index is handled the same way as a plain array.

    Returns
    -------
    numpy.float64
        The correlation coefficient, or ``nan`` when either input has zero
        variance (the coefficient is undefined in that case).

    Raises
    ------
    ValueError
        If an input is not one-dimensional, the lengths differ, or the
        inputs are empty.
    """
    xs = np.asarray(x, dtype=float)
    ys = np.asarray(y, dtype=float)

    if xs.ndim != 1 or ys.ndim != 1:
        raise ValueError("pearson expects one-dimensional inputs")
    if xs.size != ys.size:
        raise ValueError(
            "pearson expects inputs of equal length, got %d and %d"
            % (xs.size, ys.size)
        )
    if xs.size == 0:
        raise ValueError("pearson needs at least one observation")

    dev_x = xs - xs.mean()
    dev_y = ys - ys.mean()

    # Take the two square roots separately so the product of the sums of
    # squares cannot overflow before the root is applied.
    denominator = np.sqrt(np.dot(dev_x, dev_x)) * np.sqrt(np.dot(dev_y, dev_y))
    if denominator == 0:
        # A constant input has no variance; report nan without dividing by zero.
        return np.float64(np.nan)

    return np.float64(np.dot(dev_x, dev_y) / denominator)

In [17]:
def ensemble(X_train, y_train, X_test):
    """Run the five base classifiers and return their individual predictions.

    No voting or averaging is done here: ``rforest``, ``knn``, ``lr``, ``nb``
    and ``svm`` are each called with the same three arguments and their
    results are returned as-is, so the caller decides how to combine them.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, forwarded to every base classifier.
    X_test
        Features to predict on, forwarded to every base classifier.

    Returns
    -------
    tuple
        ``(r_y, k_y, l_y, n_y, s_y)``: the results of ``rforest``, ``knn``,
        ``lr``, ``nb`` and ``svm``, in that order.
    """
    r_y = rforest(X_train, y_train, X_test)
    k_y = knn(X_train, y_train, X_test)
    l_y = lr(X_train, y_train, X_test)
    n_y = nb(X_train, y_train, X_test)
    s_y = svm(X_train, y_train, X_test)

    return (r_y, k_y, l_y, n_y, s_y)

In [18]:
# Fit the five base models once and keep each model's predictions for X_test.
(
    rf_pred,
    knn_pred,
    lr_pred,
    nb_pred,
    svm_pred,
) = ensemble(X_train, y_train, X_test)



In [22]:
# Ensemble 1: pairwise Pearson correlation between the RF, LR and KNN predictions
print(*(
    pearson(first_pred, second_pred)
    for first_pred, second_pred in (
        (rf_pred, lr_pred),   # RF and LR
        (lr_pred, knn_pred),  # LR and KNN
        (rf_pred, knn_pred),  # RF and KNN
    )
))

0.5930685983723457 0.6206311863239945 0.6704896273448653


In [20]:
# Ensemble 2: pairwise Pearson correlation between the RF, NB and KNN predictions
print(*(
    pearson(first_pred, second_pred)
    for first_pred, second_pred in (
        (rf_pred, nb_pred),   # RF and NB
        (nb_pred, knn_pred),  # NB and KNN
        (rf_pred, knn_pred),  # RF and KNN
    )
))

0.41173259165191306 0.4324767649063429 0.6704896273448653


In [23]:
# SVM against each other model: pairwise Pearson correlation of the predictions.
# These numbers are correlations between prediction vectors, not accuracies.
# In the output recorded for this cell, SVM/LR is the most strongly correlated
# pair (r ~= 0.844), i.e. those two models' predictions agree the most.
print(*(
    pearson(svm_pred, other_pred)
    for other_pred in (
        nb_pred,   # SVM and NB
        lr_pred,   # SVM and LR
        knn_pred,  # SVM and KNN
        rf_pred,   # SVM and RF
    )
))

0.46402116800760773 0.8438885603188111 0.6653449646056273 0.6023234590583493
