In [1]:
# quick peek for blending performance
import src.utilities as utils

import numpy as np
import torch
from pathlib import Path
from src.models.camerbert import Camembert
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import (accuracy_score, f1_score, matthews_corrcoef,
                             precision_score, recall_score, roc_auc_score)

configs = utils.read_config()
root = utils.get_project_root()


model_file = []
model = Camembert()
model.load_state_dict(torch.load(model_file))
model.to('cuda')

model_file = 
training = load_svmlight_file("../traindata_5.scl")
testing = load_svmlight_file("../testdata_5.scl")

In [2]:
X = training[0]
y = training[1]

X_test = testing[0]
y_test = testing[1]

In [3]:
def get_data(iii):
    data = load_svmlight_file("./traindata_"+str(iii)+".scl", zero_based=False)
    data1 = load_svmlight_file("./testdata_"+str(iii)+".scl", zero_based=False)
    return data[0].toarray(), data[1], data1[0].toarray(), data1[1]

In [4]:
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# use default parem to avoid overfitting
def get_models():
    models = list()
    models.append(('SVM', SVC()))
    models.append(('LGBM', LGBMClassifier()))
    models.append(('LR', LogisticRegression()))
    models.append(('RF', RandomForestClassifier()))
    models.append(('ET', ExtraTreesClassifier()))
    models.append(('NB' , GaussianNB()))
    return models


In [5]:
def fit_ensemble(models, X_train, X_val, y_train, y_val):
    meta = list()
    for _, model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        y_pred = y_pred.reshape(len(y_pred), 1)
        meta.append(y_pred)
        meta = np.hstack(meta)
        blender = LogisticRegression()
        blender.fit(meta, y_val)
        return blender

def predict_ensemble(models, blender, X_test):
    meta = list()
    for _, model in models:
        y_pred = model.predict(X_test)
        y_pred = y_pred.reshape(len(y_pred), 1)
        meta.append(y_pred)
        meta = np.hstack(meta)
        # predict
        return blender.predict(meta)
    
def predict_proba_ensemble(models, blender, X_test):
    meta = list()
    for _, model in models:
        y_pred = model.predict(X_test)
        y_pred = y_pred.reshape(len(y_pred), 1)
        meta.append(y_pred)
        meta = np.hstack(meta)
        return blender.predict_proba(meta)

In [6]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)    

print('Train: %s, Val: %s, Test: %s' % (X_train.shape, X_val.shape, X_test.shape))    

Train: (1256, 50), Val: (315, 50), Test: (393, 50)


In [7]:
models = get_models()
blender = fit_ensemble(models, X_train, X_val, y_train, y_val)
p = predict_ensemble(models, blender, X_test)

pr = predict_proba_ensemble(models, blender, X_test)

In [8]:
# test with testing set
ACC = accuracy_score(y_test, p)
SENS = precision_score(y_test, p, average='macro')
SPEC = recall_score(y_test,p, average='macro')
MCC = matthews_corrcoef(y_test,p)
AUC = roc_auc_score(y_test, pr, multi_class='ovo',average='macro')
#AUC = roc_auc_score(test_y,pr[:,1]) # for binary classification problem
F1 = 2*SENS*SPEC/(SENS+SPEC)

print("ACC:", ACC, ", MCC:" , MCC, " ,F1:", F1)

ACC: 0.6590330788804071 , MCC: 0.4469576005784089  ,F1: 0.6456052905059707
