In [2]:
#Importing all packages needed 
#1) Fundamental
import pandas as pd
import numpy as np

#2) Preprocessing 
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
import imblearn
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC

#3) Model 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier

#4) Validation  
from sklearn.model_selection import LeaveOneOut

#5) Performance metrics  
from sklearn.metrics import roc_auc_score, confusion_matrix,accuracy_score,roc_curve,auc

In [1]:
def scale_data(scaler,train,test):
    """Optionally scale ``train`` and ``test`` with a scaler fitted on ``train`` only.

    Parameters
    ----------
    scaler : None, transformer class, or transformer instance
        ``None`` leaves the data untouched. A class (for example
        ``MinMaxScaler``) is instantiated with its default arguments. An
        already configured instance is used as given and is re-fitted on
        ``train``. A usable scaler is anything exposing both
        ``fit_transform`` and ``transform``.
    train : array-like
        Data the scaler is fitted on and then transformed.
    test : array-like
        Data transformed with the parameters learned from ``train``, so no
        information from ``test`` leaks into the fit.

    Returns
    -------
    tuple
        ``(train, test)`` after scaling, or the unchanged inputs when no
        usable scaler was supplied.
    """
    # No scaler requested: hand the inputs back untouched.
    if scaler is None:
        return train, test

    # Objects that cannot fit/transform were previously skipped without any
    # signal. The pass-through is kept for backward compatibility, but the
    # caller is now told that no scaling happened.
    if not (hasattr(scaler, "fit_transform") and hasattr(scaler, "transform")):
        import warnings
        warnings.warn(
            f"scale_data: ignoring unsupported scaler {scaler!r}; data returned unscaled",
            stacklevel=2,
        )
        return train, test

    # A class is instantiated here; an instance is used exactly as configured.
    fitted_scaler = scaler() if isinstance(scaler, type) else scaler

    # Learn the scaling from the training data only...
    train = fitted_scaler.fit_transform(train)
    # ...and apply that same scaling to the held-out data.
    test = fitted_scaler.transform(test)
    return train, test

def impute_data(imputer,train,test):
    """Optionally fill missing values using an imputer fitted on ``train`` only.

    Parameters
    ----------
    imputer : None, imputer class, or imputer instance
        ``None`` leaves the data untouched. The ``KNNImputer`` class is built
        with ``n_neighbors=5`` and the ``SimpleImputer`` class with
        ``missing_values=np.nan, strategy='mean'``. Any other class is
        instantiated with its default arguments. An already configured
        instance is used as given and is re-fitted on ``train``. A usable
        imputer is anything exposing both ``fit_transform`` and ``transform``.
    train : array-like
        Data the imputer is fitted on and then transformed.
    test : array-like
        Data transformed with the statistics learned from ``train``.

    Returns
    -------
    tuple
        ``(train, test)`` after imputation, or the unchanged inputs when no
        usable imputer was supplied.
    """
    # No imputer requested: hand the inputs back untouched.
    if imputer is None:
        return train, test

    # Objects that cannot fit/transform were previously skipped without any
    # signal. The pass-through is kept for backward compatibility, but the
    # caller is now told that nothing was imputed.
    if not (hasattr(imputer, "fit_transform") and hasattr(imputer, "transform")):
        import warnings
        warnings.warn(
            f"impute_data: ignoring unsupported imputer {imputer!r}; data returned unimputed",
            stacklevel=2,
        )
        return train, test

    if isinstance(imputer, type):
        # Keep the explicit settings for the two imputers this notebook uses.
        if imputer == KNNImputer:
            fitted_imputer = KNNImputer(n_neighbors=5)
        elif imputer == SimpleImputer:
            fitted_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
        else:
            fitted_imputer = imputer()
    else:
        # A pre-configured instance is used exactly as the caller set it up.
        fitted_imputer = imputer

    # Fit on the training data only, then reuse the fit for the held-out data.
    train = fitted_imputer.fit_transform(train)
    test = fitted_imputer.transform(test)
    return train, test
    
def smote_data(smote_type,cat_indx,X_train, y_train):
    """Optionally oversample the training data with SMOTE or SMOTENC.

    Parameters
    ----------
    smote_type : None, SMOTE or SMOTENC
        ``None`` (or any value other than the two classes) returns the inputs
        unchanged. ``SMOTE`` resamples with ``random_state=9``; ``SMOTENC``
        resamples with ``random_state=0``.
    cat_indx
        Passed to ``SMOTENC`` as ``categorical_features``; unused otherwise.
    X_train, y_train
        Training features and labels to resample.

    Returns
    -------
    tuple
        ``(X_train, y_train)``, resampled when a supported type was given.
    """
    # Nothing requested: return the training data as received.
    if smote_type is None:
        return X_train, y_train

    if smote_type == SMOTE:
        sampler = SMOTE(random_state=9)
    elif smote_type == SMOTENC:
        # SMOTENC needs to know which columns are categorical.
        sampler = SMOTENC(categorical_features=cat_indx, random_state=0)
    else:
        # Unrecognised request: data is passed through untouched.
        return X_train, y_train

    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    return X_resampled, y_resampled

def data_prep(data_model):
    """Split a DataFrame into a feature matrix and integer-encoded ``biodiag`` labels.

    The ``biodiag`` column is encoded with ``pd.factorize``, so each distinct
    label gets an integer code in order of first appearance. The input frame
    is left unmodified, so the original labels stay available and the
    function can be re-run on the same frame.

    Parameters
    ----------
    data_model : pandas.DataFrame
        Must contain a ``biodiag`` column; every other column is a feature.

    Returns
    -------
    X : numpy.ndarray
        Feature values, one column per non-``biodiag`` column.
    y : numpy.ndarray
        Integer label codes as a column vector of shape ``(n_rows, 1)``.
    coulmnNames : list
        Feature column names, in the same order as the columns of ``X``.
    """
    # Encode the labels without writing them back into the caller's frame.
    codes, _ = pd.factorize(data_model['biodiag'])
    # Kept as an (n, 1) column so callers see the same shape as before.
    y = np.asarray(codes).reshape(-1, 1)

    features = data_model.loc[:, data_model.columns != "biodiag"]
    coulmnNames = list(features.columns)
    X = np.array(features)
    return X, y, coulmnNames

def sentivity_specificity(y_true,y_pred):
    """Return ``(sensitivity, specificity)`` for a set of predictions.

    The flattened confusion matrix is unpacked into exactly four counts
    (true negative, false positive, false negative, true positive), so the
    labels must yield a 2x2 matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    tuple
        ``(sensitivity, specificity)`` where sensitivity is ``tp / (tp + fn)``
        and specificity is ``tn / (tn + fp)``.
    """
    # Flatten the confusion matrix into its four cells.
    cells = confusion_matrix(y_true, y_pred).ravel()
    true_neg, false_pos, false_neg, true_pos = cells

    # Sensitivity: share of actual positives that were predicted positive.
    sensitivity = true_pos / (true_pos + false_neg)
    # Specificity: share of actual negatives that were predicted negative.
    specificity = true_neg / (true_neg + false_pos)
    return sensitivity, specificity


def print_importance(featureImportance,coulmnNames):
    """Print a feature/importance table sorted from highest to lowest importance.

    Parameters
    ----------
    featureImportance : array-like
        One importance score per feature.
    coulmnNames : list
        Feature names, in the same order as ``featureImportance``.

    Returns
    -------
    None
        The table is written to standard output.
    """
    table = pd.DataFrame(featureImportance, columns=['importance'])
    # Put the names in front so the printed columns read feature, importance.
    table.insert(0, "feature", coulmnNames)
    ranked = table.sort_values(by=['importance'], ascending=False)
    print(ranked)
    

def _loo_predict(X, y, model, scaler, imputer, collect_importance=False):
    """Run leave-one-out cross-validation and collect the held-out predictions.

    For every split the training rows are scaled and then imputed via
    ``scale_data`` / ``impute_data`` (both fitted on the training rows only),
    ``model`` is re-fitted, and the single held-out row is predicted.

    Returns ``(y_true, y_pred, importances)`` as lists with one entry per
    sample; ``importances`` stays empty unless ``collect_importance`` is true.
    """
    # data_prep returns the labels as an (n, 1) column. Flatten that case so
    # the estimator receives 1-D targets and y_true holds plain scalars
    # rather than 1-element arrays.
    y = np.asarray(y)
    if y.ndim == 2 and y.shape[1] == 1:
        y = y.ravel()

    y_true, y_pred, importances = [], [], []
    for train_ix, test_ix in LeaveOneOut().split(X):
        # Exactly one sample is held out per iteration.
        X_train, X_test = X[train_ix, :], X[test_ix, :]
        y_train, y_test = y[train_ix], y[test_ix]
        # Scaling and imputation are skipped when the respective argument is None.
        X_train, X_test = scale_data(scaler, X_train, X_test)
        X_train, X_test = impute_data(imputer, X_train, X_test)
        # The same estimator object is re-fitted on every fold.
        model.fit(X_train, y_train)
        yhat = model.predict(X_test)
        y_true.append(y_test[0])
        y_pred.append(yhat[0])
        if collect_importance:
            importances.append(model.feature_importances_)
    return y_true, y_pred, importances


def _binary_metrics(y_true, y_pred):
    """Return ``(accuracy, roc_auc, sensitivity, specificity)`` for hard predictions."""
    acc = accuracy_score(y_true, y_pred)
    # NOTE: the ROC curve is built from predicted class labels, not from
    # scores or probabilities, with label 1 treated as the positive class.
    fpr, tpr, _ = roc_curve(y_true, y_pred, pos_label=1)
    roc_auc = auc(fpr, tpr)
    sensitivity, specificity = sentivity_specificity(y_true, y_pred)
    return acc, roc_auc, sensitivity, specificity


def featureimportance(X,y,model,scaler,imputer):
    """Leave-one-out evaluation that also averages the model's feature importances.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix; rows are selected with ``X[rows, :]``.
    y : array-like
        Labels, either 1-D or a single column of shape ``(n, 1)``.
    model
        Estimator providing ``fit``, ``predict`` and, after fitting,
        ``feature_importances_``.
    scaler
        Forwarded to ``scale_data``; ``None`` disables scaling.
    imputer
        Forwarded to ``impute_data``; ``None`` disables imputation.

    Returns
    -------
    tuple
        ``(acc, roc_auc, importance, sensitivity, specificity)`` where
        ``importance`` is the mean of the per-fold importances.

    Raises
    ------
    AttributeError
        If the fitted ``model`` has no ``feature_importances_`` attribute.
    """
    y_true, y_pred, importances = _loo_predict(
        X, y, model, scaler, imputer, collect_importance=True
    )
    acc, roc_auc, sensitivity, specificity = _binary_metrics(y_true, y_pred)
    # Average the per-fold importance vectors feature by feature.
    importance = np.mean(importances, axis=0)
    return acc, roc_auc, importance, sensitivity, specificity


def LOOCV(X,y,model,scaler,imputer):
    """Leave-one-out evaluation of ``model``.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix; rows are selected with ``X[rows, :]``.
    y : array-like
        Labels, either 1-D or a single column of shape ``(n, 1)``.
    model
        Estimator providing ``fit`` and ``predict``.
    scaler
        Forwarded to ``scale_data``; ``None`` disables scaling.
    imputer
        Forwarded to ``impute_data``; ``None`` disables imputation.

    Returns
    -------
    tuple
        ``(acc, roc_auc, sensitivity, specificity)``.
    """
    y_true, y_pred, _ = _loo_predict(X, y, model, scaler, imputer)
    return _binary_metrics(y_true, y_pred)




