# Functions file
This is the file in which I have stored all my functions; it is run from the supervised_learning_assignment notebook using %run.

Author: Rients Dalstra <br>
All functions were written by me; however, during my bachelor thesis I also compared the performance of 5 different classifiers against each other, and I have reused some of the functions from that work.

In [10]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import scipy.stats
import eli5
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neural_network import MLPClassifier

pd.set_option('display.max_columns', 50)

In [11]:
# function created to split and scale the data.
def datasplit(X, y, size=0.8, scale=True):
    """Split the data into a train and a test set and optionally standardise the features.

    Parameters
    ----------
    X : feature matrix, passed straight to ``train_test_split``.
    y : target values, passed straight to ``train_test_split``.
    size : forwarded to ``train_test_split`` as ``train_size`` (default 0.8).
    scale : when truthy, the features of both splits are transformed by a
        ``StandardScaler`` that is fitted on the training split only.

    Returns
    -------
    tuple
        ``(Xtrain, Xtest, Ytrain, Ytest)``. When ``scale`` is truthy, ``Xtrain``
        and ``Xtest`` are the outputs of the scaler's transform.
    """
    # Fixed random_state: every call with the same inputs yields the same split.
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, train_size=size, random_state=42)

    if scale:
        # Fit on the training split only, then apply the same scaling to the test split.
        xScaler = StandardScaler()
        Xtrain = xScaler.fit_transform(Xtrain)
        Xtest = xScaler.transform(Xtest)

    return Xtrain, Xtest, Ytrain, Ytest

In [12]:
# Function used to perform and display permutation importance.
def permutation_importance(X, y, input_model):
    """Fit a model and display its eli5 permutation-importance table.

    Parameters
    ----------
    X : feature table; its ``.columns`` are used as the feature names.
    y : target values.
    input_model : a ``(model, params)`` pair; only the model is used here.

    Nothing is returned: the weights table is rendered through ``display``.
    """
    # Only the training part of the split is used in this function.
    train_X, _, train_y, _ = datasplit(X, y)

    estimator, _ = input_model
    estimator.fit(train_X, train_y)

    # The importances are computed on the same training split the model was fitted on.
    importance = eli5.sklearn.PermutationImportance(estimator, random_state=42)
    importance.fit(train_X, train_y)

    feature_names = X.columns.tolist()
    display(eli5.show_weights(importance, feature_names=feature_names))

In [13]:
# Function created to obtain some metrics for the dct model (built by dct_model()) to see if it makes sense to use permutation importance
def accuracy_scores_dct(X,y,k):
    """Fit the model returned by ``dct_model()`` and display its test-set metrics.

    Parameters
    ----------
    X : feature matrix, split and scaled by ``datasplit``.
    y : target values.
    k : label for the result column; the column is named ``'X' + str(k)``,
        so both strings and numbers are accepted.

    Nothing is returned: a one-column DataFrame with accuracy, precision,
    recall and F1 on the test split is rendered through ``display``.
    """
    Xtrain, Xtest, Ytrain, Ytest = datasplit(X, y)

    # dct_model() returns a (model, params) pair; the params are not needed here.
    model, _ = dct_model()
    model.fit(Xtrain, Ytrain)
    y_pred = model.predict(Xtest)

    score = {
        'X' + str(k): {
            # accuracy: (tp + tn) / (p + n)
            'Accuracy': metrics.accuracy_score(Ytest, y_pred),
            # precision: tp / (tp + fp)
            'Precision': metrics.precision_score(Ytest, y_pred),
            # recall: tp / (tp + fn)
            'Recall': metrics.recall_score(Ytest, y_pred),
            # f1: 2 tp / (2 tp + fp + fn)
            'F1': metrics.f1_score(Ytest, y_pred),
        }
    }
    display(pd.DataFrame.from_dict(score))

In [14]:
# Function to see which features perform best relative to each other.
def feature_importance(X, y, input_model):
    """Fit a model and plot its largest feature importances as a bar chart.

    Parameters
    ----------
    X : feature table; its ``.columns`` are used to label the importances.
    y : target values.
    input_model : a ``(model, params)`` pair; only the model is used, and it
        must expose ``feature_importances_`` after fitting.

    Nothing is returned: the chart is shown with ``plt.show()``.
    """
    train_X, _, train_y, _ = datasplit(X, y)

    estimator, _ = input_model
    estimator.fit(train_X, train_y)

    importances = pd.Series(estimator.feature_importances_, index=X.columns)
    # Keep only the (at most) 20 most important features for the plot.
    top_features = importances.nlargest(20)

    # Plot the chart of the best features.
    _, ax = plt.subplots(figsize=(15,5))
    top_features.plot(
        kind='bar',
        color=['green','blue'],
        edgecolor='black',
        linewidth=3,
        ax=ax,
    )
    ax.grid(visible=None, which='major', axis='y')
    ax.set_xlabel('Features')
    ax.set_ylabel('Importance')
    ax.set_title('The best Features')
    plt.show()

In [15]:
# RandomizedSearch
def train_RandomizedSearch(model, params, Xtrain, Ytrain, iter_nr=100, jobs=-1, scored='f1', random_state=42):
    """Run a ``RandomizedSearchCV`` over ``params`` and report its results.

    Parameters
    ----------
    model : estimator to tune.
    params : passed to ``RandomizedSearchCV`` as ``param_distributions``.
    Xtrain, Ytrain : data the search is fitted on.
    iter_nr : number of sampled parameter settings (``n_iter``).
    jobs : number of parallel jobs (``n_jobs``).
    scored : scoring metric (``scoring``).
    random_state : seed for the parameter sampling; defaults to 42, the same
        seed the rest of this file uses, so repeated runs give the same result.
        Pass ``None`` to get an unseeded search.

    Returns
    -------
    tuple
        ``(mean_test_score, best_params_, std_test_score)`` where the scores
        are the per-candidate arrays from ``cv_results_``.
    """
    randomizedsearch = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=iter_nr,
        n_jobs=jobs,
        scoring=scored,
        random_state=random_state,
    )
    randomizedsearch.fit(Xtrain, Ytrain)

    results = randomizedsearch.cv_results_
    return results["mean_test_score"], randomizedsearch.best_params_, results["std_test_score"]

In [19]:
# Function to obtain best hyperparameters
def hyperpara_check(X, y, input_model, iter_nr=100, scored='f1'):
    """Return the best hyperparameters found by a randomized search.

    Parameters
    ----------
    X, y : data; split by ``datasplit`` and only the training part is searched.
    input_model : a ``(model, params)`` pair handed to ``train_RandomizedSearch``.
    iter_nr : number of search iterations, forwarded as ``iter_nr``.
    scored : scoring metric, forwarded as ``scored``.

    Returns
    -------
    The ``best_params`` element of the ``train_RandomizedSearch`` result.
    """
    train_X, _, train_y, _ = datasplit(X, y)
    estimator, search_space = input_model

    # The search also returns mean and std test scores; only the best params are needed.
    _, best_params, _ = train_RandomizedSearch(
        estimator, search_space, train_X, train_y, iter_nr=iter_nr, scored=scored
    )
    return best_params

In [17]:
def _select_model(mod):
    """Return the ``(model, params)`` pair for a model key used by ``accuracy_scores``."""
    if mod == "rfc":
        return rfc_model()
    if mod == "gbc":
        return gbc_model()
    if mod == "mlpc":
        return mlpc_model()
    if mod == "lr":
        return lr_model()
    if mod == "svm":
        return svm_model()
    if mod == "dct":
        return dct_model()
    if mod == "kmeans":
        return kmeans_model()
    raise ValueError("Unknown model key: %r" % (mod,))


def accuracy_scores(X,y, models_list=1):
    """Fit several models, plot their precision-recall curves and return their metrics.

    Parameters
    ----------
    X, y : data; split (and scaled) once by ``datasplit``.
    models_list : ``1`` evaluates all seven model keys
        (rfc, svm, mlpc, gbc, lr, dct, kmeans); any other value evaluates
        only rfc, lr, svm and gbc.

    Returns
    -------
    pandas.DataFrame
        One column per model key, with rows Accuracy, Precision, Recall, F1,
        ROC AUC and Kappa, all computed on the test split.

    Notes
    -----
    Every model must implement ``predict_proba``. The average precision shown
    in the legend and the ROC AUC are computed from the predicted probability
    of the positive class (column 1), not from the hard class predictions.
    """
    if models_list == 1:
        model_lijst = ['rfc','svm','mlpc','gbc','lr','dct','kmeans']
    else:
        model_lijst = ['rfc','lr','svm','gbc']

    # datasplit uses a fixed random_state, so one split serves every model.
    Xtrain, Xtest, Ytrain, Ytest = datasplit(X, y)

    plt.figure(figsize=(10,5))
    score = {}
    for mod in model_lijst:
        model, _ = _select_model(mod)
        model.fit(Xtrain, Ytrain)

        # Hard class predictions for the threshold-based metrics ...
        y_pred = model.predict(Xtest)
        # ... and positive-class probabilities for the ranking-based ones.
        y_score = model.predict_proba(Xtest)[:, 1]

        precision_curve, recall_curve, _ = metrics.precision_recall_curve(Ytest, y_score)
        ap = metrics.average_precision_score(Ytest, y_score)

        # Precision on the x-axis and recall on the y-axis, matching the axis labels below.
        plt.plot(precision_curve, recall_curve, label='%s PR (area = %0.2f)' % (mod, ap))

        score[mod] = {
            # accuracy: (tp + tn) / (p + n)
            'Accuracy': metrics.accuracy_score(Ytest, y_pred),
            # precision: tp / (tp + fp)
            'Precision': metrics.precision_score(Ytest, y_pred),
            # recall: tp / (tp + fn)
            'Recall': metrics.recall_score(Ytest, y_pred),
            # f1: 2 tp / (2 tp + fp + fn)
            'F1': metrics.f1_score(Ytest, y_pred),
            # ROC AUC, computed from the positive-class probabilities
            'ROC AUC': metrics.roc_auc_score(Ytest, y_score),
            # Cohen's kappa
            'Kappa': metrics.cohen_kappa_score(Ytest, y_pred),
        }

    # Figure decoration is the same for every model, so it is done once after the loop.
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Precision')
    plt.ylabel('Recall')
    plt.title('Precision Recall Curve')
    plt.legend(loc="upper left")

    plt.show()
    return pd.DataFrame.from_dict(score)

In [18]:
def _select_model2(mod):
    """Return the ``(model, params)`` pair for a model key used by ``accuracy_scores2``."""
    if mod == "rfc":
        return rfc_model2()
    if mod == "gbc":
        return gbc_model2()
    if mod == "mlpc":
        return mlpc_model2()
    if mod == "lr":
        return lr_model2()
    if mod == "svm":
        return svm_model2()
    if mod == "dct":
        # The decision-tree key uses dct_model(), not a "2" variant.
        return dct_model()
    if mod == "kmeans":
        return kmeans_model2()
    raise ValueError("Unknown model key: %r" % (mod,))


def accuracy_scores2(X,y, models_list=1):
    """Fit the "2" model variants, plot their precision-recall curves and return their metrics.

    Parameters
    ----------
    X, y : data; split (and scaled) once by ``datasplit``.
    models_list : ``1`` evaluates all seven model keys
        (rfc, svm, mlpc, gbc, lr, dct, kmeans); any other value evaluates
        only rfc, lr, svm and gbc.

    Returns
    -------
    pandas.DataFrame
        One column per model key, with rows Accuracy, Precision, Recall, F1,
        ROC AUC and Kappa, all computed on the test split.

    Notes
    -----
    Every model must implement ``predict_proba``. The average precision shown
    in the legend and the ROC AUC are computed from the predicted probability
    of the positive class (column 1), not from the hard class predictions.
    """
    if models_list == 1:
        model_lijst = ['rfc','svm','mlpc','gbc','lr','dct','kmeans']
    else:
        model_lijst = ['rfc','lr','svm','gbc']

    # datasplit uses a fixed random_state, so one split serves every model.
    Xtrain, Xtest, Ytrain, Ytest = datasplit(X, y)

    plt.figure(figsize=(10,5))
    score = {}
    for mod in model_lijst:
        model, _ = _select_model2(mod)
        model.fit(Xtrain, Ytrain)

        # Hard class predictions for the threshold-based metrics ...
        y_pred = model.predict(Xtest)
        # ... and positive-class probabilities for the ranking-based ones.
        y_score = model.predict_proba(Xtest)[:, 1]

        precision_curve, recall_curve, _ = metrics.precision_recall_curve(Ytest, y_score)
        ap = metrics.average_precision_score(Ytest, y_score)

        # Precision on the x-axis and recall on the y-axis, matching the axis labels below.
        plt.plot(precision_curve, recall_curve, label='%s PR (area = %0.2f)' % (mod, ap))

        score[mod] = {
            # accuracy: (tp + tn) / (p + n)
            'Accuracy': metrics.accuracy_score(Ytest, y_pred),
            # precision: tp / (tp + fp)
            'Precision': metrics.precision_score(Ytest, y_pred),
            # recall: tp / (tp + fn)
            'Recall': metrics.recall_score(Ytest, y_pred),
            # f1: 2 tp / (2 tp + fp + fn)
            'F1': metrics.f1_score(Ytest, y_pred),
            # ROC AUC, computed from the positive-class probabilities
            'ROC AUC': metrics.roc_auc_score(Ytest, y_score),
            # Cohen's kappa
            'Kappa': metrics.cohen_kappa_score(Ytest, y_pred),
        }

    # Figure decoration is the same for every model, so it is done once after the loop.
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Precision')
    plt.ylabel('Recall')
    plt.title('Precision Recall Curve')
    plt.legend(loc="upper left")

    plt.show()
    return pd.DataFrame.from_dict(score)