In [237]:
import getpass
import hashlib
import json
import os
import pickle
import random
from datetime import datetime
from statistics import mean
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import yaml
from sklearn import preprocessing, svm, metrics, tree, decomposition, svm
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge, Perceptron, SGDClassifier, OrthogonalMatchingPursuit
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import plot_confusion_matrix, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.model_selection import train_test_split, ParameterGrid, KFold, StratifiedKFold
from sklearn.dummy import DummyClassifier


# Notebook-wide display settings: never truncate cell text, columns or rows
# when a DataFrame is rendered.
for _display_option in ('display.max_colwidth', 'display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)


# Silence every warning category for the rest of the session.
import warnings
warnings.filterwarnings(action='ignore')

In [239]:
# Changes to be made:
# 1. Add subgroups into model trainer class
# 2. "Functionize" model score saving

In [238]:
# Estimator prototypes keyed by the short codes that are also used as keys of
# the hyperparameter grids.  The same instance is reused (and re-parameterised
# via set_params) for every grid point of its code.
# NOTE(review): DC, ET, AB, GB and SGD are created without a random_state, so
# their results may differ between runs — confirm that this is intended.
# NOTE(review): 'LRR' maps to Ridge, which looks like a regressor among
# classifiers — confirm it is meant to be scored with classification metrics.
clf_fcns = dict(
    DC=DummyClassifier(strategy="uniform"),
    RF=RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=42),
    ET=ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
    AB=AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
    LR=LogisticRegression(random_state=42, solver='liblinear'),
    SVM=svm.SVC(kernel='linear', probability=True, random_state=0),
    GB=GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
    NB=GaussianNB(),
    DT=DecisionTreeClassifier(random_state=42),
    SGD=SGDClassifier(loss="hinge", penalty="l2"),
    KNN=KNeighborsClassifier(n_neighbors=3),
    LRR=Ridge(alpha=5.0, fit_intercept=True, random_state=42),
)

In [None]:
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, insert

# Connection settings are read from the environment so that no secret lives in
# the notebook.  The non-secret settings fall back to the values that used to be
# hard-coded here; the password has no fallback value and is prompted for when
# the POSTGRES_PASSWORD variable is not set.
# NOTE(security): the password that was previously committed in this cell
# should be considered leaked and rotated.
POSTGRES_ADDRESS = os.environ.get('POSTGRES_ADDRESS', 'cs003-ib0')
POSTGRES_PORT = os.environ.get('POSTGRES_PORT', '5432')
POSTGRES_USERNAME = os.environ.get('POSTGRES_USERNAME', 'isk273')
POSTGRES_PASSWORD = os.environ.get('POSTGRES_PASSWORD') or getpass.getpass('Postgres password: ')
POSTGRES_DBNAME = os.environ.get('POSTGRES_DBNAME', 'explainability_db')

# quote_plus escapes characters such as '@', ':' or '/' that would otherwise
# break the URL when they appear in the user name or password.
postgres_str = (
    f'postgresql://{quote_plus(POSTGRES_USERNAME)}:{quote_plus(POSTGRES_PASSWORD)}'
    f'@{POSTGRES_ADDRESS}:{POSTGRES_PORT}/{POSTGRES_DBNAME}'
)

# Create the connection
engine = create_engine(postgres_str, echo = True)

In [230]:
class Trainer:
    """Fit classifiers over a hyperparameter grid and log their scores.

    For every classifier in ``clfs``, every parameter combination of its grid,
    every seed in ``split_random_states`` and every entry of ``split_methods``
    the classifier is fitted and scored with a ``ModelEvaluator``.  One row per
    metric is inserted into the ``students`` table through ``engine``: first for
    the whole test set (``subgroup='all'``) and then for each value of every
    column listed in ``config['subgroups'][X_name]``
    (``subgroup='<column>=<value>'``).

    NOTE(review): ``students``, ``engine``, ``config``, ``ModelEvaluator`` and
    the preset names (``small_grid``, ``large_grid``, ``test_grid``,
    ``clf_list_all``, ``clf_list_test``) are module-level names looked up at
    call time.  ``students`` and the presets are not defined in the part of the
    notebook reviewed here — confirm they exist before these code paths run.

    Parameters
    ----------
    xid : identifier of the experiment run, stored with every row.
    X_name : dataset name; stored as ``dataset`` and used as the key into
        ``config['subgroups']``.
    X : feature DataFrame (indexed positionally with ``.iloc`` for K-fold).
    y : target Series, row-aligned with ``X``.
    clfs : ``'all'``, ``'test'`` or a mapping of classifier code -> estimator.
    hyperparameters : ``'small'``, ``'large'``, ``'test'`` or a mapping of
        classifier code -> parameter grid accepted by ``ParameterGrid``.
    split_methods : strings of the form ``'<function>@<n>'`` where function is
        ``train_test_split`` (n = test percentage) or ``KFold`` /
        ``StratifiedKFold`` (n = number of folds).
    split_random_states : seeds; every split method is run once per seed.
    """

    def __init__(
        self,
        xid,
        X_name,
        X,
        y,
        clfs,
        hyperparameters,
        split_methods,
        split_random_states
    ):
        self.xid = xid
        self.X_name = X_name
        self.X = X
        self.y = y
        self.split_random_states = split_random_states
        self.split_methods = split_methods
        self.set_clfs(clfs)
        self.set_hyperparemeters(hyperparameters)
        # Empty in-memory results frame; train_models writes to the database
        # and does not fill it.
        self.results_df = pd.DataFrame(columns=('xid','datetime','X_name','subgroup','split_method', 'split_seed', 'split_n', 'clf','params','metric','score'))

    def set_hyperparemeters(self, hyperparemeters):
        """Store the hyperparameter grids, resolving the preset names."""
        if hyperparemeters == 'small':
            self.hyperparemeters = small_grid
        elif hyperparemeters == 'large':
            self.hyperparemeters = large_grid
        elif hyperparemeters == 'test':
            self.hyperparemeters = test_grid
        else:
            self.hyperparemeters = hyperparemeters

    def set_clfs(self, clfs):
        """Store the classifier mapping, resolving the preset names."""
        if clfs == 'all':
            self.clfs = clf_list_all
        elif clfs =='test':
            self.clfs = clf_list_test
        else:
            self.clfs = clfs

    @staticmethod
    def _parse_split_method(split_method):
        """Split ``'<function>@<n>'`` into ``(function, int(n))``.

        Raises ValueError when the ``@<n>`` part is missing or not an integer.
        """
        function_name, separator, amount = split_method.partition('@')
        if not separator:
            raise ValueError('You must define a number of folds or percentage for the split functions.')
        return function_name, int(amount)

    def _iter_splits(self, split_function, n, seed):
        """Yield ``(split_n, X_train, X_test, y_train, y_test)`` for one split method.

        ``train_test_split`` yields a single split numbered 0; the K-fold
        splitters yield one split per fold, numbered from 1.  Any other
        function name yields nothing, so that split method is skipped.
        """
        if split_function == 'train_test_split':
            X_train, X_test, y_train, y_test = train_test_split(
                self.X, self.y, test_size=n/100, random_state=seed)
            yield 0, X_train, X_test, y_train, y_test
        elif split_function == "StratifiedKFold" or split_function == "KFold":
            # Looked up lazily and by explicit name instead of eval().
            splitter_cls = KFold if split_function == "KFold" else StratifiedKFold
            splitter = splitter_cls(n_splits=n, random_state=seed, shuffle=True)
            folds = splitter.split(self.X, self.y)
            for split_n, (train_index, test_index) in enumerate(folds, start=1):
                # The splitter returns positions, so both X and y need .iloc;
                # plain y[...] would be a label lookup on the CSV index.
                yield (split_n,
                       self.X.iloc[train_index], self.X.iloc[test_index],
                       self.y.iloc[train_index], self.y.iloc[test_index])

    def _record_scores(self, scores, subgroup, split_method, seed, split_n, clf_name, params):
        """Insert one row per metric of ``scores`` into the ``students`` table."""
        dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
        rows = [
            dict(xid=self.xid,
                 datetime=dt_string,
                 dataset=self.X_name,
                 subgroup=subgroup,
                 split_method=split_method,
                 split_random_state=seed,
                 split_n=split_n,
                 clf=clf_name,
                 params=params,
                 metric=metric,
                 score=score)
            for metric, score in scores.items()
        ]
        if not rows:
            return
        # One transaction and one multi-row insert per evaluation.
        with engine.begin() as connection:
            connection.execute(students.insert(), rows)

    def _evaluate_and_record(self, evaluator, clf, X_test, y_test,
                             split_method, seed, split_n, clf_name, params):
        """Score ``clf`` on the whole test set and on every configured subgroup."""
        evaluator.load(X_test=X_test, y_true=y_test, clf=clf)
        self._record_scores(evaluator.get_metrics(), 'all',
                            split_method, seed, split_n, clf_name, params)

        # Subgroup metrics
        for subgroup_var in config['subgroups'][self.X_name]:
            for subgroup_value in list(X_test[subgroup_var].unique()):
                # Positional boolean mask: X_test and y_test are row-aligned.
                in_subgroup = (X_test[subgroup_var] == subgroup_value).to_numpy()
                if not in_subgroup.any():
                    # e.g. NaN, which is listed by unique() but equals nothing.
                    continue
                subgroup_string = subgroup_var + '=' + str(subgroup_value)
                try:
                    evaluator.load(X_test=X_test[in_subgroup],
                                   y_true=y_test.loc[in_subgroup],
                                   clf=clf)
                    scores = evaluator.get_metrics()
                except ValueError as e:
                    # A small slice can be unscorable (e.g. only one class
                    # present); report it and keep the run going.
                    print('Error:', subgroup_string, e)
                    continue
                self._record_scores(scores, subgroup_string,
                                    split_method, seed, split_n, clf_name, params)

    def train_models(self):
        """Run the full classifier x grid x seed x split-method sweep.

        Raises ValueError for a split method without ``@<n>``.  An IndexError
        raised while processing one parameter combination is printed and that
        combination is abandoned.
        """
        # Initialize the evaluator
        evaluator = ModelEvaluator(metrics='test')

        for clf_name, clf in self.clfs.items():
            for params in ParameterGrid(self.hyperparemeters[clf_name]):
                try:
                    clf.set_params(**params)
                    for seed in self.split_random_states:
                        for split_method in self.split_methods:
                            split_function, n = self._parse_split_method(split_method)
                            for split_n, X_train, X_test, y_train, y_test in self._iter_splits(split_function, n, seed):
                                clf.fit(X_train, y_train)
                                self._evaluate_and_record(
                                    evaluator, clf, X_test, y_test,
                                    split_method, seed, split_n, clf_name, params)
                except IndexError as e:
                    print('Error:',e)
                    continue

class ModelEvaluator:
    """Compute a list of metrics for one fitted classifier on one test set.

    ``metrics`` is ``'small'`` / ``'test'`` (presets resolved to the
    module-level ``metric_list_small`` / ``metric_list_test``) or an iterable of
    metric strings.  A metric string is either a callable name such as
    ``'accuracy_score'``, called as ``name(y_true, y_pred)``, or
    ``'<name>@<k>'``, where predictions are re-derived by labelling the samples
    whose score lies above the ``(100 - k)``-th percentile as 1 and the rest
    as 0.

    Call ``load`` with a fitted classifier before ``get_metrics``.
    """

    def __init__(
        self,
        clf=None,
        X_test=None,
        y_true=None,
        metrics='test'
    ):
        self.clf = clf
        self.X_test = X_test
        self.y_true = y_true
        # Filled by load().
        self.y_pred = None
        self.y_score = None
        self.set_metrics(metrics)

    def load(self,X_test,y_true,clf):
        """Store the test data and classifier and cache its predictions/scores."""
        self.X_test = X_test
        self.y_true = y_true
        self.clf = clf
        self.y_pred = self.clf.predict(self.X_test)
        self.y_score = self._positive_class_scores()

    def _positive_class_scores(self):
        """Return per-sample scores used for ``@k`` metrics, or None.

        Uses column 1 of ``predict_proba`` when the classifier offers it and
        falls back to ``decision_function`` otherwise, so that estimators
        without probability output can still be loaded and scored.
        """
        if hasattr(self.clf, 'predict_proba'):
            return self.clf.predict_proba(self.X_test)[:,1]
        if hasattr(self.clf, 'decision_function'):
            return np.asarray(self.clf.decision_function(self.X_test))
        return None

    # Check for presets
    def set_metrics(self, metrics):
        """Store the metric list, resolving the preset names."""
        if metrics == 'small':
            self.metrics = metric_list_small
        elif metrics == 'test':
            self.metrics = metric_list_test
        else:
            self.metrics = metrics

    def _call_metric(self, metric, y_pred):
        """Evaluate the metric named ``metric`` on ``(self.y_true, y_pred)``."""
        # NOTE(security): eval() executes the metric string as Python code.
        # Metric names must come from trusted configuration only.
        return eval(metric + '(self.y_true,y_pred)')

    def metric_at_k(self, metric, k):
        """Return ``metric`` computed with the top-``k``-percent scores labelled 1.

        Raises AttributeError when the loaded classifier provides no scores.
        """
        if self.y_score is None:
            raise AttributeError(
                f"{type(self.clf).__name__} has neither predict_proba nor "
                f"decision_function; cannot compute '{metric}@{k}'")
        y_pred = np.where(self.y_score > np.percentile(self.y_score,(100-k)), 1, 0)
        return self._call_metric(metric, y_pred)

    def get_metrics(self):
        """Return ``{metric string: value}`` for every configured metric."""
        results = {}
        for metric in self.metrics:
            name, separator, k = metric.partition('@')
            if separator:
                results[metric] = self.metric_at_k(metric=name, k=int(k))
            else:
                results[metric] = self._call_metric(metric, self.y_pred)
        return results

In [232]:
# Read the experiment configuration.
with open('config.json', encoding='utf-8') as f:
    config = json.load(f)

# Fail early, with the offending names, when the configuration asks for a
# classifier code that has no estimator prototype.
unknown_clfs = [name for name in config['hyperparameters'] if name not in clf_fcns]
if unknown_clfs:
    raise KeyError(f"config.json lists classifiers that are missing from clf_fcns: {unknown_clfs}")

# Keep only the estimators that have a hyperparameter grid configured.
clfs = {}
for clf in config['hyperparameters']:
    clfs[clf] = clf_fcns[clf]

In [233]:
# Experiment id: MD5 hex digest of the configuration's string form followed by
# the current timestamp.
xid = hashlib.md5(
    f'{config}{datetime.now().strftime("%d/%m/%Y %H:%M:%S")}'.encode('utf-8')
).hexdigest()

'fe156ed0a50d3a2dbeb3b48bb76a2b2f'

In [235]:
# Train and log every configured dataset.  config['X'], config['y'] and
# config['X_name'] are read as parallel lists (same position = same dataset).
for i, X_path in enumerate(config['X']):
    X = pd.read_csv(X_path, index_col=0)
    # read_csv's squeeze=True is deprecated/removed in newer pandas; squeezing
    # the column axis afterwards gives the same Series for a one-column file.
    y = pd.read_csv(config['y'][i], index_col=0).squeeze('columns')
    trainer = Trainer(xid=xid,
                      X_name=config['X_name'][i],
                      X=X,
                      y=y,
                      clfs=clfs,
                      hyperparameters=config['hyperparameters'],
                      split_methods=config['split_methods'],
                      split_random_states=config['split_random_states'])
    trainer.train_models()