# Dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, svm, metrics, tree, decomposition, svm
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge, Perceptron, SGDClassifier, OrthogonalMatchingPursuit
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import plot_confusion_matrix, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.model_selection import train_test_split, ParameterGrid, KFold, StratifiedKFold
from sklearn.dummy import DummyClassifier

import yaml
import pickle
from statistics import mean
import random


pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


import warnings
warnings.filterwarnings('ignore')

## Input

In [2]:
X_por = pd.read_csv("matrices/por_X.csv",index_col=0)
y_por = pd.read_csv("matrices/por_y.csv",index_col=0,squeeze = True)

In [3]:
meta = {'name':"", # dataset name
        'X': "", # X matrix (features)
        'y': "", # y matrix (target)
        'eval_metrics': ['all'], # list of evaluation metrics
        'fairness_attributes': [{'gender': 'female'}], # format [{attribute: protected_group}]
        'fairness_metrics': ['all'], # list of fairness metrics
        'clfs': ['all'], # e.g. [{'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1)}],
        'hyperparemters': ['small'], # e.g. [{'LR': {'penalty': ['l1','l2'], 'C': [0.00001,0.001,0.1,1,10]}}]
        'db': None, # Database object
        'random_states': [42]
       }

In [4]:
clf_list_test = {
    #'DC': DummyClassifier(),
    #'RF': RandomForestClassifier(n_jobs=-1),
    'LR': LogisticRegression(solver='liblinear'),
    'DT': DecisionTreeClassifier(),
        }

test_grid = {
    'DC': {'strategy': ["most_frequent","uniform"]},
    'RF':{'n_estimators': [1,10], 'max_depth': [1,5], 'max_features': ['sqrt','log2'], 'n_jobs': [-1]},
    'LR': { 'penalty': ['l1','l2'], 'C': [0.1, 1, 10]},
    'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,2,5], 'max_features': ['sqrt','log2'],},
           }

metric_list_test = ['roc_auc_score','accuracy_score','accuracy_score@25','precision_score@10']

In [272]:
# Class Trainer, PerformanceEvaluator, FairnessEvaluator?


'''
Issues:
1. Random state(s) input is awkward
2. Add validation_method which allows the user to pick simple w/ percentages, kfold, stratifiedkfold, repeated kfold
'''
class Trainer:
    def __init__(
        self,
        X,
        y,
        clfs,
        hyperparameters,
        split_methods,
        split_random_states        
    ):
        self.X = X
        self.y = y
        self.split_random_states = split_random_states
        self.split_methods = split_methods
        self.set_clfs(clfs)
        self.set_hyperparemeters(hyperparameters)
        self.results_df = pd.DataFrame(columns=('split_method', 'split_seed', 'split_n', 'clf','params','metric','score'))
    
    # Check for presets
    def set_hyperparemeters(self, hyperparemeters):
        if hyperparemeters == 'small':
            self.hyperparemeters = small_grid
        elif hyperparemeters == 'large':
            self.hyperparemeters = large_grid
        elif hyperparemeters == 'test':
            self.hyperparemeters = test_grid
        else:
            self.hyperparemeters = hyperparemeters
    
    # Check for presets
    def set_clfs(self, clfs):
        if clfs == 'all':
            self.clfs = clf_list_all
        elif clfs =='test':
            self.clfs = clf_list_test
        else:
            self.clfs = clfs
            
    def train_models(self):
        
        # Evaluator?
        # Initialize the evaluator
        evaluator = ModelEvaluator(metrics='test')
        
        models_to_run = list(self.clfs.keys())
        grid = self.hyperparemeters
        for index,clf in enumerate([self.clfs[x] for x in models_to_run]):
            parameter_values = grid[models_to_run[index]]
            #i = 0
            for p in ParameterGrid(parameter_values):
                #i+=1
                #print(f"Training {models_to_run[index]}: {i} / {len(ParameterGrid(parameter_values))}")
                try:
                    clf.set_params(**p)
                    for r in self.split_random_states:
                        for split_method in self.split_methods:
                            if split_method.find('@') < 0:
                                raise ValueError('You must define a number of folds or percentage for the split functions.')
                            else:
                                split_function = split_method[0:split_method.find('@')]
                                n = int(split_method[split_method.find('@')+1:])
                                
                            if split_function == 'train_test_split':
                                X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=n/100, random_state=r)
                                clf.fit(X_train, y_train)
                                
                                evaluator.load(X_test=X_test, y_true=y_test, clf=clf)
                                scores = evaluator.get_metrics()

                                for score in scores:
                                    #Save an entry or each metric
                                    self.results_df.loc[len(self.results_df)] = [split_method, r, split_counter, models_to_run[index],p,score,scores[score]]                   
                                
                            elif split_function=="StratifiedKFold" or split_function=="KFold":
                                
                                kf = eval(f'{split_function}(n_splits={n}, random_state={r}, shuffle=True)')
                                split_counter = 0
                                for train_index, test_index in kf.split(self.X,self.y):
                                    split_counter += 1
                                    X_train, X_test = self.X.iloc[train_index], self.X.iloc[test_index]
                                    y_train, y_test = self.y[train_index], self.y[test_index]
                                    clf.fit(X_train, y_train)

                                    # Here we run the model evalutor and save the stats
                                    evaluator.load(X_test=X_test, y_true=y_test, clf=clf)
                                    scores = evaluator.get_metrics()

                                    for score in scores:
                                        #Save an entry or each metric
                                        self.results_df.loc[len(self.results_df)] = [split_method, r, split_counter, models_to_run[index],p,score,scores[score]]                   
    
                except IndexError as e:
                    print('Error:',e)
                    continue

'''

'''
class ModelEvaluator:
    def __init__(
        self,
        clf=None,
        X_test=None,
        y_true=None,
        metrics='test'
    ):
        self.clf = clf
        self.X_test = X_test
        self.y_true = y_true
        self.set_metrics(metrics)
    
    def load(self,X_test,y_true,clf):
        self.X_test = X_test
        self.y_true = y_true
        self.clf = clf
        self.y_pred = self.clf.predict(self.X_test)
        self.y_score = self.clf.predict_proba(self.X_test)[:,1]

    # Check for presets
    def set_metrics(self, metrics):
        if metrics == 'small':
            self.metrics = metric_list_small
        elif metrics == 'test':
            self.metrics = metric_list_test
        else:
            self.metrics = metrics
            
    def metric_at_k(self, metric, k):
        y_pred = np.where(self.y_score > np.percentile(self.y_score,(100-k)), 1, 0)
        s = eval(metric + '(self.y_true,y_pred)')
        return s

    def get_metrics(self):
        results = {}
        for metric in self.metrics:
            if metric.find('@') > -1:
                m = metric[0:metric.find('@')]
                k = int(metric[metric.find('@')+1:])
                s = self.metric_at_k(metric=m,k=k)
            else:
                s = eval(metric + '(self.y_true,self.y_pred)')
            results[metric] = s
        return results

In [275]:
trainer = Trainer(X=X_por,y=y_por,clfs='test',hyperparameters='test',split_methods=['StratifiedKFold@5','StratifiedKFold@10','train_test_split@25'],split_random_states=[42])
trainer.train_models()

In [276]:
trainer.results_df

Unnamed: 0,split_method,split_seed,split_n,clf,params,metric,score
0,StratifiedKFold@5,42,1,LR,"{'C': 0.1, 'penalty': 'l1'}",roc_auc_score,0.709856
1,StratifiedKFold@5,42,1,LR,"{'C': 0.1, 'penalty': 'l1'}",accuracy_score,0.851675
2,StratifiedKFold@5,42,1,LR,"{'C': 0.1, 'penalty': 'l1'}",accuracy_score@25,0.866029
3,StratifiedKFold@5,42,1,LR,"{'C': 0.1, 'penalty': 'l1'}",precision_score@10,0.809524
4,StratifiedKFold@5,42,2,LR,"{'C': 0.1, 'penalty': 'l1'}",roc_auc_score,0.745532
5,StratifiedKFold@5,42,2,LR,"{'C': 0.1, 'penalty': 'l1'}",accuracy_score,0.870813
6,StratifiedKFold@5,42,2,LR,"{'C': 0.1, 'penalty': 'l1'}",accuracy_score@25,0.827751
7,StratifiedKFold@5,42,2,LR,"{'C': 0.1, 'penalty': 'l1'}",precision_score@10,0.904762
8,StratifiedKFold@5,42,3,LR,"{'C': 0.1, 'penalty': 'l1'}",roc_auc_score,0.815484
9,StratifiedKFold@5,42,3,LR,"{'C': 0.1, 'penalty': 'l1'}",accuracy_score,0.894737


In [None]:
# Functions for getting out the average scores

# Imports

In [5]:
X = pd.read_csv("matrices/por_X.csv",index_col=0)
X.head()

Unnamed: 0,absences,failures,grade1,traveltime,studytime,famrel,freetime,goout,Dalc,Walc,health,activities,nursery,internet,romantic,school_gp,address_urban,famsize_gt3,Pstatus_togethor,sex_female,health_mjob,other_mjob,services_mjob,teacher_mjob,health_fjob,other_fjob,services_fjob,teacher_fjob,home_reason,other_reason,reputation_reason,mother_guardian,other_guardian
0,12,0,15,1,1,3,2,1,1,4,5,0,1,1,0,1,1,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0
1,2,0,17,1,2,5,3,4,1,3,3,1,0,1,0,1,1,1,1,1,0,1,0,0,0,1,0,0,0,0,0,1,0
2,0,2,9,2,1,4,4,5,3,5,5,0,0,1,0,1,1,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,0
3,4,0,14,2,2,4,4,3,1,1,3,1,1,1,0,0,1,1,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0
4,2,0,12,2,1,4,3,2,1,4,5,0,1,0,0,1,1,1,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0


In [6]:
y = pd.read_csv("matrices/por_y.csv",index_col=0,squeeze = True)
y.head()

0    0
1    0
2    0
3    0
4    0
Name: G3, dtype: int64

# Training Grid

In [7]:
clfs = {
    'DC': DummyClassifier(strategy="uniform"),
    'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
    'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
    'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
    'LR': LogisticRegression(penalty='l1', C=1e5,solver='liblinear'),
    'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
    'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
    'NB': GaussianNB(),
    'DT': DecisionTreeClassifier(),
    'SGD': SGDClassifier(loss="hinge", penalty="l2"),
    'KNN': KNeighborsClassifier(n_neighbors=3),
    'LRR': Ridge(alpha=5.0,fit_intercept=True)
        }

large_grid = {
    'DC': {'strategy': ["stratified","most_frequent","prior","uniform"]},
    'RF':{'n_estimators': [1,10,100,1000,10000], 'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10], 'n_jobs': [-1]},
    'LR': { 'penalty': ['l1','l2'], 'C': [0.00001,0.0001,0.001,0.01,0.1,1,10]},
    'SGD': { 'loss': ['hinge','log','perceptron'], 'penalty': ['l2','l1','elasticnet']},
    'ET': { 'n_estimators': [1,10,100,1000,10000], 'criterion' : ['gini', 'entropy'] ,'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10], 'n_jobs': [-1]},
    'AB': { 'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000,10000]},
    'GB': {'n_estimators': [1,10,100,1000,10000], 'learning_rate' : [0.001,0.01,0.05,0.1,0.5],'subsample' : [0.1,0.5,1.0], 'max_depth': [1,3,5,10,20,50,100]},
    'NB' : {},
    'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100],'min_samples_split': [2,5,10]},
    'SVM' :{'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear']},
    'KNN' :{'n_neighbors': [1,5,10,25,50,100],'weights': ['uniform','distance'],'algorithm': ['auto','ball_tree','kd_tree']},
    'LRR': {'alpha': [0.01,0.1,1.0,10.0,100.0,1000.0,10000.0], 'fit_intercept':[True,False]}
           }

small_grid = {
    'DC': {'stragey': ["stratified","most_frequent","prior","uniform"]},
    'RF':{'n_estimators': [10,100], 'max_depth': [5,50], 'max_features': ['sqrt','log2'],'min_samples_split': [2,10], 'n_jobs': [-1]},
    'LR': { 'penalty': ['l1','l2'], 'C': [0.00001,0.001,0.1,1,10]},
    'SGD': { 'loss': ['hinge','log','perceptron'], 'penalty': ['l2','l1','elasticnet']},
    'ET': { 'n_estimators': [10,100], 'criterion' : ['gini', 'entropy'] ,'max_depth': [5,50], 'max_features': ['sqrt','log2'],'min_samples_split': [2,10], 'n_jobs': [-1]},
    'AB': { 'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000,10000]},
    'GB': {'n_estimators': [10,100], 'learning_rate' : [0.001,0.1,0.5],'subsample' : [0.1,0.5,1.0], 'max_depth': [5,50]},
    'NB' : {},
    'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100],'min_samples_split': [2,5,10]},
    'SVM' :{'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear']},
    'KNN' :{'n_neighbors': [1,5,10,25,50,100],'weights': ['uniform','distance'],'algorithm': ['auto','ball_tree','kd_tree']},
    'LRR': {'alpha': [0.01,0.1,1.0,10.0,100.0,1000.0,10000.0], 'fit_intercept':[True,False]}       
        }

In [8]:
large_grid = {
    'DC': {'strategy': ["stratified","most_frequent","prior","uniform"]},
    'RF':{'n_estimators': [1,10,100,1000,10000], 'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10], 'n_jobs': [-1]},
    'LR': { 'penalty': ['l1','l2'], 'C': [0.00001,0.0001,0.001,0.01,0.1,1,10], 'class_weight': [None, 'balanced']},
    'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,2,5],'min_samples_split': [2,5,10], 'max_features': ['sqrt','log2'],},
    'LRR': {'alpha': [0.01,0.1,1.0,10.0,100.0,1000.0,10000.0], 'fit_intercept':[True,False], 'normalize': [True,False]},
    'XGB': {'learning_rate' : [.05, .1, .15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.96, 1.0],'n_estimators': [1,10,100,1000],'max_depth': [2,3,5,7,9,15], 'reg_alpha': [0.01,0.1,1.0,10.0,100.0,1000.0,10000.0]}
           }

# Helper functions

In [9]:
def precision_at_k(y_true,y_pred_proba,k):
    y_pred = np.where(y_pred_proba > np.percentile(y_pred_proba,(100-k)), 1, 0)
    return precision_score(y_true,y_pred)

# Baseline

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
random_clf =  [random.uniform(0, 1) for i in range(0,len(y_test))]
precision_at_k(y_test,random_clf,25)

0.22857142857142856

# Magic Loops

In [None]:
class pipeline:
    

In [11]:
def magic_loop(models_to_run,random_states,grid):
    results_df =  pd.DataFrame(columns=('model_type', 'parameters', 'metric'))
    for index,clf in enumerate([clfs[x] for x in models_to_run]):
        parameter_values = grid[models_to_run[index]]
        i = 0
        for p in ParameterGrid(parameter_values):
            i+=1
            print(f"Training {models_to_run[index]}: {i} / {len(ParameterGrid(parameter_values))}")
            scores = []
            try:
                clf.set_params(**p)
                for r in random_states:
                    skf = StratifiedKFold(n_splits=5, random_state=r, shuffle=True)
                    for train_index, test_index in skf.split(X,y):
                        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                        y_train, y_test = y[train_index], y[test_index]
                        if models_to_run[index] in ('LRR'):
                            y_pred_proba = clf.fit(X_train, y_train).predict(X_test)
                        elif models_to_run[index] in ('LR','RF','DC'):
                            y_pred_proba = clf.fit(X_train, y_train).predict_proba(X_test)[:,1]
                        scores.append(precision_at_k(y_test,y_pred_proba,25))
                results_df.loc[len(results_df)] = [models_to_run[index], p,
                                                   mean(scores)]
            except IndexError as e:
                print('Error:',e)
                continue
    return results_df

In [13]:
results_df = magic_loop(models_to_run = ['RF','LR'],random_states=[42, 333, 729],grid=small_grid)

Training RF: 1 / 16
Training RF: 2 / 16
Training RF: 3 / 16
Training RF: 4 / 16
Training RF: 5 / 16
Training RF: 6 / 16
Training RF: 7 / 16
Training RF: 8 / 16
Training RF: 9 / 16
Training RF: 10 / 16
Training RF: 11 / 16
Training RF: 12 / 16
Training RF: 13 / 16
Training RF: 14 / 16
Training RF: 15 / 16
Training RF: 16 / 16
Training LR: 1 / 10
Training LR: 2 / 10
Training LR: 3 / 10
Training LR: 4 / 10
Training LR: 5 / 10
Training LR: 6 / 10
Training LR: 7 / 10
Training LR: 8 / 10
Training LR: 9 / 10
Training LR: 10 / 10


In [134]:
results_df

NameError: name 'results_df' is not defined

In [135]:
parameters = results_df[results_df['metric'] == results_df.metric.max()].iloc[0]['parameters']
results_df[results_df['metric'] == results_df.metric.max()]

NameError: name 'results_df' is not defined

# Save model files

## Retrain model

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
k=25

In [19]:
rf = RandomForestClassifier(**parameters).fit(X_train, y_train)

NameError: name 'parameters' is not defined

In [17]:
filename = 'results/rf.sav'
pickle.dump(rf, open(filename, 'wb'))
 
# some time later...
 
# load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))
# result = loaded_model.score(X_test, Y_test)

NameError: name 'rf' is not defined

In [41]:
#Hyperparemeter YAML

#rf_hyperparameters = {'max_depth': 5, 'max_features': 'log2', 'min_samples_split': 2, 'n_estimators': 100, 'n_jobs': -1}

#with open(r'rf_hyperparameters.yaml', 'w') as file:
#    documents = yaml.dump(rf_hyperparameters, file)

# for reading

#with open(r'E:\data\fruits.yaml') as file:
    # The FullLoader parameter handles the conversion from YAML
    # scalar values to Python the dictionary format
#    fruits_list = yaml.load(file, Loader=yaml.FullLoader)