# Dependencies

In [3]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, svm, metrics, tree, decomposition, svm
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge, Perceptron, SGDClassifier, OrthogonalMatchingPursuit
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import plot_confusion_matrix, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.model_selection import train_test_split, ParameterGrid, KFold, StratifiedKFold
from sklearn.dummy import DummyClassifier

import yaml
import pickle
from statistics import mean
import random


pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


import warnings
warnings.filterwarnings('ignore')

# Imports

In [6]:
X = pd.read_csv("matrices/por_X.csv",index_col=0)
X.head()

Unnamed: 0,absences,failures,grade1,traveltime,studytime,famrel,freetime,goout,Dalc,Walc,health,activities,nursery,internet,romantic,school_gp,address_urban,famsize_gt3,Pstatus_togethor,sex_female,health_mjob,other_mjob,services_mjob,teacher_mjob,health_fjob,other_fjob,services_fjob,teacher_fjob,home_reason,other_reason,reputation_reason,mother_guardian,other_guardian
0,12,0,15,1,1,3,2,1,1,4,5,0,1,1,0,1,1,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0
1,2,0,17,1,2,5,3,4,1,3,3,1,0,1,0,1,1,1,1,1,0,1,0,0,0,1,0,0,0,0,0,1,0
2,0,2,9,2,1,4,4,5,3,5,5,0,0,1,0,1,1,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,0
3,4,0,14,2,2,4,4,3,1,1,3,1,1,1,0,0,1,1,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0
4,2,0,12,2,1,4,3,2,1,4,5,0,1,0,0,1,1,1,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0


In [7]:
y = pd.read_csv("matrices/por_y.csv",index_col=0,squeeze = True)
y.head()

0    0
1    0
2    0
3    0
4    0
Name: G3, dtype: int64

# Training Grid

In [8]:
clfs = {
    'DC': DummyClassifier(strategy="uniform"),
    'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
    'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
    'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
    'LR': LogisticRegression(penalty='l1', C=1e5,solver='liblinear'),
    'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
    'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
    'NB': GaussianNB(),
    'DT': DecisionTreeClassifier(),
    'SGD': SGDClassifier(loss="hinge", penalty="l2"),
    'KNN': KNeighborsClassifier(n_neighbors=3),
    'LRR': Ridge(alpha=5.0,fit_intercept=True)
        }

large_grid = {
    'DC': {'strategy': ["stratified","most_frequent","prior","uniform"]},
    'RF':{'n_estimators': [1,10,100,1000,10000], 'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10], 'n_jobs': [-1]},
    'LR': { 'penalty': ['l1','l2'], 'C': [0.00001,0.0001,0.001,0.01,0.1,1,10]},
    'SGD': { 'loss': ['hinge','log','perceptron'], 'penalty': ['l2','l1','elasticnet']},
    'ET': { 'n_estimators': [1,10,100,1000,10000], 'criterion' : ['gini', 'entropy'] ,'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10], 'n_jobs': [-1]},
    'AB': { 'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000,10000]},
    'GB': {'n_estimators': [1,10,100,1000,10000], 'learning_rate' : [0.001,0.01,0.05,0.1,0.5],'subsample' : [0.1,0.5,1.0], 'max_depth': [1,3,5,10,20,50,100]},
    'NB' : {},
    'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100],'min_samples_split': [2,5,10]},
    'SVM' :{'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear']},
    'KNN' :{'n_neighbors': [1,5,10,25,50,100],'weights': ['uniform','distance'],'algorithm': ['auto','ball_tree','kd_tree']},
    'LRR': {'alpha': [0.01,0.1,1.0,10.0,100.0,1000.0,10000.0], 'fit_intercept':[True,False]}
           }

small_grid = {
    'DC': {'stragey': ["stratified","most_frequent","prior","uniform"]},
    'RF':{'n_estimators': [10,100], 'max_depth': [5,50], 'max_features': ['sqrt','log2'],'min_samples_split': [2,10], 'n_jobs': [-1]},
    'LR': { 'penalty': ['l1','l2'], 'C': [0.00001,0.001,0.1,1,10]},
    'SGD': { 'loss': ['hinge','log','perceptron'], 'penalty': ['l2','l1','elasticnet']},
    'ET': { 'n_estimators': [10,100], 'criterion' : ['gini', 'entropy'] ,'max_depth': [5,50], 'max_features': ['sqrt','log2'],'min_samples_split': [2,10], 'n_jobs': [-1]},
    'AB': { 'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000,10000]},
    'GB': {'n_estimators': [10,100], 'learning_rate' : [0.001,0.1,0.5],'subsample' : [0.1,0.5,1.0], 'max_depth': [5,50]},
    'NB' : {},
    'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100],'min_samples_split': [2,5,10]},
    'SVM' :{'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear']},
    'KNN' :{'n_neighbors': [1,5,10,25,50,100],'weights': ['uniform','distance'],'algorithm': ['auto','ball_tree','kd_tree']},
    'LRR': {'alpha': [0.01,0.1,1.0,10.0,100.0,1000.0,10000.0], 'fit_intercept':[True,False]}       
        }

# Helper functions

In [9]:
def precision_at_k(y_true,y_pred_proba,k):
    y_pred = np.where(y_pred_proba > np.percentile(y_pred_proba,(100-k)), 1, 0)
    return precision_score(y_true,y_pred)

# Baseline

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
random_clf =  [random.uniform(0, 1) for i in range(0,len(y_test))]
precision_at_k(y_test,random_clf,25)

0.22857142857142856

# Magic Loops

In [11]:
def magic_loop(models_to_run,random_states,grid):
    results_df =  pd.DataFrame(columns=('model_type', 'parameters', 'metric'))
    for index,clf in enumerate([clfs[x] for x in models_to_run]):
        parameter_values = grid[models_to_run[index]]
        i = 0
        for p in ParameterGrid(parameter_values):
            i+=1
            print(f"Training {models_to_run[index]}: {i} / {len(ParameterGrid(parameter_values))}")
            scores = []
            try:
                clf.set_params(**p)
                for r in random_states:
                    skf = StratifiedKFold(n_splits=5, random_state=r, shuffle=True)
                    for train_index, test_index in skf.split(X,y):
                        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                        y_train, y_test = y[train_index], y[test_index]
                        if models_to_run[index] in ('LRR'):
                            y_pred_proba = clf.fit(X_train, y_train).predict(X_test)
                        elif models_to_run[index] in ('LR','RF','DC'):
                            y_pred_proba = clf.fit(X_train, y_train).predict_proba(X_test)[:,1]
                        scores.append(precision_at_k(y_test,y_pred_proba,25))
                results_df.loc[len(results_df)] = [models_to_run[index], p,
                                                   mean(scores)]
            except IndexError as e:
                print('Error:',e)
                continue
    return results_df

In [13]:
results_df = magic_loop(models_to_run = ['RF','LR'],random_states=[42, 333, 729],grid=small_grid)

Training RF: 1 / 16
Training RF: 2 / 16
Training RF: 3 / 16
Training RF: 4 / 16
Training RF: 5 / 16
Training RF: 6 / 16
Training RF: 7 / 16
Training RF: 8 / 16
Training RF: 9 / 16
Training RF: 10 / 16
Training RF: 11 / 16
Training RF: 12 / 16
Training RF: 13 / 16
Training RF: 14 / 16
Training RF: 15 / 16
Training RF: 16 / 16
Training LR: 1 / 10
Training LR: 2 / 10
Training LR: 3 / 10
Training LR: 4 / 10
Training LR: 5 / 10
Training LR: 6 / 10
Training LR: 7 / 10
Training LR: 8 / 10
Training LR: 9 / 10
Training LR: 10 / 10


In [14]:
results_df

Unnamed: 0,model_type,parameters,metric
0,RF,"{'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 10, 'n_jobs': -1}",0.647436
1,RF,"{'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 100, 'n_jobs': -1}",0.689744
2,RF,"{'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 10, 'n_estimators': 10, 'n_jobs': -1}",0.661538
3,RF,"{'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 10, 'n_estimators': 100, 'n_jobs': -1}",0.691026
4,RF,"{'max_depth': 5, 'max_features': 'log2', 'min_samples_split': 2, 'n_estimators': 10, 'n_jobs': -1}",0.658974
5,RF,"{'max_depth': 5, 'max_features': 'log2', 'min_samples_split': 2, 'n_estimators': 100, 'n_jobs': -1}",0.687179
6,RF,"{'max_depth': 5, 'max_features': 'log2', 'min_samples_split': 10, 'n_estimators': 10, 'n_jobs': -1}",0.662821
7,RF,"{'max_depth': 5, 'max_features': 'log2', 'min_samples_split': 10, 'n_estimators': 100, 'n_jobs': -1}",0.689744
8,RF,"{'max_depth': 50, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 10, 'n_jobs': -1}",0.658017
9,RF,"{'max_depth': 50, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 100, 'n_jobs': -1}",0.667949


In [20]:
parameters = results_df[results_df['metric'] == results_df.metric.max()].iloc[0]['parameters']
results_df[results_df['metric'] == results_df.metric.max()]

Unnamed: 0,model_type,parameters,metric
19,LR,"{'C': 0.001, 'penalty': 'l2'}",0.901878


# Save model files

## Retrain model

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
k=25

In [19]:
rf = RandomForestClassifier(**parameters).fit(X_train, y_train)

NameError: name 'parameters' is not defined

In [17]:
filename = 'results/rf.sav'
pickle.dump(rf, open(filename, 'wb'))
 
# some time later...
 
# load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))
# result = loaded_model.score(X_test, Y_test)

NameError: name 'rf' is not defined

In [41]:
#Hyperparemeter YAML

#rf_hyperparameters = {'max_depth': 5, 'max_features': 'log2', 'min_samples_split': 2, 'n_estimators': 100, 'n_jobs': -1}

#with open(r'rf_hyperparameters.yaml', 'w') as file:
#    documents = yaml.dump(rf_hyperparameters, file)

# for reading

#with open(r'E:\data\fruits.yaml') as file:
    # The FullLoader parameter handles the conversion from YAML
    # scalar values to Python the dictionary format
#    fruits_list = yaml.load(file, Loader=yaml.FullLoader)