# Trainning, hypertunning and evaluating models

**Import** data from "data/featured.csv"

**Export** model to "model/v01"

In [2]:
# Setup

import pandas as pd
import numpy as np

# Modeling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

from pkg import utils
from pkg import feature as feat

In [3]:
# Importing data

path = utils.get_parents() + r"/data/featured.csv"

df = pd.read_csv(path, index_col=0)
df.head()

Unnamed: 0,bayley_3_t1,12DICHLORETHDEG-PWY,AEROBACTINSYN-PWY,ALLANTOINDEG-PWY,CRNFORCAT-PWY,DENITRIFICATION-PWY,DHGLUCONATE-PYR-CAT-PWY,DTDPRHAMSYN-PWY,METH-ACETATE-PWY,P108-PWY,...,delivery_mode,chaos_tot_t1,epds_2c_t1,bisq_3_mins_t1,bisq_4_mins_t1,bisq_9_mins_t1,bisq_sleep_prob_t1,ebia_tot_t1,educationLevelAhmedNum_t1,a10_t1
7,100.0,0.0,0.0,7.676483,0.0,0.0,0.0,4395.178167,0.0,171.940413,...,1.0,4.0,0.0,480.0,540.0,239.0,1.0,0.0,16.0,3.0
8,75.0,0.0,0.0,14.984719,0.0,320.494755,0.0,4478.753445,0.0,43.380158,...,1.0,1.0,0.0,570.0,240.0,60.0,0.0,0.0,20.0,1.0
14,95.0,0.0,209.593817,0.0,0.0,0.0,0.0,2755.525229,0.0,0.0,...,1.0,2.0,0.0,720.0,90.0,30.0,0.0,0.0,16.0,1.0
24,115.0,0.0,0.0,0.0,0.0,26.374059,0.0,2725.705501,0.0,50.36835,...,3.0,0.0,0.0,600.0,360.0,30.0,0.0,2.0,16.0,5.0
26,105.0,0.0,0.0,0.0,0.0,0.0,0.0,725.879153,0.0,0.0,...,1.0,7.0,0.0,420.0,480.0,120.0,1.0,0.0,12.0,2.0


In [4]:
# Parameters

# Taking value at quantile q
q = 0.2
seed = 123

# If true, all values less than quantile will return 1, others 0
less_than = True

In [5]:
runned1 = False
y_index = 0
if not runned1:
    df = feat.threshold_mapping(df, quantile=q, y_index=y_index, less_than=less_than)


print("Total number of trues : ", df.iloc[:,y_index].sum())

Total number of trues :  21


In [6]:
y = df.iloc[:,0]
X = df.iloc[:,1:]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [8]:
rf = RandomForestClassifier(max_depth=10, criterion='gini', random_state=seed)
rf.fit(X_train, y_train)

In [9]:
y_pred = rf.predict(X_test)

In [10]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.84


In [18]:
# Hyperparameter Tuning

param_dist = {'random_state': [seed], 'criterion':["gini"], 'max_depth':randint(1,50)}

rf = RandomForestClassifier()

rand_search = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=5, cv=5)

rand_search.fit(X_train, y_train)

In [19]:
# Evaluating

best_rf = rand_search.best_estimator_
print('Best hyperparameters:', rand_search.best_params_)


y_pred = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

Best hyperparameters: {'criterion': 'gini', 'max_depth': 1, 'random_state': 123}
Accuracy:  0.84


In [21]:
# Exporting data

model = best_rf # Add final df
filename = "v01.pkl"

run_save = False
if not run_save:
  utils.save_model(model, filename)
  run_save = True

FileNotFoundError: [Errno 2] No such file or directory: 'c:\\Users\\ptons\\Code\\repositories\\brainwise\\model\\v01.pkl'