In [1]:
import re
import pickle
import time
import joblib
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import *
from sklearn.metrics import *
from sklearn.ensemble import *
from sklearn.datasets import load_iris, load_breast_cancer
from lightgbm import LGBMClassifier


In [2]:
# Load scikit-learn's breast-cancer dataset as a (features, target) pair.
X, y = load_breast_cancer(return_X_y=True)

In [3]:
# Dimensions of the feature matrix (rows, columns).
X.shape

(569, 30)

In [4]:
# Dimensions of the target vector; its length should match the row count of X.
y.shape

(569,)

In [5]:

def build_model(X, y, model, save=False) :
    """Fit ``model`` on a stratified train split and report train/validation metrics.

    The data is split 75/25 with ``train_test_split`` (``random_state=64``,
    stratified on ``y``). The model is fitted on the training part, and
    accuracy, F1, recall and precision are printed for both parts.

    Parameters :
    X : A Pandas DataFrame or a numpy array which has the predictors
    y : A Pandas Series or a numpy array which is the target class
    model : sklearn-style estimator instance exposing ``fit`` and ``predict``
    save : A boolean parameter to whether save a model or not
        Default = False. When truthy, the fitted model is written with
        ``joblib.dump`` to ``<ModelClassName>.joblib`` in the current
        working directory (an existing file of that name is overwritten).

    Returns :
    Trained Model (the same object that was passed in, fitted in place)

    Notes :
    f1/recall/precision are called with their default arguments, which
    presumably restricts this helper to binary targets -- TODO confirm
    before using it on multiclass data.
    """

    # time.clock() was removed in Python 3.8; perf_counter() is the documented
    # replacement for measuring elapsed time.
    start_time = time.perf_counter()
    print("Splitting the data")
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=64, stratify=y)
    print("shapes of the data after splitting")
    print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)
    print("Training")
    model.fit(X_train, y_train)
    pred_train = model.predict(X_train)
    pred_val = model.predict(X_val)
    print("Training accuracy: ", accuracy_score(y_train, pred_train))
    print("Validation accuracy: ", accuracy_score(y_val, pred_val))
    print("------------------------")
    print("Training f1: ", f1_score(y_train, pred_train))
    print("Validation f1: ", f1_score(y_val, pred_val))
    print("------------------------")
    print("Training Recall", recall_score(y_train, pred_train))
    print("Validation Recall", recall_score(y_val, pred_val))
    print("-------------------------")
    print("Training Precision", precision_score(y_train, pred_train))
    print("Validation Precision", precision_score(y_val, pred_val))
    if save :
        # joblib.dump needs both the object and a target path; the file name
        # is derived from the estimator's class name.
        joblib.dump(model, type(model).__name__ + ".joblib")
    print("Finished building model in " + str(round((time.perf_counter()-start_time)/60, 2)) + " minutes")
    return model

def hyperparam_tuning(X_, y_, clf, params, search, k=5, scoring = 'f1', save=False, lgbm=False, feat_imp=False,
                     n_imp = 25, feat_imp_model = 'same') :
    """Run a cross-validated hyper-parameter search, then refit and evaluate the best estimator.

    The search uses a shuffled ``StratifiedKFold`` (``random_state=64``) on the
    full ``X_``/``y_``. The best estimator is then passed to ``build_model``,
    which performs its own train/validation split and prints metrics.

    Parameters :
    X_ : A Pandas DataFrame or a numpy array which has the predictors
    y_ : A Pandas Series or a numpy array which is the target class
    clf : sklearn-style estimator to tune
    params : A dictionary with keys as parameters of model and values as list of different values that can be given
    search : String object. If 'grid', performs GridSearchCV, else RandomizedSearchCV
    k : int, number of cross-validation folds
    scoring : str which says which score to optimize for (used by both search types)
    save : A boolean which asks whether to save the model. Default = False
        NOTE: only forwarded to ``build_model`` for the feature-importance
        refit (``feat_imp=True``); the plain best-model fit is never saved.
    lgbm : A boolean which asks whether model is LGBM. Default = False.
        When True and ``X_`` is a DataFrame, characters outside
        ``[A-Za-z0-9_]`` are stripped from the column names.
    feat_imp : A boolean; when True the best model's top features are selected
        and a model is refitted on that subset
    n_imp : int, Number of features taken for feat imp
    feat_imp_model : 'same' to reuse the tuned model for the refit on the
        selected features, otherwise the estimator instance to use instead

    Returns :
    Trained Model (the feature-subset model when ``feat_imp`` is True,
    otherwise the tuned best model)

    Notes :
    ``feat_imp=True`` relies on a ``get_top_n_features(model, X, n)`` helper
    that is not defined in the visible part of this notebook; it is assumed
    to return ``(importances, selector)`` where ``selector`` can index
    ``X_`` -- TODO confirm against its definition.
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the documented
    # replacement for measuring elapsed time.
    start_time = time.perf_counter()
    kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=64)
    if lgbm and isinstance(X_, pd.DataFrame) :
        # Presumably needed because LightGBM rejects feature names containing
        # special characters -- keep only letters, digits and underscores.
        X_ = X_.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
    if search == 'grid' :
        grid_search = GridSearchCV(clf, params, scoring=scoring, cv=kfold)
    else :
        # Honour the caller's `scoring` (previously hard-coded to 'f1') and
        # seed the sampler so the random search is reproducible.
        grid_search = RandomizedSearchCV(clf, params, scoring=scoring, cv=kfold, random_state=64)
    print("Starting Grid Search")
    print("---------------------")
    grid_result = grid_search.fit(X_, y_)
    print("Finished Grid Search in " + str(round((time.perf_counter()-start_time)/60, 2)) + " minutes")
    print("---------------------")
    print("Best Model :")
    print("---------------------")
    print(grid_result.best_estimator_)
    print("Best Score :", grid_result.best_score_)
    best_clf = grid_result.best_estimator_
    print("---------------------")
    print("Training and Validating with Best model")
    print("---------------------")
    model_ = build_model(X_, y_, model=best_clf)
    if not feat_imp :
        return model_

    # Select the top features from the data that was actually passed in
    # (X_/y_), not from notebook-level globals.
    _importances, top_n_feats = get_top_n_features(model_, X_, n_imp)
    X_n_feats = X_[top_n_feats]
    if feat_imp_model != 'same' :
        model_ = feat_imp_model
    model_n_feats = build_model(X_n_feats, y_, model_, save)
    return model_n_feats

In [6]:
# Search space for the random forest: tree depth and ensemble size.
params = dict(
    max_depth=[2, 3, 5, 6, 10],
    n_estimators=[10, 20, 50, 100, 300, 500],
)
clf = RandomForestClassifier(class_weight='balanced', random_state=64)
# Any `search` value other than 'grid' selects the randomized search.
rf_best = hyperparam_tuning(
    X, y, clf=clf, params=params, search='random', scoring='f1', lgbm=False
)



Starting Grid Search
---------------------


  from ipykernel import kernelapp as app


Finished Grid Search in 0.24 minutes
---------------------
Best Model :
---------------------
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=64, verbose=0,
                       warm_start=False)
Best Score : 0.9708588589851604
---------------------
Training and Validating with Best model
---------------------
Splitting the data
shapes of the data after splitting
(426, 30) (143, 30) (426,) (143,)
Training
Training accuracy:  1.0
Validation accuracy:  0.965034965034965
------------------------
Training f1:  1.0
Validation f1:  0.9726775956284154
--

