In [1]:
import numpy as np
import pandas as pd
import time
import matplotlib.pylab as plt
import seaborn as sns
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, StratifiedKFold, cross_validate
from sklearn.metrics import precision_recall_fscore_support, make_scorer, recall_score, f1_score, confusion_matrix, precision_score, balanced_accuracy_score
from sklearn.metrics import fbeta_score, classification_report


from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from libs.print_cm import print_cm
from libs.loader import load_malicious, load_jobchange
         

from sklearn.model_selection import learning_curve, validation_curve

In [2]:
# Human-readable dataset names keyed by dataset id; plot() looks these up
# via ds_labels[ds_id] when it builds a figure title.
ds_labels = {
    1: 'dataset-01: Malicious Hack',
    2: 'dataset-02: Job Change',
}

# Load the two datasets. Each result is later unpacked as `X, y` inside
# eval_hyperparam(), so both loaders are expected to return a 2-element pair.
job_data = load_jobchange()
mal_data = load_malicious()


In [10]:


# Dataset 1 (malicious hack): validation curves are scored on recall.
scorer_mal = make_scorer(recall_score)
scorer = scorer_mal  # second module-level name bound to the same scorer object
adaboost_mal = AdaBoostClassifier(random_state=0)

# Dataset 2 (job change): validation curves are scored on F1.
scorer_job = make_scorer(f1_score)
adaboost_job = AdaBoostClassifier(random_state=0)




## Hyperparameter evaluation and plotting helpers

In [8]:
def eval_hyperparam(estimator, data, param_name='nothing', range_list=None, range_override=None, scorer=None):
    """Cross-validate ``estimator`` across candidate values of one hyperparameter.

    Calls scikit-learn's ``validation_curve`` with a shuffled 5-fold
    ``StratifiedKFold`` (``random_state=0``) and averages the per-fold train
    and validation scores for every candidate value.

    Parameters
    ----------
    estimator : estimator object
        Forwarded unchanged to ``validation_curve``.
    data : tuple
        Two-element pair unpacked as ``X, y``.
    param_name : str
        Name of the hyperparameter that is varied.
    range_list : sequence, optional
        Candidate values passed as ``param_range``. ``None`` is treated as an
        empty list.
    range_override : sequence, optional
        If non-empty, used for the ``'x'`` column of the result instead of
        ``range_list`` (e.g. to plot against different x values). Must have the
        same length as ``range_list``.
    scorer : callable or str, optional
        Forwarded to ``validation_curve`` as ``scoring``.

    Returns
    -------
    pandas.DataFrame
        One row per candidate value with columns ``'x'``, ``'train'`` (mean
        training score) and ``'test'`` (mean validation score).

    Raises
    ------
    ValueError
        If ``range_override`` is non-empty and its length differs from
        ``range_list``.
    """
    # None sentinels replace the former mutable list defaults.
    range_list = [] if range_list is None else range_list
    range_override = [] if range_override is None else range_override

    # Fail fast: a mismatched override would otherwise only surface when the
    # DataFrame is assembled, after the whole cross-validation has already run.
    if len(range_override) > 0 and len(range_override) != len(range_list):
        raise ValueError(
            "range_override has %d values but range_list has %d; lengths must match"
            % (len(range_override), len(range_list)))

    strat_cv = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
    X, y = data

    # n_jobs=-1: evaluate the folds/values in parallel on all available cores.
    train_scores, test_scores = validation_curve(
        estimator, X, y, param_name=param_name, param_range=range_list,
        scoring=scorer, cv=strat_cv, n_jobs=-1)

    # Scores come back as one row per candidate value and one column per fold;
    # axis=1 therefore averages over the folds.
    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    x_vals = range_override if len(range_override) > 0 else range_list

    return pd.DataFrame({'x': x_vals, 'train': train_scores_mean, 'test': test_scores_mean})
    

In [7]:
def plot(df_1, metric_name, x_label, title, ds_id, figsize = [7,5], show=False, log_x=False):
    """Draw train/validation curves for one hyperparameter and save them as a PNG.

    Parameters
    ----------
    df_1 : pandas.DataFrame
        Must provide the columns ``'x'``, ``'train'`` and ``'test'``.
    metric_name : str
        Metric name shown in the y-axis label.
    x_label : str
        X-axis label; also embedded in the output file name.
    title : str
        Hyperparameter name shown in the figure title.
    ds_id : int
        Key into the module-level ``ds_labels`` dict; also embedded in the
        output file name.
    figsize : sequence of two numbers
        Figure size forwarded to ``plt.figure``.
    show : bool
        If true, display the figure with ``plt.show()`` after it was saved.
    log_x : bool
        If true, use a logarithmic x axis.

    Side effects
    ------------
    Writes ``plots/boost_<ds_id>_hyper_<x_label>.png`` (creating ``plots/`` if
    needed) and closes the figure afterwards.

    Raises
    ------
    KeyError
        If ``ds_id`` is not a key of ``ds_labels``.
    """
    import os  # local import: only needed to make sure the output folder exists

    fig = plt.figure(figsize=figsize, facecolor='w')
    axs = fig.add_subplot(111)
    axs.plot(df_1['x'], df_1['train'], label='Train', linewidth=0.75)
    axs.plot(df_1['x'], df_1['test'], label='Validation', linewidth=0.75)
    axs.legend()
    axs.set_ylabel("Metric: %s"%metric_name)
    axs.set_xlabel(x_label)
    title_out = "AdaBoost Hyperparameter:\n%s\n%s"%(title, ds_labels[ds_id])
    axs.set_title(title_out)

    # Apply the axis scale before anything is rendered so that the displayed
    # figure and the saved file agree.
    if log_x:
        axs.set_xscale('log')

    # savefig does not create missing directories.
    os.makedirs('plots', exist_ok=True)
    fig.savefig('plots/boost_%d_hyper_%s.png'%(ds_id,x_label),bbox_inches='tight')

    # Show only after saving; the file is already on disk by the time the
    # backend displays (and possibly tears down) the figure.
    if show:
        plt.show()
    plt.close(fig=fig)

## Hyperparameter: number of weak learners (n_estimators)


In [16]:


# Candidate n_estimators values: evenly spaced points cast to int32
# (the cast truncates the fractional part).
range_list_mal = np.linspace(start=1, stop=175, num=25).astype(np.int32)
range_list_job = np.linspace(start=1, stop=3000, num=15).astype(np.int32)
range_list_job

array([   1,  215,  429,  643,  857, 1072, 1286, 1500, 1714, 1928, 2143,
       2357, 2571, 2785, 3000], dtype=int32)

In [17]:
# Validation curves over n_estimators: recall for the malicious dataset,
# F1 for the job-change dataset.
df_1 = eval_hyperparam(
    adaboost_mal,
    mal_data,
    param_name='n_estimators',
    range_list=range_list_mal,
    scorer=scorer_mal,
)
df_2 = eval_hyperparam(
    adaboost_job,
    job_data,
    param_name='n_estimators',
    range_list=range_list_job,
    scorer=scorer_job,
)
print("done")

done


In [19]:
# Save the n_estimators validation curves for both datasets; the title text
# is identical for the two figures.
plot(df_1, 'Recall', 'weak_learners', 'Weak Learners', 1, show=False)
plot(df_2, 'F1 Score', 'weak_learners', 'Weak Learners', 2, show=False)

In [18]:
## Learning rate hyperparameter

In [20]:
# Candidate learning_rate values: evenly spaced floats, endpoints included.
range_learning_rate_mal = np.linspace(start=0.1, stop=4.5, num=30)
range_learning_rate_job = np.linspace(start=0.01, stop=10, num=20)

In [21]:
# Validation curves over learning_rate: recall for the malicious dataset,
# F1 for the job-change dataset.
df_3 = eval_hyperparam(
    adaboost_mal,
    mal_data,
    param_name='learning_rate',
    range_list=range_learning_rate_mal,
    scorer=scorer_mal,
)
df_4 = eval_hyperparam(
    adaboost_job,
    job_data,
    param_name='learning_rate',
    range_list=range_learning_rate_job,
    scorer=scorer_job,
)
print("done")

done


In [22]:
# Save the learning_rate validation curves for both datasets.
plot(df_3, metric_name='Recall', x_label='learning_rate',
     title='Learning Rate', ds_id=1, show=False)
plot(df_4, metric_name='F1 Score', x_label='learning_rate',
     title='Learning Rate', ds_id=2, show=False)