In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
import pickle

In [2]:
main_df = pd.read_csv("preprocessed_files_from_db/preprocessed_data.csv")

In [3]:
main_df

Unnamed: 0,qty_slash_url,length_url,qty_dot_domain,qty_dot_directory,qty_hyphen_directory,qty_underline_directory,qty_slash_directory,qty_questionmark_directory,qty_and_directory,qty_space_directory,...,qty_plus_params,qty_asterisk_params,qty_dollar_params,qty_percent_params,qty_params,asn_ip,time_domain_activation,ttl_hostname,cluster_labels,labels
0,1,25,2,1,0,0,1,0,0,0,...,-1,-1,-1,-1,-1,60781,-1,892,2,1
1,3,223,2,3,0,0,3,0,0,0,...,0,0,0,0,3,36024,579,9540,2,1
2,1,15,2,0,0,0,1,0,0,0,...,-1,-1,-1,-1,-1,4766,-1,589,0,0
3,5,81,2,2,0,2,5,0,0,0,...,-1,-1,-1,-1,-1,20454,-1,292,0,1
4,0,19,2,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,53831,6998,3597,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88642,0,23,3,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,8560,5509,3597,0,0
88643,0,34,2,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,26496,5046,591,0,0
88644,5,70,1,1,1,0,5,0,0,0,...,-1,-1,-1,-1,-1,394695,1844,14391,1,1
88645,1,28,2,0,0,0,1,0,0,0,...,-1,-1,-1,-1,-1,47583,-1,52,2,1


In [4]:
clusters = main_df.cluster_labels.unique()

In [5]:
clusters

array([2, 0, 1])

In [6]:
cluster_dic = {}
for cluster in clusters: 
    cluster_dic["df_"+str(cluster)] = pd.DataFrame(main_df[main_df.cluster_labels==cluster])

In [7]:
cluster_dic["df_0"].drop("cluster_labels", inplace=True, axis=1)

In [8]:
cluster_dic["df_1"].drop("cluster_labels", inplace=True, axis=1)

In [9]:
cluster_dic["df_2"].drop("cluster_labels", inplace=True, axis=1)

In [10]:
with open("objects/selected_cols.obj", 'rb') as f: 
    selected_cols = pickle.load(f)

In [17]:
models_params_dict = {
    "rf_classifier": {
        "object": RandomForestClassifier(), 
        "params": {
            "criterion": ["gini", "entropy"],
            "n_estimators": [100, 150, 200],
            "max_depth": [20, 40, 60], 
            "oob_score": [True],
            "random_state": [32],
            
        }
    }, 
    "gb_classifier": {
        "object": GradientBoostingClassifier(), 
        "params": {
            "loss": ["exponential"],
            "n_estimators": [100, 150, 200],
            "criterion": ["friedman_mse", "squared_error"], 
            "max_depth": [3, 6, 9], 
            "random_state": [32], 
            
        }
    }
}

In [20]:
result = {
    
}

for cluster_no, cluster_df in cluster_dic.items(): 
   
    for classifier in models_params_dict: 
        print(classifier)
        grid_search_cv = GridSearchCV(models_params_dict[classifier].get("object"), models_params_dict[classifier].get("params"), n_jobs=15)
        grid_search_cv.fit(cluster_df[(selected_cols)], cluster_df.labels)
        if not result.get(cluster_no): 
            result[cluster_no] = {}
        
        if not result.get(cluster_no).get(classifier):  
            result[cluster_no][classifier] = {}
            
        result[cluster_no][classifier]["best_params"] = grid_search_cv.best_params_
        result[cluster_no][classifier]["best_score"] = grid_search_cv.best_score_
        print(result)

rf_classifier
{'df_2': {'rf_classifier': {'best_params': {'criterion': 'entropy', 'max_depth': 40, 'n_estimators': 200, 'oob_score': True, 'random_state': 32}, 'best_score': 0.9703979754033891}}}
gb_classifier
{'df_2': {'rf_classifier': {'best_params': {'criterion': 'entropy', 'max_depth': 40, 'n_estimators': 200, 'oob_score': True, 'random_state': 32}, 'best_score': 0.9703979754033891}, 'gb_classifier': {'best_params': {'criterion': 'squared_error', 'loss': 'exponential', 'max_depth': 9, 'n_estimators': 200, 'random_state': 32}, 'best_score': 0.9732282055473099}}}
rf_classifier
{'df_2': {'rf_classifier': {'best_params': {'criterion': 'entropy', 'max_depth': 40, 'n_estimators': 200, 'oob_score': True, 'random_state': 32}, 'best_score': 0.9703979754033891}, 'gb_classifier': {'best_params': {'criterion': 'squared_error', 'loss': 'exponential', 'max_depth': 9, 'n_estimators': 200, 'random_state': 32}, 'best_score': 0.9732282055473099}}, 'df_0': {'rf_classifier': {'best_params': {'criterio

In [21]:
result

{'df_2': {'rf_classifier': {'best_params': {'criterion': 'entropy',
    'max_depth': 40,
    'n_estimators': 200,
    'oob_score': True,
    'random_state': 32},
   'best_score': 0.9703979754033891},
  'gb_classifier': {'best_params': {'criterion': 'squared_error',
    'loss': 'exponential',
    'max_depth': 9,
    'n_estimators': 200,
    'random_state': 32},
   'best_score': 0.9732282055473099}},
 'df_0': {'rf_classifier': {'best_params': {'criterion': 'gini',
    'max_depth': 60,
    'n_estimators': 200,
    'oob_score': True,
    'random_state': 32},
   'best_score': 0.9664167206138641},
  'gb_classifier': {'best_params': {'criterion': 'squared_error',
    'loss': 'exponential',
    'max_depth': 9,
    'n_estimators': 200,
    'random_state': 32},
   'best_score': 0.9694727449945951}},
 'df_1': {'rf_classifier': {'best_params': {'criterion': 'entropy',
    'max_depth': 20,
    'n_estimators': 150,
    'oob_score': True,
    'random_state': 32},
   'best_score': 0.9637229430910921},

In [None]:
# for dataset0 -? random forest with 32random state, max_depth:60 criterion: gini, n_estimator: 200
# for dataset1 --> gradient boosting with max_depth 6, n_estimator: 150, random_State: 32, loss: exponential, criterion: friedman mse
# for dataset2 --> Gradient boosting with criterion: squared_error, loss; exponential, nax_depth: 9, n_estimator: 200, 