In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import pickle

In [2]:
main_df = pd.read_csv("preprocessed_files_from_db/preprocessed_data.csv")

In [3]:
main_df

Unnamed: 0,qty_slash_url,length_url,qty_dot_domain,qty_dot_directory,qty_hyphen_directory,qty_underline_directory,qty_slash_directory,qty_questionmark_directory,qty_and_directory,qty_space_directory,...,qty_plus_params,qty_asterisk_params,qty_dollar_params,qty_percent_params,qty_params,asn_ip,time_domain_activation,ttl_hostname,cluster_labels,labels
0,1,25,2,1,0,0,1,0,0,0,...,-1,-1,-1,-1,-1,60781,-1,892,2,1
1,3,223,2,3,0,0,3,0,0,0,...,0,0,0,0,3,36024,579,9540,2,1
2,1,15,2,0,0,0,1,0,0,0,...,-1,-1,-1,-1,-1,4766,-1,589,0,0
3,5,81,2,2,0,2,5,0,0,0,...,-1,-1,-1,-1,-1,20454,-1,292,0,1
4,0,19,2,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,53831,6998,3597,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88642,0,23,3,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,8560,5509,3597,0,0
88643,0,34,2,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,26496,5046,591,0,0
88644,5,70,1,1,1,0,5,0,0,0,...,-1,-1,-1,-1,-1,394695,1844,14391,1,1
88645,1,28,2,0,0,0,1,0,0,0,...,-1,-1,-1,-1,-1,47583,-1,52,2,1


In [3]:
clusters = main_df.cluster_labels.unique()

In [4]:
clusters

array([2, 0, 1])

In [5]:
cluster_dic = {}
for cluster in clusters: 
    cluster_dic["df_"+str(cluster)] = pd.DataFrame(main_df[main_df.cluster_labels==cluster])

In [6]:
cluster_dic["df_0"].drop("cluster_labels", inplace=True, axis=1)

In [7]:
cluster_dic["df_1"].drop("cluster_labels", inplace=True, axis=1)

In [8]:
cluster_dic["df_2"].drop("cluster_labels", inplace=True, axis=1)

In [9]:
with open("objects/selected_cols.obj", 'rb') as f: 
    selected_cols = pickle.load(f)

In [17]:
models_params_dict = {
    "rf_classifier": {
        "object": RandomForestClassifier(), 
        "params": {
            "criterion": ["gini", "entropy"],
            "n_estimators": [100, 150, 200],
            "max_depth": [20, 40, 60], 
            "oob_score": [True],
            "random_state": [32],
            
        }
    }, 
    "gb_classifier": {
        "object": GradientBoostingClassifier(), 
        "params": {
            "loss": ["exponential"],
            "n_estimators": [100, 150, 200],
            "criterion": ["friedman_mse", "squared_error"], 
            "max_depth": [3, 6, 9], 
            "random_state": [32], 
            
        }
    }
}

In [20]:
result = {
    
}

for cluster_no, cluster_df in cluster_dic.items(): 
   
    for classifier in models_params_dict: 
        print(classifier)
        grid_search_cv = GridSearchCV(models_params_dict[classifier].get("object"), models_params_dict[classifier].get("params"), n_jobs=15)
        grid_search_cv.fit(cluster_df[(selected_cols)], cluster_df.labels)
        if not result.get(cluster_no): 
            result[cluster_no] = {}
        
        if not result.get(cluster_no).get(classifier):  
            result[cluster_no][classifier] = {}
            
        result[cluster_no][classifier]["best_params"] = grid_search_cv.best_params_
        result[cluster_no][classifier]["best_score"] = grid_search_cv.best_score_
        print(result)

rf_classifier
{'df_2': {'rf_classifier': {'best_params': {'criterion': 'entropy', 'max_depth': 40, 'n_estimators': 200, 'oob_score': True, 'random_state': 32}, 'best_score': 0.9703979754033891}}}
gb_classifier
{'df_2': {'rf_classifier': {'best_params': {'criterion': 'entropy', 'max_depth': 40, 'n_estimators': 200, 'oob_score': True, 'random_state': 32}, 'best_score': 0.9703979754033891}, 'gb_classifier': {'best_params': {'criterion': 'squared_error', 'loss': 'exponential', 'max_depth': 9, 'n_estimators': 200, 'random_state': 32}, 'best_score': 0.9732282055473099}}}
rf_classifier
{'df_2': {'rf_classifier': {'best_params': {'criterion': 'entropy', 'max_depth': 40, 'n_estimators': 200, 'oob_score': True, 'random_state': 32}, 'best_score': 0.9703979754033891}, 'gb_classifier': {'best_params': {'criterion': 'squared_error', 'loss': 'exponential', 'max_depth': 9, 'n_estimators': 200, 'random_state': 32}, 'best_score': 0.9732282055473099}}, 'df_0': {'rf_classifier': {'best_params': {'criterio

In [21]:
result

{'df_2': {'rf_classifier': {'best_params': {'criterion': 'entropy',
    'max_depth': 40,
    'n_estimators': 200,
    'oob_score': True,
    'random_state': 32},
   'best_score': 0.9703979754033891},
  'gb_classifier': {'best_params': {'criterion': 'squared_error',
    'loss': 'exponential',
    'max_depth': 9,
    'n_estimators': 200,
    'random_state': 32},
   'best_score': 0.9732282055473099}},
 'df_0': {'rf_classifier': {'best_params': {'criterion': 'gini',
    'max_depth': 60,
    'n_estimators': 200,
    'oob_score': True,
    'random_state': 32},
   'best_score': 0.9664167206138641},
  'gb_classifier': {'best_params': {'criterion': 'squared_error',
    'loss': 'exponential',
    'max_depth': 9,
    'n_estimators': 200,
    'random_state': 32},
   'best_score': 0.9694727449945951}},
 'df_1': {'rf_classifier': {'best_params': {'criterion': 'entropy',
    'max_depth': 20,
    'n_estimators': 150,
    'oob_score': True,
    'random_state': 32},
   'best_score': 0.9637229430910921},

In [None]:
# for dataset0 -? random forest with 32random state, max_depth:60 criterion: gini, n_estimator: 200
# for dataset1 --> gradient boosting with max_depth 6, n_estimator: 150, random_State: 32, loss: exponential, criterion: friedman mse
# for dataset2 --> Gradient boosting with criterion: squared_error, loss; exponential, nax_depth: 9, n_estimator: 200, 

In [10]:
# Testing and Training dataset for cluster0 dataset
y1 = cluster_dic["df_0"].labels
cluster_dic["df_0"].drop("labels", inplace=True, axis=1)
X1 = cluster_dic["df_0"]
X1_train, X1_test, y1_train, y1_test = train_test_split(X1,
                                                        y1,
                                                        test_size=0.2,
                                                        stratify=y1
                                                       )

In [11]:
# Testing and Training dataset for cluster1 dataset
y2 = cluster_dic["df_1"].labels
cluster_dic["df_1"].drop("labels", inplace=True, axis=1)
X2 = cluster_dic["df_1"]
X2_train, X2_test, y2_train, y2_test = train_test_split(X2,
                                                        y2,
                                                        test_size=0.2,
                                                        stratify=y2
                                                       )

In [12]:
# Testing and Training dataset for cluster2 dataset
y3 = cluster_dic["df_2"].labels
cluster_dic["df_2"].drop("labels", inplace=True, axis=1)
X3 = cluster_dic["df_2"]
X3_train, X3_test, y3_train, y3_test = train_test_split(X3,
                                                        y3,
                                                        test_size=0.2,
                                                        stratify=y3
                                                       )

In [47]:
# model building for dataset0(cluster 0)

gradient_boosting_df0 = GradientBoostingClassifier(max_depth=9, criterion="squared_error", n_estimators=200, random_state=32, loss="exponential")
gradient_boosting_df0.fit(X1_train, y1_train)

In [48]:
# model building for dataset1(cluster 1)

gradient_boosting_df2 = GradientBoostingClassifier(max_depth=6, criterion="friedman_mse", n_estimators=150, random_state=32, loss="exponential")
gradient_boosting_df2.fit(X2_train, y2_train)

In [49]:
# model building for dataset2(cluster 2)

random_forest = RandomForestClassifier(max_depth=40, criterion="entropy", n_estimators=200, random_state=32)
random_forest.fit(X3_train, y3_train)

In [50]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, balanced_accuracy_score, cohen_kappa_score

## Cluster0 evaluation metrics..

In [52]:
y1_pred = gradient_boosting_df0.predict(X1_test)

In [54]:
confusion_matrix(y1_test, y1_pred)

array([[7922,  200],
       [ 203, 3652]])

In [55]:
accuracy_score(y1_test, y1_pred)

0.9663521750020874

In [56]:
balanced_accuracy_score(y1_test, y1_pred)

0.9613583193523156

In [58]:
print(classification_report(y1_test, y1_pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      8122
           1       0.95      0.95      0.95      3855

    accuracy                           0.97     11977
   macro avg       0.96      0.96      0.96     11977
weighted avg       0.97      0.97      0.97     11977



## Cluster1 evaluation metrics..

In [59]:
y2_pred = gradient_boosting_df2.predict(X2_test)

In [60]:
confusion_matrix(y2_test, y2_pred)

array([[226,  19],
       [  5, 274]])

In [61]:
accuracy_score(y2_test, y2_pred)

0.9541984732824428

In [62]:
balanced_accuracy_score(y2_test, y2_pred)

0.9522639163192159

In [63]:
print(classification_report(y2_test, y2_pred))

              precision    recall  f1-score   support

           0       0.98      0.92      0.95       245
           1       0.94      0.98      0.96       279

    accuracy                           0.95       524
   macro avg       0.96      0.95      0.95       524
weighted avg       0.96      0.95      0.95       524



## Cluster2 evaluation metrics..

In [64]:
y3_pred = random_forest.predict(X3_test)

In [65]:
confusion_matrix(y3_test, y3_pred)

array([[3148,   86],
       [  62, 1934]])

In [66]:
accuracy_score(y3_test, y3_pred)

0.9717017208413002

In [67]:
balanced_accuracy_score(y3_test, y3_pred)

0.9711727102938097

In [68]:
print(classification_report(y3_test, y3_pred))

              precision    recall  f1-score   support

           0       0.98      0.97      0.98      3234
           1       0.96      0.97      0.96      1996

    accuracy                           0.97      5230
   macro avg       0.97      0.97      0.97      5230
weighted avg       0.97      0.97      0.97      5230



In [14]:
# actual training of the model.

cluster0_model = GradientBoostingClassifier(max_depth=9, criterion="squared_error", n_estimators=200, random_state=32, loss="exponential")
cluster0_model.fit(X1, y1)

cluster1_model = GradientBoostingClassifier(max_depth=6, criterion="friedman_mse", n_estimators=150, random_state=32, loss="exponential")
cluster1_model.fit(X2, y2)

cluster2_model = RandomForestClassifier(max_depth=40, criterion="entropy", n_estimators=200, random_state=32)
cluster2_model.fit(X3, y3)

In [15]:
with open("objects/cluster0_model.obj", 'wb') as f: 
    pickle.dump(cluster0_model, f)

with open("objects/cluster1_model.obj", 'wb') as f: 
    pickle.dump(cluster1_model, f)
    
with open("objects/cluster2_model.obj", 'wb') as f: 
    pickle.dump(cluster2_model, f)