In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, precision_score, roc_auc_score, roc_curve, auc
from pathlib import Path
import pickle
from pathlib import Path
from bs4 import BeautifulSoup
from joblib import Parallel, delayed
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
data_path = Path.cwd().parent / "data" / "index feature extract.csv"
df = pd.read_csv(data_path)
df.rename(columns={"result": "result_prediction_output"}, inplace=True)
df.head()

Unnamed: 0,rec_id,url,website,result_prediction_output,created_date,tld,url_len,is_domain_IP,no_of_sub_domain,no_of_obfuscated_chars,...,has_title,has_description,has_external_form_submit,has_faviocn,no_of_images,no_of_js,has_password_field,has_copyright_info,has_hidden_field,no_financial_terms
0,1,http://intego3.info/EXEL/index.php,1613573972338075.html,1,2021-02-17 20:29:32,info,34,0,0,0,...,1,0,0,1,4,1,0,0,0,0
1,2,https://www.mathopenref.com/segment.html,1635698138155948.html,0,2021-10-31 16:35:38,com,40,0,1,0,...,1,0,0,0,2,4,0,0,1,0
2,3,https://www.computerhope.com/issues/ch000254.htm,1635699228889266.html,0,2021-10-31 16:53:48,com,48,0,1,0,...,1,1,0,1,5,8,0,1,0,1
3,4,https://www.investopedia.com/terms/n/next-elev...,1635750062162701.html,0,2021-11-01 12:31:02,com,52,0,1,0,...,1,1,0,1,19,6,0,0,0,3
4,5,https://jobs.emss.org.uk/lcc.aspx,161356510250721.html,0,2021-02-17 18:01:42,org.uk,33,0,1,0,...,1,1,0,1,12,17,0,0,1,1


In [3]:
model_path = Path.cwd().parent / "models" / "tfidf_vectorizer.pkl"
with open(model_path, 'rb') as file:
    vectorizer = pickle.load(file)

In [4]:
def content_vectorizer(file_name: str):
    file_path = Path.cwd().parent / "data" / "html data" / file_name
    text = file_path.read_text(encoding="utf-8", errors='ignore')
    soup = BeautifulSoup(text, "lxml")
    document = soup.get_text(separator=" ", strip=True).lower()
    vector = vectorizer.transform([document])
    return vector.toarray()

In [5]:
tfidf_matrix = np.vstack(Parallel(n_jobs=-1)(delayed(content_vectorizer)(site) for site in df['website']))
tfidf_feature_names = vectorizer.get_feature_names_out()

tfidf_df = pd.DataFrame(tfidf_matrix, columns=tfidf_feature_names)
df = pd.concat([df, tfidf_df], axis=1)



In [6]:
df.head()

Unnamed: 0,rec_id,url,website,result_prediction_output,created_date,tld,url_len,is_domain_IP,no_of_sub_domain,no_of_obfuscated_chars,...,well,window,within,without,word,work,world,would,year,youre
0,1,http://intego3.info/EXEL/index.php,1613573972338075.html,1,2021-02-17 20:29:32,info,34,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,https://www.mathopenref.com/segment.html,1635698138155948.html,0,2021-10-31 16:35:38,com,40,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,https://www.computerhope.com/issues/ch000254.htm,1635699228889266.html,0,2021-10-31 16:53:48,com,48,0,1,0,...,0.0,0.067924,0.049462,0.0,0.0,0.143686,0.0,0.0,0.0,0.0
3,4,https://www.investopedia.com/terms/n/next-elev...,1635750062162701.html,0,2021-11-01 12:31:02,com,52,0,1,0,...,0.035387,0.0,0.0,0.0,0.0,0.032638,0.250778,0.035038,0.0,0.0
4,5,https://jobs.emss.org.uk/lcc.aspx,161356510250721.html,0,2021-02-17 18:01:42,org.uk,33,0,1,0,...,0.064988,0.0,0.0,0.0,0.0,0.239758,0.0,0.064346,0.0,0.0


In [7]:
X = df.drop(['result', 'rec_id', 'url', 'created_date', 'website'], axis=1)
y = df['result_prediction_output']

In [8]:
num_columns = X.select_dtypes(include='number').columns
print(num_columns)
cat_columns = X.select_dtypes(exclude='number').columns
print(cat_columns)

num_pipe = Pipeline([
    ('scalar', StandardScaler())
])
cat_pipe = Pipeline([
    ('Ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ('scalar', StandardScaler())
])

ct = ColumnTransformer([
    ('numeric', num_pipe, num_columns),
    ('categoric', cat_pipe, cat_columns)
])

Index(['result_prediction_output', 'url_len', 'is_domain_IP',
       'no_of_sub_domain', 'no_of_obfuscated_chars', 'is_https', 'no_equal',
       'no_qmark', 'no_amp', 'has_title',
       ...
       'well', 'window', 'within', 'without', 'word', 'work', 'world', 'would',
       'year', 'youre'],
      dtype='object', length=416)
Index(['tld'], dtype='object')


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [10]:
X_train_norm = ct.fit_transform(X_train)
X_test_norm = ct.transform(X_test)

In [11]:
ct_store_path = Path.cwd().parent / "models" / "column_transformer.pkl"
with open(ct_store_path, 'wb') as file:
    pickle.dump(ct, file)

In [12]:
# tfidf_train = np.vstack(Parallel(n_jobs=-1)(delayed(content_vectorizer)(site) for site in X_train['website']))
# tfidf_test = np.vstack(Parallel(n_jobs=-1)(delayed(content_vectorizer)(site) for site in X_test['website']))

# X_train_combined = np.hstack([X_train_norm, tfidf_train])
# X_test_combined = np.hstack([X_test_norm, tfidf_test])
# X_train_combined.shape, X_test_combined.shape

# Training ML model

In [13]:
estimators = [
    ('bnb', BernoulliNB()),
    ('pa', PassiveAggressiveClassifier())
]

stack_model = StackingClassifier(
    estimators= estimators,
    final_estimator= LogisticRegression(),
    passthrough=True,
)

In [14]:
param_grid = {
    'bnb__alpha': [0.1, 0.5, 1.0, 5.0],
    'bnb__binarize': [0.0, 0.5, 1.0, None],
    'bnb__fit_prior': [True, False],
    'pa__C': [0.01, 0.1, 1, 10],
    'pa__loss': ['hinge', 'squared_hinge'],
    'pa__max_iter': [50, 100, 200],
    'pa__tol': [1e-4, 1e-3, 1e-2],
    'pa__shuffle': [True, False],
    'pa__fit_intercept': [True, False],
    'final_estimator__C': [0.1, 1.0, 10, 100],
    'final_estimator__penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'final_estimator__solver': ['liblinear', 'lbfgs', 'saga', 'newton-cg'],
    'final_estimator__max_iter': [100, 1000, 2500],
    'final_estimator__tol': [1e-4, 1e-3, 1e-2],
    'final_estimator__fit_intercept': [True, False],
    'final_estimator__class_weight': [None, 'balanced'],
}

cv = StratifiedKFold()

# grid_search = GridSearchCV(stack_model, param_grid=param_grid, n_jobs=-1, cv=cv)
random_search = RandomizedSearchCV(stack_model, param_distributions=param_grid, n_jobs=-1, cv=cv)

In [15]:
# grid_search.fit(X_train_norm, y_train)
random_search.fit(X_train_norm, y_train)

  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loa

0,1,2
,estimator,StackingClass...sthrough=True)
,param_distributions,"{'bnb__alpha': [0.1, 0.5, ...], 'bnb__binarize': [0.0, 0.5, ...], 'bnb__fit_prior': [True, False], 'final_estimator__C': [0.1, 1.0, ...], ...}"
,n_iter,10
,scoring,
,n_jobs,-1
,refit,True
,cv,StratifiedKFo...shuffle=False)
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,alpha,1.0
,force_alpha,True
,binarize,0.0
,fit_prior,True
,class_prior,

0,1,2
,C,0.1
,fit_intercept,False
,max_iter,50
,tol,0.001
,early_stopping,False
,validation_fraction,0.1
,n_iter_no_change,5
,shuffle,True
,verbose,0
,loss,'squared_hinge'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.01
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'saga'
,max_iter,100


In [23]:
pred = random_search.predict(X_test_norm)
print(f"accuracy score: {accuracy_score(y_test, pred)}")
print(f"f1_score: {f1_score(y_test, pred)}")
print(f"precision_score: {precision_score(y_test, pred)}")
print(f"confusion_matrix:\n{confusion_matrix(y_test, pred)}")
print(f"classification_report:\n{classification_report(y_test, pred)}")
print(f"roc_auc_score: {roc_auc_score(y_test, pred)}")

accuracy score: 1.0
f1_score: 1.0
precision_score: 1.0
confusion_matrix:
[[10000     0]
 [    0  6000]]
classification_report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10000
           1       1.00      1.00      1.00      6000

    accuracy                           1.00     16000
   macro avg       1.00      1.00      1.00     16000
weighted avg       1.00      1.00      1.00     16000

roc_auc_score: 1.0


In [17]:
dt = DecisionTreeClassifier()
dt.fit(X_train_norm, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [24]:
pred = dt.predict(X_test_norm)
print(f"accuracy score: {accuracy_score(y_test, pred)}")
print(f"f1_score: {f1_score(y_test, pred)}")
print(f"precision_score: {precision_score(y_test, pred)}")
print(f"confusion_matrix:\n{confusion_matrix(y_test, pred)}")
print(f"classification_report:\n{classification_report(y_test, pred)}")
print(f"roc_auc_score: {roc_auc_score(y_test, pred)}")

accuracy score: 1.0
f1_score: 1.0
precision_score: 1.0
confusion_matrix:
[[10000     0]
 [    0  6000]]
classification_report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10000
           1       1.00      1.00      1.00      6000

    accuracy                           1.00     16000
   macro avg       1.00      1.00      1.00     16000
weighted avg       1.00      1.00      1.00     16000

roc_auc_score: 1.0


In [19]:
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 3, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': [None, 'sqrt', 'log2']
}


cv = StratifiedKFold()

grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid, n_jobs=-1, cv=cv, verbose=3)
grid_search.fit(X_train_norm, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV 3/5] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2;, score=1.000 total time=   0.5s
[CV 5/5] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5;, score=1.000 total time=   0.5s
[CV 4/5] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5;, score=1.000 total time=   0.5s
[CV 2/5] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2;, score=1.000 total time=   0.5s
[CV 5/5] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2;, score=1.000 total time=   0.5s
[CV 2/5] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5;, score=1.000 total time=   0.5s
[CV 1/5] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5;, score=1.

  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


[CV 3/5] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=10;, score=1.000 total time=   0.5s
[CV 4/5] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=10;, score=1.000 total time=   0.5s
[CV 1/5] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=10;, score=1.000 total time=   0.5s
[CV 2/5] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=10;, score=1.000 total time=   0.5s
[CV 5/5] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=10;, score=1.000 total time=   0.5s
[CV 1/5] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=5, min_samples_split=2;, score=1.000 total time=   0.5s
[CV 2/5] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=5, min_samples_split=2;, score=1.000 total time=   0.5s
[CV 3/5] END criterion=gini, max_dep

0,1,2
,estimator,DecisionTreeClassifier()
,param_grid,"{'criterion': ['gini', 'entropy'], 'max_depth': [None, 3, ...], 'max_features': [None, 'sqrt', ...], 'min_samples_leaf': [1, 2, ...], ...}"
,scoring,
,n_jobs,-1
,refit,True
,cv,StratifiedKFo...shuffle=False)
,verbose,3
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [25]:
pred = grid_search.predict(X_test_norm)
print(f"accuracy score: {accuracy_score(y_test, pred)}")
print(f"f1_score: {f1_score(y_test, pred)}")
print(f"precision_score: {precision_score(y_test, pred)}")
print(f"confusion_matrix:\n{confusion_matrix(y_test, pred)}")
print(f"classification_report:\n{classification_report(y_test, pred)}")
print(f"roc_auc_score: {roc_auc_score(y_test, pred)}")

accuracy score: 1.0
f1_score: 1.0
precision_score: 1.0
confusion_matrix:
[[10000     0]
 [    0  6000]]
classification_report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10000
           1       1.00      1.00      1.00      6000

    accuracy                           1.00     16000
   macro avg       1.00      1.00      1.00     16000
weighted avg       1.00      1.00      1.00     16000

roc_auc_score: 1.0


In [26]:
xgbc = XGBClassifier()
xgbc.fit(X_train_norm, y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [27]:
pred = xgbc.predict(X_test_norm)
print(f"accuracy score: {accuracy_score(y_test, pred)}")
print(f"f1_score: {f1_score(y_test, pred)}")
print(f"precision_score: {precision_score(y_test, pred)}")
print(f"confusion_matrix:\n{confusion_matrix(y_test, pred)}")
print(f"classification_report:\n{classification_report(y_test, pred)}")
print(f"roc_auc_score: {roc_auc_score(y_test, pred)}")

accuracy score: 1.0
f1_score: 1.0
precision_score: 1.0
confusion_matrix:
[[10000     0]
 [    0  6000]]
classification_report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10000
           1       1.00      1.00      1.00      6000

    accuracy                           1.00     16000
   macro avg       1.00      1.00      1.00     16000
weighted avg       1.00      1.00      1.00     16000

roc_auc_score: 1.0


In [28]:
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'reg_alpha': [0, 0.01, 0.1],
    'reg_lambda': [1, 1.5, 2],
    'scale_pos_weight': [1, 3, 5]
}

rand_search = RandomizedSearchCV(estimator=XGBClassifier(), param_distributions=param_grid, scoring='roc_auc', cv=5, verbose=1,n_jobs=-1)
rand_search.fit(X_train_norm, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


0,1,2
,estimator,"XGBClassifier...ree=None, ...)"
,param_distributions,"{'colsample_bytree': [0.7, 0.8, ...], 'gamma': [0, 0.1, ...], 'learning_rate': [0.01, 0.1, ...], 'max_depth': [3, 5, ...], ...}"
,n_iter,10
,scoring,'roc_auc'
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.7
,device,
,early_stopping_rounds,
,enable_categorical,False


In [29]:
pred = rand_search.predict(X_test_norm)
print(f"accuracy score: {accuracy_score(y_test, pred)}")
print(f"f1_score: {f1_score(y_test, pred)}")
print(f"precision_score: {precision_score(y_test, pred)}")
print(f"confusion_matrix:\n{confusion_matrix(y_test, pred)}")
print(f"classification_report:\n{classification_report(y_test, pred)}")
print(f"roc_auc_score: {roc_auc_score(y_test, pred)}")

accuracy score: 1.0
f1_score: 1.0
precision_score: 1.0
confusion_matrix:
[[10000     0]
 [    0  6000]]
classification_report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10000
           1       1.00      1.00      1.00      6000

    accuracy                           1.00     16000
   macro avg       1.00      1.00      1.00     16000
weighted avg       1.00      1.00      1.00     16000

roc_auc_score: 1.0
