In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, precision_score, roc_auc_score, recall_score
from pathlib import Path
import pickle
from pathlib import Path
from bs4 import BeautifulSoup
from joblib import Parallel, delayed
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_recall_curve

In [2]:
data_path = Path.cwd().parent / "data" / "index feature extract.csv"
df = pd.read_csv(data_path)
df.rename(columns={"result": "result_prediction_output"}, inplace=True)
df.head()

Unnamed: 0,rec_id,url,website,result_prediction_output,created_date,tld,url_len,is_domain_IP,no_of_sub_domain,no_of_obfuscated_chars,...,has_description,has_external_form_submit,has_favicon,no_of_images,no_of_js,has_password_field,has_copyright_info,has_hidden_field,no_financial_terms,has_submit_button
0,1,http://intego3.info/EXEL/index.php,1613573972338075.html,1,2021-02-17 20:29:32,info,34,0,0,0,...,0,0,1,4,1,0,0,0,0,0
1,2,https://www.mathopenref.com/segment.html,1635698138155948.html,0,2021-10-31 16:35:38,com,40,0,1,0,...,0,0,0,2,4,0,0,1,0,0
2,3,https://www.computerhope.com/issues/ch000254.htm,1635699228889266.html,0,2021-10-31 16:53:48,com,48,0,1,0,...,1,0,1,5,8,0,1,0,1,0
3,4,https://www.investopedia.com/terms/n/next-elev...,1635750062162701.html,0,2021-11-01 12:31:02,com,52,0,1,0,...,1,0,1,19,6,0,0,0,3,1
4,5,https://jobs.emss.org.uk/lcc.aspx,161356510250721.html,0,2021-02-17 18:01:42,org.uk,33,0,1,0,...,1,0,1,12,17,0,0,1,1,0


In [3]:
# model_path = Path.cwd().parent / "models" / "tfidf_vectorizer.pkl"
# with open(model_path, 'rb') as file:
#     vectorizer = pickle.load(file)

In [4]:
# def content_vectorizer(file_name: str):
#     file_path = Path.cwd().parent / "data" / "html data" / file_name
#     text = file_path.read_text(encoding="utf-8", errors='ignore')
#     soup = BeautifulSoup(text, "lxml")
#     document = soup.get_text(separator=" ", strip=True).lower()
#     vector = vectorizer.transform([document])
#     return vector.toarray()

In [5]:
# tfidf_matrix = np.vstack(Parallel(n_jobs=-1)(delayed(content_vectorizer)(site) for site in df['website']))
# tfidf_feature_names = vectorizer.get_feature_names_out()

# tfidf_df = pd.DataFrame(tfidf_matrix, columns=tfidf_feature_names)
# df = pd.concat([df, tfidf_df], axis=1)

In [6]:
X = df.drop(['result_prediction_output', 'rec_id', 'url', 'created_date', 'website'], axis=1)
y = df['result_prediction_output']

In [7]:
num_columns = X.select_dtypes(include='number').columns
print(num_columns)
cat_columns = X.select_dtypes(exclude='number').columns
print(cat_columns)

num_pipe = Pipeline([
    ('scalar', StandardScaler())
])
cat_pipe = Pipeline([
    ('Ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ('scalar', StandardScaler())
])

ct = ColumnTransformer([
    ('numeric', num_pipe, num_columns),
    ('categoric', cat_pipe, cat_columns)
])

Index(['url_len', 'is_domain_IP', 'no_of_sub_domain', 'no_of_obfuscated_chars',
       'is_https', 'no_equal', 'no_qmark', 'no_amp', 'no_dot', 'no_underlines',
       'no_exclamation', 'no_tilde', 'no_vowels', 'has_title',
       'has_description', 'has_external_form_submit', 'has_favicon',
       'no_of_images', 'no_of_js', 'has_password_field', 'has_copyright_info',
       'has_hidden_field', 'no_financial_terms', 'has_submit_button'],
      dtype='object')
Index(['tld'], dtype='object')


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [9]:
X_train_norm = ct.fit_transform(X_train)
X_test_norm = ct.transform(X_test)

In [10]:
ct_store_path = Path.cwd().parent / "models" / "column_transformer.pkl"
with open(ct_store_path, 'wb') as file:
    pickle.dump(ct, file)

# Training ML model

# Stack Model

In [11]:
estimators = [
    ('bnb', BernoulliNB(alpha=0.01, binarize=0.0, class_prior=None, fit_prior=False, force_alpha=True)),
    ('pa', PassiveAggressiveClassifier(C=5, fit_intercept=False, n_iter_no_change=50, n_jobs=10, shuffle=True))
]

stack_model = StackingClassifier(
    estimators= estimators,
    final_estimator= SGDClassifier(alpha=0.01, eta0=100, n_iter_no_change=50, n_jobs=1),
    passthrough=True,
)

stack_model.fit(X_train_norm, y_train)

0,1,2
,estimators,"[('bnb', ...), ('pa', ...)]"
,final_estimator,"SGDClassifier...=50, n_jobs=1)"
,cv,
,stack_method,'auto'
,n_jobs,
,passthrough,True
,verbose,0

0,1,2
,alpha,0.01
,force_alpha,True
,binarize,0.0
,fit_prior,False
,class_prior,

0,1,2
,C,5
,fit_intercept,False
,max_iter,1000
,tol,0.001
,early_stopping,False
,validation_fraction,0.1
,n_iter_no_change,50
,shuffle,True
,verbose,0
,loss,'hinge'

0,1,2
,loss,'hinge'
,penalty,'l2'
,alpha,0.01
,l1_ratio,0.15
,fit_intercept,True
,max_iter,1000
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [12]:
pred = stack_model.predict(X_test_norm)
print(f"accuracy score: {accuracy_score(y_test, pred)}")
print(f"f1_score: {f1_score(y_test, pred)}")
print(f"precision_score: {precision_score(y_test, pred)}")
print(f"confusion_matrix:\n{confusion_matrix(y_test, pred)}")
print(f"classification_report:\n{classification_report(y_test, pred)}")
print(f"roc_auc_score: {roc_auc_score(y_test, pred)}")

accuracy score: 0.798875
f1_score: 0.7116487455197132
precision_score: 0.7695736434108527
confusion_matrix:
[[8811 1189]
 [2029 3971]]
classification_report:
              precision    recall  f1-score   support

           0       0.81      0.88      0.85     10000
           1       0.77      0.66      0.71      6000

    accuracy                           0.80     16000
   macro avg       0.79      0.77      0.78     16000
weighted avg       0.80      0.80      0.80     16000

roc_auc_score: 0.7714666666666667


In [13]:
file_path = Path.cwd().parent / "models" / "stack_model.pkl"
with open(file_path, 'wb') as file:
    pickle.dump(stack_model, file)

# XGBoost

In [14]:
xgbc = XGBClassifier(
    subsample=1.0,
    scale_pos_weight=5,
    reg_lambda=1.5,
    reg_alpha=0,
    n_estimators=500,
    min_child_weight=0.5,
    max_depth=12,
    max_delta_step=1,
    learning_rate=0.05,
    gamma=0.1,
    colsample_bytree=0.5,
    eval_metric='logloss'
)
xgbc.fit(X_train_norm, y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.5
,device,
,early_stopping_rounds,
,enable_categorical,False


In [15]:
probs = xgbc.predict_proba(X_test_norm)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_test, probs)

# Find threshold with high precision and acceptable recall
fscore = 2 * (precisions * recalls) / (precisions + recalls)
best_idx = np.argmax(fscore)
best_threshold = thresholds[best_idx]

print(f"Best threshold: {best_threshold}")

# Predict with new threshold
pred_adj = (probs >= best_threshold).astype(int)
print(f"Adjusted precision: {precision_score(y_test, pred_adj)}")
print(f"Adjusted false positives: {(pred_adj & (y_test == 0)).sum()}")

Best threshold: 0.71337890625
Adjusted precision: 0.9281191168920075
Adjusted false positives: 420


In [16]:
probs = xgbc.predict_proba(X_test_norm)[:, 1]
pred = (probs >= best_threshold).astype(int)
print(f"accuracy score: {accuracy_score(y_test, pred)}")
print(f"f1_score: {f1_score(y_test, pred)}")
print(f"precision_score: {precision_score(y_test, pred)}")
print(f"recall_score: {recall_score(y_test, pred)}")
print(f"confusion_matrix:\n{confusion_matrix(y_test, pred)}")
print(f"classification_report:\n{classification_report(y_test, pred)}")
print(f"roc_auc_score: {roc_auc_score(y_test, pred)}")

accuracy score: 0.9376875
f1_score: 0.9158152495144811
precision_score: 0.9281191168920075
recall_score: 0.9038333333333334
confusion_matrix:
[[9580  420]
 [ 577 5423]]
classification_report:
              precision    recall  f1-score   support

           0       0.94      0.96      0.95     10000
           1       0.93      0.90      0.92      6000

    accuracy                           0.94     16000
   macro avg       0.94      0.93      0.93     16000
weighted avg       0.94      0.94      0.94     16000

roc_auc_score: 0.9309166666666666


In [17]:
file_path = Path.cwd().parent / "models" / "xgbc.pkl"
with open(file_path, 'wb') as file:
    pickle.dump(xgbc, file)

# Decision Tree

In [18]:
dt = DecisionTreeClassifier()
dt.fit(X_train_norm, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [19]:
probs = dt.predict_proba(X_test_norm)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_test, probs)

# Find threshold with high precision and acceptable recall
fscore = 2 * (precisions * recalls) / (precisions + recalls)
best_idx = np.argmax(fscore)
best_threshold = thresholds[best_idx]

print(f"Best threshold: {best_threshold}")

# Predict with new threshold
pred_adj = (probs >= best_threshold).astype(int)
print(f"Adjusted precision: {precision_score(y_test, pred_adj)}")
print(f"Adjusted false positives: {(pred_adj & (y_test == 0)).sum()}")

Best threshold: 0.5
Adjusted precision: 0.8564897959183674
Adjusted false positives: 879


In [20]:
probs = dt.predict_proba(X_test_norm)[:, 1]
pred = (probs >= best_threshold).astype(int)
print(f"accuracy score: {accuracy_score(y_test, pred)}")
print(f"f1_score: {f1_score(y_test, pred)}")
print(f"precision_score: {precision_score(y_test, pred)}")
print(f"confusion_matrix:\n{confusion_matrix(y_test, pred)}")
print(f"classification_report:\n{classification_report(y_test, pred)}")
print(f"roc_auc_score: {roc_auc_score(y_test, pred)}")

accuracy score: 0.8979375
f1_score: 0.865319587628866
precision_score: 0.8564897959183674
confusion_matrix:
[[9121  879]
 [ 754 5246]]
classification_report:
              precision    recall  f1-score   support

           0       0.92      0.91      0.92     10000
           1       0.86      0.87      0.87      6000

    accuracy                           0.90     16000
   macro avg       0.89      0.89      0.89     16000
weighted avg       0.90      0.90      0.90     16000

roc_auc_score: 0.8932166666666667


In [21]:
file_path = Path.cwd().parent / "models" / "dt.pkl"
with open(file_path, 'wb') as file:
    pickle.dump(dt, file)

# Random Forest

In [23]:
rf = RandomForestClassifier()
rf.fit(X_train_norm, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [24]:
probs = rf.predict_proba(X_test_norm)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_test, probs)

# Find threshold with high precision and acceptable recall
fscore = 2 * (precisions * recalls) / (precisions + recalls)
best_idx = np.argmax(fscore)
best_threshold = thresholds[best_idx]

print(f"Best threshold: {best_threshold}")

# Predict with new threshold
pred_adj = (probs >= best_threshold).astype(int)
print(f"Adjusted precision: {precision_score(y_test, pred_adj)}")
print(f"Adjusted false positives: {(pred_adj & (y_test == 0)).sum()}")

Best threshold: 0.49
Adjusted precision: 0.9186922286888052
Adjusted false positives: 475


In [25]:
probs = rf.predict_proba(X_test_norm)[:, 1]
pred = (probs >= best_threshold).astype(int)
print(f"accuracy score: {accuracy_score(y_test, pred)}")
print(f"f1_score: {f1_score(y_test, pred)}")
print(f"precision_score: {precision_score(y_test, pred)}")
print(f"confusion_matrix:\n{confusion_matrix(y_test, pred)}")
print(f"classification_report:\n{classification_report(y_test, pred)}")
print(f"roc_auc_score: {roc_auc_score(y_test, pred)}")

accuracy score: 0.93075
f1_score: 0.9064347238642121
precision_score: 0.9186922286888052
confusion_matrix:
[[9525  475]
 [ 633 5367]]
classification_report:
              precision    recall  f1-score   support

           0       0.94      0.95      0.95     10000
           1       0.92      0.89      0.91      6000

    accuracy                           0.93     16000
   macro avg       0.93      0.92      0.93     16000
weighted avg       0.93      0.93      0.93     16000

roc_auc_score: 0.9234999999999999


# Hyper-Parameter Tuning

In [None]:
estimators = [
    ('bnb', BernoulliNB()),
    ('pa', PassiveAggressiveClassifier())
]

stack_model = StackingClassifier(
    estimators= estimators,
    final_estimator= LogisticRegression(),
    passthrough=True,
)

param_grid = {
    'bnb__alpha': [0.1, 0.5, 1.0, 5.0],
    'bnb__binarize': [0.0, 0.5, 1.0, None],
    'bnb__fit_prior': [True, False],
    'pa__C': [0.01, 0.1, 1, 10],
    'pa__loss': ['hinge', 'squared_hinge'],
    'pa__max_iter': [50, 100, 200],
    'pa__tol': [1e-4, 1e-3, 1e-2],
    'pa__shuffle': [True, False],
    'pa__fit_intercept': [True, False],
    'final_estimator__C': [0.1, 1.0, 10, 100],
    'final_estimator__penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'final_estimator__solver': ['liblinear', 'lbfgs', 'saga', 'newton-cg'],
    'final_estimator__max_iter': [100, 1000, 2500],
    'final_estimator__tol': [1e-4, 1e-3, 1e-2],
    'final_estimator__fit_intercept': [True, False],
    'final_estimator__class_weight': [None, 'balanced'],
}

cv = StratifiedKFold()

# grid_search = GridSearchCV(stack_model, param_grid=param_grid, n_jobs=-1, cv=cv)
random_search = RandomizedSearchCV(stack_model, param_distributions=param_grid, n_jobs=-1, cv=cv)

In [None]:
# grid_search.fit(X_train_norm, y_train)
random_search.fit(X_train_norm, y_train)

In [None]:
pred = random_search.predict(X_test_norm)
print(f"accuracy score: {accuracy_score(y_test, pred)}")
print(f"f1_score: {f1_score(y_test, pred)}")
print(f"precision_score: {precision_score(y_test, pred)}")
print(f"confusion_matrix:\n{confusion_matrix(y_test, pred)}")
print(f"classification_report:\n{classification_report(y_test, pred)}")
print(f"roc_auc_score: {roc_auc_score(y_test, pred)}")

In [None]:
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 3, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': [None, 'sqrt', 'log2']
}


cv = StratifiedKFold()

grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid, n_jobs=-1, cv=cv, verbose=3)
grid_search.fit(X_train_norm, y_train)

In [None]:
pred = grid_search.predict(X_test_norm)
print(f"accuracy score: {accuracy_score(y_test, pred)}")
print(f"f1_score: {f1_score(y_test, pred)}")
print(f"precision_score: {precision_score(y_test, pred)}")
print(f"confusion_matrix:\n{confusion_matrix(y_test, pred)}")
print(f"classification_report:\n{classification_report(y_test, pred)}")
print(f"roc_auc_score: {roc_auc_score(y_test, pred)}")

In [None]:
param_grid = {
    'max_depth': [3, 5, 7, 9, 12],
    'learning_rate': [0.001, 0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300, 400, 500],
    'min_child_weight': [0.5, 1, 3, 5],
    'gamma': [0, 0.05, 0.1, 0.2],
    'subsample': [0.5, 0.7, 0.8, 1.0],
    'colsample_bytree': [0.5, 0.7, 0.8, 1.0],
    'reg_alpha': [0, 0.01, 0.1, 1],
    'reg_lambda': [1, 1.5, 2, 3],
    'scale_pos_weight': [1, 3, 5, 7],
    'max_delta_step': [0, 1, 3]  # For imbalanced classes
}

xgb = XGBClassifier(eval_metric='logloss')

rand_search = RandomizedSearchCV(
    estimator=xgb, param_distributions=param_grid, scoring='roc_auc',
    cv=20, verbose=3, n_jobs=-1, n_iter=1000, random_state=42
)
rand_search.fit(X_train_norm, y_train)

In [None]:
probs = rand_search.predict_proba(X_test_norm)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_test, probs)

# Find threshold with high precision and acceptable recall
fscore = 2 * (precisions * recalls) / (precisions + recalls)
best_idx = np.argmax(fscore)
best_threshold = thresholds[best_idx]

print(f"Best threshold: {best_threshold}")

# Predict with new threshold
pred_adj = (probs >= best_threshold).astype(int)
print(f"Adjusted precision: {precision_score(y_test, pred_adj)}")
print(f"Adjusted false positives: {(pred_adj & (y_test == 0)).sum()}")

In [None]:
probs = rand_search.predict_proba(X_test_norm)[:, 1]
pred = (probs >= 0.713).astype(int)
print(f"accuracy score: {accuracy_score(y_test, pred)}")
print(f"f1_score: {f1_score(y_test, pred)}")
print(f"precision_score: {precision_score(y_test, pred)}")
print(f"recoll_score: {recall_score(y_test, pred)}")
print(f"confusion_matrix:\n{confusion_matrix(y_test, pred)}")
print(f"classification_report:\n{classification_report(y_test, pred)}")
print(f"roc_auc_score: {roc_auc_score(y_test, pred)}")

In [None]:
rand_search.best_params_