In [10]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier, XGBRFClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score
import pickle
import re

In [11]:
df = pd.read_csv(r"C:/Users/s7522/Desktop/detection_system/dataset/data_url.csv")

In [12]:
df['label'] = df['label'].map({'bad': 0, 'good': 1})


In [13]:
df

Unnamed: 0,url,label
0,diaryofagameaddict.com,0
1,espdesign.com.au,0
2,iamagameaddict.com,0
3,kalantzis.net,0
4,slightlyoffcenter.net,0
...,...,...
420459,23.227.196.215/,0
420460,apple-checker.org/,0
420461,apple-iclods.org/,0
420462,apple-uptoday.org/,0


In [14]:
def clean_text(text):
    text = text.lower()
    token = re.split(r'[\/\-._:]', text)
    token.append(text)
    if "www" in token:
        token.remove("www")
    if "com" in token:
        token.remove("com")
    if "https" in token:
        token.remove("https")
    if  "http" in token:
        token.remove("http")
    token = token[:-1]
    return token


In [15]:
#ML 
urls = df["url"]
y = df["label"]
vectorizer = TfidfVectorizer(tokenizer=clean_text)
x = vectorizer.fit_transform(urls)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)



In [16]:
y_train

226702    1
370689    1
304005    1
235092    1
293973    1
         ..
259178    1
365838    1
131932    1
146867    1
121958    1
Name: label, Length: 336371, dtype: int64

In [61]:
models = {"XGBClassifier" : XGBClassifier(), "XGBRFClassifier" : XGBRFClassifier(), 
          "LGBMClassifier" : LGBMClassifier(),  
          "SVC": SVC(), "RandomForestClassifier" : RandomForestClassifier()}

for name, model in models.items():
    print(name)
    train = model.fit(x_train, y_train)
    prediction = model.predict(x_test)
    print("F1 Score")
    print(f1_score(y_test, prediction))
    print("Roc Auc Score")
    print(roc_auc_score(y_test, prediction))
    print("Confusion Matrix")
    print(confusion_matrix(y_test, prediction))

XGBClassifier
F1 Score
0.954627240393333
Roc Auc Score
0.8005977223433711
Confusion Matrix
[[ 9145  5819]
 [  687 68442]]
XGBRFClassifier
F1 Score
0.9256181514521976
Roc Auc Score
0.6590548558131322
Confusion Matrix
[[ 4968  9996]
 [  960 68169]]
LGBMClassifier
[LightGBM] [Info] Number of positive: 275692, number of negative: 60679
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.356521 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 207661
[LightGBM] [Info] Number of data points in the train set: 336371, number of used features: 10484
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.819607 -> initscore=1.513687
[LightGBM] [Info] Start training from score 1.513687
F1 Score
0.9567832167832168
Roc Auc Score
0.81232831323348
Confusion Matrix
[[ 9503  5461]
 [  719 68410]]
SVC


In [17]:
#Best Model is : LGBMClassifier
lgbm_classifier = LGBMClassifier().fit(x_train, y_train)

[LightGBM] [Info] Number of positive: 275692, number of negative: 60679
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 5.779803 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 207661
[LightGBM] [Info] Number of data points in the train set: 336371, number of used features: 10484
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.819607 -> initscore=1.513687
[LightGBM] [Info] Start training from score 1.513687


In [18]:
file = "url_model.pkl"
with open(file, 'wb') as f:
    pickle.dump(lgbm_classifier, f)
f.close()

file2 = "pickel_vector.pkl"
with open(file2,'wb') as f2:
    pickle.dump(vectorizer, f2)
f2.close()
 