In [433]:
import pandas as pd
import numpy as np
import klib
from sklearn.utils import shuffle
import validators

In [434]:
# load the data from csv file
df = pd.read_csv('urldata.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,url,label,result
0,0,https://www.google.com,benign,0
1,1,https://www.youtube.com,benign,0
2,2,https://www.facebook.com,benign,0
3,3,https://www.baidu.com,benign,0
4,4,https://www.wikipedia.org,benign,0


# Data Preprocessing

In [435]:
# shuffle data
df = shuffle(df)
df.head()

Unnamed: 0.1,Unnamed: 0,url,label,result
25107,25107,https://www.dotconnectorblog.com/inspirational...,benign,0
71683,71683,https://www.mhpstrong.com/MHP/athletes/strongm...,benign,0
335824,335824,https://www.youtube.com/watch?v=m0oikdh7Z7Y,benign,0
339417,339417,https://www.lexmark.com/en_US/,benign,0
270563,270563,https://www.paperbackswap.com/Lullabies-Little...,benign,0


In [436]:
# rename index column
df = df.rename(columns={"Unnamed: 0":"index"})
df.head()

Unnamed: 0,index,url,label,result
25107,25107,https://www.dotconnectorblog.com/inspirational...,benign,0
71683,71683,https://www.mhpstrong.com/MHP/athletes/strongm...,benign,0
335824,335824,https://www.youtube.com/watch?v=m0oikdh7Z7Y,benign,0
339417,339417,https://www.lexmark.com/en_US/,benign,0
270563,270563,https://www.paperbackswap.com/Lullabies-Little...,benign,0


In [437]:
df.shape

(450176, 4)

In [438]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 450176 entries, 25107 to 428468
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   index   450176 non-null  int64 
 1   url     450176 non-null  object
 2   label   450176 non-null  object
 3   result  450176 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 17.2+ MB


In [439]:
# using klib to convert existing dtypes to more efficient dtypes
df = klib.convert_datatypes(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 450176 entries, 25107 to 428468
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype   
---  ------  --------------   -----   
 0   index   450176 non-null  int32   
 1   url     450176 non-null  string  
 2   label   450176 non-null  category
 3   result  450176 non-null  int8    
dtypes: category(1), int32(1), int8(1), string(1)
memory usage: 9.4 MB


In [440]:
df.isnull().sum().sum()

0

In [441]:
df['url'].nunique() 

450176

In [442]:
df['result'].value_counts()

0    345738
1    104438
Name: result, dtype: int64

In [443]:
# Class count
count_class_0, count_class_1 = df.result.value_counts()

# Divide by class
df_class_0 = df[df['result'] == 0]
df_class_1 = df[df['result'] == 1]

In [444]:
# oversample 1-class and concat the DataFrames of both classes

df_class_1_over = df_class_1.sample(count_class_0, replace=True)
df_over = pd.concat([df_class_0, df_class_1_over], axis=0)

print('Random over-sampling:')
print(df_over.result.value_counts())

Random over-sampling:
0    345738
1    345738
Name: result, dtype: int64


In [445]:
df_over = shuffle(df_over)

In [446]:
X = df_over['url']
y = df_over['result']

# text preprocessing

In [447]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

In [448]:
text = X

In [449]:
Tokenizer = Tokenizer()

In [450]:
Tokenizer.fit_on_texts(text) 
Tokenizer_vocab_size = len(Tokenizer.word_index)+1
Tokenizer_vocab_size

416125

In [451]:
X.shape, y.shape

((691476,), (691476,))

In [452]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.4, random_state=42)

In [453]:
X_train.shape, X_val.shape

((414885,), (276591,))

In [454]:
X_train_encoded_words = Tokenizer.texts_to_sequences(X_train)
X_val_encoded_words = Tokenizer.texts_to_sequences(X_val)

In [455]:
X_train_encoded_padded_words = seq.pad_sequences(X_train_encoded_words, maxlen=100) 
X_val_encoded_padded_words = seq.pad_sequences(X_val_encoded_words, maxlen=100)

X_train_encoded_padded_words.shape, X_val_encoded_padded_words.shape

((414885, 100), (276591, 100))

In [456]:
X_train_encoded_padded_words, X_val_encoded_padded_words

(array([[     0,      0,      0, ..., 175866,   5081,      1],
        [     0,      0,      0, ...,    354,    169,  93164],
        [     0,      0,      0, ...,      4,      9,      5],
        ...,
        [     0,      0,      0, ...,    668,    177,    186],
        [     0,      0,      0, ...,     31,    261,   3461],
        [     0,      0,      0, ...,    375,   8858,     11]]),
 array([[     0,      0,      0, ...,   7015,     16, 128164],
        [     0,      0,      0, ...,  11215,    573, 185928],
        [     0,      0,      0, ...,   2440,  21102, 193105],
        ...,
        [     0,      0,      0, ...,      6,   7413,   9584],
        [     0,      0,      0, ...,     24, 115047,      6],
        [     0,      0,      0, ...,    147,    144,  56795]]))

# Model

In [457]:
# xgboost
from xgboost import XGBClassifier
from sklearn.metrics  import accuracy_score

model = XGBClassifier().fit(X_train_encoded_padded_words , y_train)





In [458]:
ypred_xgb = model.predict(X_val_encoded_padded_words)

In [459]:
np.array(y_val.head())

array([1, 0, 0, 0, 1], dtype=int8)

In [460]:
ypred_xgb[0:5]

array([1, 0, 0, 0, 1], dtype=int8)

In [461]:
accuracy = accuracy_score(y_val, ypred_xgb)
print("accuracy: {:.2f}".format(accuracy*100))

accuracy: 99.67


# Predictions

In [462]:
# save the model and tokenizer
import pickle
pickle.dump(Tokenizer, open('models/tokenizer.pkl','wb'))
pickle.dump(model, open('models/xgb_model.pkl','wb'))

In [463]:
# load the saved model  and tokenizer
Tokenizer = pickle.load(open('models/tokenizer.pkl','rb'))
saved_model = pickle.load(open('models/xgb_model.pkl','rb'))

In [464]:
def predict(text):
    url = text
    valid = validators.url(url)
    if(valid==True):
        
        tokens = Tokenizer.texts_to_sequences([url])
        tokens = sequence.pad_sequences(tokens, maxlen=100)
        pred = saved_model.predict(np.array(tokens),)
        classes = ['Benign url','Malicious url']
        result = classes[pred[0]]
        return result
    else:
        return ("Entered url is Invalid")

In [465]:
predict("https://www.youtube.com/")

'Benign url'

In [466]:
predict("https://www.w3schools.com/sql/sql_orderby.asp")

'Benign url'

In [467]:
predict("https://insidethestorex.com/sd/")

'Malicious url'

In [468]:
predict("http://i_am_not_url.com")

'Entered url is Invalid'

In [469]:
predict("https://1fichier.com/?0ewdpuct0nz0ck46vi6h")

'Malicious url'

In [470]:
predict("https://skidrowreloaded.ufile.io/k1mfa3fi")

'Malicious url'