<a href="https://colab.research.google.com/github/practicalClerk/Machine-Learning/blob/main/phishing_ml_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## We converted raw URLs into numerical features. These features represent structural and semantic characteristics of phishing URLs, making the data suitable for machine learning models.

## Importing the required Libraries...

In [None]:
import pandas as pd
import numpy as np
import re

## Load the dataset and the features...


In [None]:
# Read the labelled raw-URL dataset, then pull the URL and label columns out
# as arrays. The name `lables` (sic) is kept because a later cell reads it.
dataset = pd.read_csv('phishing_raw_labeled_dataset.csv')
urls, lables = (dataset[col].values for col in ('url', 'label'))

## Function to loop through all the urls row by row and extract features...
The following features are being extracted :

    URL
    URL Length
    Count of '.'
    Count of '/'
    Count of '-'
    Check whether 'https' is present or not
    Check whether IP Address is present or not
    Check whether suspicious keywords are present or not: 'login', 'verify', 'update', 'secure', 'account'


In [None]:
# One IPv4 octet (0-255, leading zeros tolerated as in the original pattern).
_IPV4_OCTET = r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)'

# Four dot-separated octets that are not glued to a word character, dot or
# hyphen on either side, so "http://192.168.1.1/login" matches while
# "256.1.1.1", "1.2.3.4.com" and "file-v1.2.3.4.zip" do not.
_IPV4_PATTERN = re.compile(
    r'(?<![\w.-])' + _IPV4_OCTET + r'(?:\.' + _IPV4_OCTET + r'){3}(?![\w.-])'
)

# Keywords looked up (case-insensitively) anywhere in the URL.
_SUSPICIOUS_WORDS = ('login', 'verify', 'update', 'secure', 'account')


def extract_features(url):
  """Turn one URL string into a flat list of features.

  Args:
    url: The URL as a string.

  Returns:
    A 12-element list, in this order:
      0. the URL itself (unchanged)
      1. length of the URL
      2. number of '.' characters
      3. number of '/' characters
      4. number of '-' characters
      5. 1 if the URL uses the https scheme (starts with 'https://',
         compared case-insensitively), else 0
      6. 1 if the URL contains a standalone dotted-quad IPv4 address, else 0
      7-11. one 0/1 flag per keyword in 'login', 'verify', 'update',
         'secure', 'account' (case-insensitive substring test)
  """
  # Lower-case once; reused for the scheme check and every keyword lookup.
  lowered = url.lower()

  feature_lst = [
      url,
      len(url),
      url.count('.'),
      url.count('/'),
      url.count('-'),
      # Require the full "https://" prefix so hosts such as "httpsecure.com"
      # are not mistaken for HTTPS URLs.
      1 if lowered.startswith('https://') else 0,
      1 if _IPV4_PATTERN.search(url) else 0,
  ]

  feature_lst.extend(1 if word in lowered else 0 for word in _SUSPICIOUS_WORDS)

  return feature_lst


## Applying Feature Extraction to all the URLs...

In [None]:
# One feature row per URL, in the same order as `urls`.
final_feature_lst = [extract_features(url) for url in urls]

# Each row mixes the URL string with integer features. Without an explicit
# dtype NumPy would promote every value to a string; dtype=object keeps the
# integers as integers.
X = np.array(final_feature_lst, dtype=object)
y = np.array(lables)

## DataFrame with feature names

In [None]:
# Column names, listed in the same order extract_features() emits its values:
# the URL, the structural counts and flags, then one flag per keyword.
feature_names = (
    ['url', 'url_length', 'num_dots', 'num_slashes', 'num_hyphens',
     'has_https', 'has_ip']
    + ['kw_' + word
       for word in ('login', 'verify', 'update', 'secure', 'account')]
)

# Wrap the feature matrix in a labelled frame and attach the target column.
features_df = pd.DataFrame(data=X, columns=feature_names)
features_df['label'] = y


In [None]:
# Write the feature table to disk without the index column, then confirm.
features_df.to_csv(
    path_or_buf='phishing_features.csv',
    index=False,
)
print("Feature extraction completed. File saved as phishing_features.csv")


Feature extraction completed. File saved as phishing_features.csv


## Data pre-processing:


# Importing necessary libraries

In [None]:
import numpy as np
import pandas as pd

# Loading the dataset and the x and y feature matrices

In [None]:
# Reload the feature table that an earlier cell saved as phishing_features.csv.
phishing_data = pd.read_csv('phishing_features.csv')

In [None]:
# Names of the columns whose dtype is 'category' or 'object'.
cat_features = phishing_data.select_dtypes(include = ['category','object']).columns

In [None]:
# Names of the columns whose dtype is int64 or float64 (other numeric widths
# are not selected by this filter).
num_features = phishing_data.select_dtypes(include = ['int64','float64']).columns

In [None]:
# Show which columns were classified as categorical.
print("Categorical data :",cat_features)

Categorical data : Index(['url'], dtype='object')


In [None]:
# Show which columns were classified as numerical (the saved output includes
# the 'label' target column).
print("numerical features :",num_features)

numerical features : Index(['url_length', 'num_dots', 'num_slashes', 'num_hyphens', 'has_https',
       'has_ip', 'kw_login', 'kw_verify', 'kw_update', 'kw_secure',
       'kw_account', 'label'],
      dtype='object')


In [None]:
# Feature matrix: every column except the last one (the label). The first
# column is the raw URL string, so the array is not purely numeric.
x = phishing_data.iloc[:, :-1].to_numpy()

In [None]:
# Target vector: the last column of the table (the label).
y = phishing_data.iloc[:, -1].to_numpy()

In [None]:
# Show the feature matrix.
# NOTE(review): the saved output below has no URL column, although x is built
# from every column except the label — it looks like output from an earlier
# run; re-execute to confirm.
print(x)

[[11  1  0 ...  0  0  0]
 [12  1  0 ...  0  0  0]
 [12  1  0 ...  0  0  0]
 ...
 [13  1  0 ...  0  0  0]
 [61  2  3 ...  0  0  0]
 [87  1 12 ...  0  0  0]]


In [None]:
# Show the label vector.
print(y)

[0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 1 0 1 1 1 0 0 0 1 1 0 1 1 1 0
 0 0 1 1 1 1 0 1 1 0 0 1 0 0 0 1 0 1 1 0 1 1 0 1 1 0 1 0 0 0 0 1 0 1 1 0 0
 1 1 0 1 0 1 0 1 1 1 1 0 0 0 0 1 1 1 0 1 0 1 0 1 1 0 0 1 0 1 0 0 0 0 1 0 0
 0 0 0 1 1 1 0 0 1 0 1 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 1 1 1 0 1 0 0 0 1
 0 1 0 1 0 1 0 1 1 0 1 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 0 0
 1 1 1 0 0 0 0 0 1 1 1 1 1 0 1 1 0 0 0 0 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0
 0 1 0 1 0 1 0 0 1 0 1 0 1 1 1 0 1 1 1 1 0 0 1 1 1 1 1 1 1 0 0 0 1 1 0 0 0
 1 0 1 1 1 1 0 0 1 0 1 1 1 0 1 0 1 1 0 1 0 0 0 1 1 1 0 1 1 1 0 0 0 0 1 1 1
 0 1 1 1 0 1 1 1 0 0 1 0 0 0 0 0 0 0 1 1 1 1 0 1 1 1 1 0 0 0 1 1 0 1 0 1 0
 0 1 1 0 0 0 0 1 1 1 1 1 1 0 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 0
 1 0 1 1 1 0 1 1 0 0 1 1 0 1 1 0 0 1 0 0 1 1 0 1 0 1 1 1 0 0 1 1 0 0 0 0 0
 1 1 0 0 0 0 1 0 0 0 0 0 1 1 0 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 0 1 0 0 0
 0 0 0 1 0 0 0 0 1 1 1 0 0 0 1 1 1 1 0 0 1 1 1 1 0 1 1 1 0 1 1 0 0 0 0 0 0
 1 1 1 0 1 1 1 1 1 0 0 0 

# Identifying missing data (if any) and handling it by filling in the mean of that feature column.

In [None]:
from sklearn.impute import SimpleImputer
# Replace NaN entries with the mean of their column.
imputer = SimpleImputer(missing_values = np.nan, strategy='mean')
# Columns 1-11 of x are the numeric features (column 0 is the URL string);
# the imputed values are written back into x in place.
# NOTE(review): the imputer is fitted on the full matrix, before the
# train/test split done in a later cell, so test rows contribute to the
# imputed means — confirm this is acceptable.
x[:,1:12] = imputer.fit_transform(x[:,1:12])


In [None]:
# Show the feature matrix after imputation (the URL column plus the numeric
# columns).
print(x)

[['wasalat.net' 11.0 1.0 ... 0.0 0.0 0.0]
 ['fy-hotel.com' 12.0 1.0 ... 0.0 0.0 0.0]
 ['elemecdn.com' 12.0 1.0 ... 0.0 0.0 0.0]
 ...
 ['communick.com' 13.0 1.0 ... 0.0 0.0 0.0]
 ['tsrocks.com/m/merle_haggard_texts/someday_well_look_back.html' 61.0
  2.0 ... 0.0 0.0 0.0]
 ['http://distractify.com/post/related/id/5457e73d4a0c4b4a65fed9b0/skip/30/limit/10/back/0'
  87.0 1.0 ... 0.0 0.0 0.0]]


# Splitting the dataset into train (80%) and test (20%) sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)


In [None]:
# Show the training feature rows.
print(x_train)

[['quick.com.br' 12.0 2.0 ... 0.0 0.0 0.0]
 ['recovery-help-centre20183101.000webhostapp.com' 46.0 2.0 ... 0.0 0.0
  0.0]
 ['godubai.com' 11.0 1.0 ... 0.0 0.0 0.0]
 ...
 ['somogyim.hu' 11.0 1.0 ... 0.0 0.0 0.0]
 ['www.eecg.toronto.edu/icpp2002/' 30.0 3.0 ... 0.0 0.0 0.0]
 ['gacraze.net' 11.0 1.0 ... 0.0 0.0 0.0]]


In [None]:
# Show the held-out test feature rows.
print(x_test)

[['http://paypal-support-help-uk.com/' 34.0 1.0 ... 0.0 0.0 0.0]
 ['imdb.com/name/nm0232574/' 24.0 1.0 ... 0.0 0.0 0.0]
 ['digital.lib.csus.edu/curr/' 26.0 3.0 ... 0.0 0.0 0.0]
 ...
 ['123people.co.uk/s/charles+wilson' 32.0 2.0 ... 0.0 0.0 0.0]
 ['24ora.com' 9.0 1.0 ... 0.0 0.0 0.0]
 ['winwinbot.com' 13.0 1.0 ... 0.0 0.0 0.0]]


In [None]:
# Show the training labels.
print(y_train)

[0 1 0 0 0 1 1 0 1 0 1 1 0 1 0 0 1 1 0 1 0 0 1 1 0 0 0 0 1 0 0 1 1 1 0 0 1
 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 0 1 1 1 0 1 1 0 1 0 1 1 0 1 1 1 1 0 1 0 0 0
 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 1 1 0 0 1 0 1 0 0 1 1 1 1 1 1 1 0 0 0 1
 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 1 1 0 1 0 0 1 0 1 1 1 0 1 1 1
 1 0 0 1 0 0 0 0 0 1 0 1 1 0 1 1 1 1 1 0 1 1 0 1 1 0 0 0 0 0 0 1 1 1 1 1 1
 0 0 1 0 1 1 0 0 1 1 1 0 1 0 0 0 1 1 0 1 0 1 1 0 1 0 0 1 1 1 0 0 0 0 0 0 0
 0 1 1 0 0 1 1 1 1 0 0 1 0 0 1 0 0 0 1 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 1 1 1 0 0 0 0 0 1 1 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1 1 1 0 1 1 0
 1 1 1 1 1 0 0 1 1 0 1 1 1 0 1 0 0 1 1 1 0 1 0 0 0 0 1 0 0 1 1 1 1 1 1 1 0
 1 0 0 0 0 0 0 0 1 1 1 1 0 1 1 0 1 0 1 0 0 0 0 0 1 0 0 0 1 1 0 1 0 0 1 0 1
 0 0 1 0 0 0 0 1 0 1 1 1 1 0 1 0 1 0 0 1 1 1 1 0 1 0 1 0 1 1 1 0 1 1 1 0 0
 1 0 1 1 1 0 0 0 1 1 0 1 1 0 1 0 0 0 1 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1 0 0 1
 1 1 0 1 0 1 0 0 1 1 1 1 0 0 1 1 1 1 0 1 0 1 1 0 1 0 1 0 1 1 1 1 1 0 1 1 1
 1 1 1 1 1 0 1 1 1 1 0 1 

In [None]:
# Show the held-out test labels.
print(y_test)

[1 1 1 1 0 0 1 0 0 0 0 0 1 1 1 0 0 0 1 1 0 0 0 1 1 1 1 0 1 1 0 0 1 1 0 0 0
 1 1 0 0 0 1 1 0 1 1 1 1 1 1 1 0 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0 0 0 0 1 1 1
 0 0 0 0 1 1 1 0 1 0 0 1 0 1 1 1 0 0 1 1 0 0 1 0 0 0 1 0 0 1 0 1 1 0 1 1 0
 1 0 0 1 1 0 0 0 0 1 0 0 1 1 1 0 0 0 0 1 1 0 0 0 1 0 1 1 1 0 0 1 1 0 0 1 0
 0 0 1 0 0 1 0 1 1 0 1 0 0 0 0 1 1 1 1 0 0 1 0 1 0 0 1 0 1 1 0 1 0 0 1 0 1
 1 1 1 0 1 1 0 0 0 1 1 0 1 0 0]


# Feature scaling of the following columns (using standardization)

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Fit the scaler on the training rows only and overwrite the numeric columns
# (1-11) of x_train in place; column 0 (the URL string) stays in x_train.
x_train[:,1:12] = sc.fit_transform(x_train[:,1:12])

# Scale the test rows with the statistics learned from the training rows.
# NOTE(review): unlike x_train, x_test is rebound to only the 11 scaled
# columns (the URL column is dropped), so the two arrays end up with different
# widths, and re-running this cell would slice the already-reduced x_test
# again — confirm downstream cells expect this.
x_test = sc.transform(x_test[:,1:12])

In [None]:
# Show the scaled test features (numeric columns only at this point).
print(x_test)

[[-0.10042535 -0.65383836  0.77502055 ... -0.06135239 -0.07088812
   0.        ]
 [-0.36795874 -0.65383836  0.77502055 ... -0.06135239 -0.07088812
   0.        ]
 [-0.31445206  1.48989396  0.26970968 ... -0.06135239 -0.07088812
   0.        ]
 ...
 [-0.15393203  0.4180278   0.26970968 ... -0.06135239 -0.07088812
   0.        ]
 [-0.76925883 -0.65383836 -0.74091207 ... -0.06135239 -0.07088812
   0.        ]
 [-0.66224547 -0.65383836 -0.74091207 ... -0.06135239 -0.07088812
   0.        ]]


In [None]:
# Show the training rows after scaling (URL column followed by the scaled
# numeric columns).
print(x_train)

[['quick.com.br' -0.6889988142397997 0.4180278012848637 ...
  -0.06135238734453008 -0.07088812050083386 0.0]
 ['recovery-help-centre20183101.000webhostapp.com' 0.22061472492064074
  0.4180278012848637 ... -0.06135238734453008 -0.07088812050083386 0.0]
 ['godubai.com' -0.7157521536268715 -0.6538383558558127 ...
  -0.06135238734453008 -0.07088812050083386 0.0]
 ...
 ['somogyim.hu' -0.7157521536268715 -0.6538383558558127 ...
  -0.06135238734453008 -0.07088812050083386 0.0]
 ['www.eecg.toronto.edu/icpp2002/' -0.2074387052725077 1.4898939584255402
  ... -0.06135238734453008 -0.07088812050083386 0.0]
 ['gacraze.net' -0.7157521536268715 -0.6538383558558127 ...
  -0.06135238734453008 -0.07088812050083386 0.0]]
