<a href="https://colab.research.google.com/github/saugabriele/Machine_Learning_Project/blob/main/Machine_Learning_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**Malicious Url Detector Project**
To develop a classifier for detecting malicious URLs, the project started with a dataset containing a few URLs. In this case I tried to create a multiclass classifier that can classify a URL like:
- **Benign**: These are safe to browse URLs.
- **Phishing**: By creating phishing URLs, hackers try to steal sensitive personal or financial information such as login credentials
- **Malware**:These type of URLs inject malware into the victim’s system once he/she visit such URLs.
- **Defacement**: Defacement URLs are generally created by hackers with the intention of breaking into a web server and replacing the hosted website with one of their own

In [20]:
import pandas as pd
import numpy as np
import re
from urllib.parse import urlparse

In [21]:
#Create a DataFrame from the dataset csv file
df = pd.read_csv("/content/drive/MyDrive/malicious_phish1.csv")
df.head()

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [22]:
#Convert the DataFrame in numpy arrays
url_array = df['url'].to_numpy()
type_array = df['type'].to_numpy()

##**Features Extraction**
To determine the class label of an URL, the first step was selecting the features that can be extracted from an URL. In this case i decided to extract some lexical features:
- These refer to statistical features extracted from the literal URL string. For example, length of the URL string, number of digits, number of parameters in its query part, if the URL is encoded, etc.

In [4]:
def www_count(url):
  return url.count('www')

wwwCount = np.vectorize(www_count)

result = wwwCount(url_array)
print(result)
print(result.shape[0])
print(np.unique(result))
x = np.zeros(shape = (result.shape[0], 1))
x[:, 0] = result
print(np.shape(x))
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

xtr = scaler.fit_transform(x)
print(np.shape(xtr))
print(np.unique(xtr))

[0 0 0 ... 1 0 1]
651191
[0 1 2 3 4 5]
(651191, 1)
(651191, 1)
[0.  0.2 0.4 0.6 0.8 1. ]


In [5]:
def at_count(url):
  return url.count('@')

atCount = np.vectorize(at_count)

result = atCount(url_array)
print(result)
print(result.shape[0])
print(np.unique(result))

[0 0 0 ... 0 0 0]
651191
[ 0  1  2  3  4  6 10]


In [6]:
def url_lenght(url):
  return len(url)

urlLength = np.vectorize(url_lenght)
result = urlLength(url_array)
print(result)
print(result.shape[0])

[16 35 31 ... 42 45 41]
651191


In [7]:
def url_path_len(url):
  url_parse = urlparse(url)
  return len(url_parse.path)

urlPathLength = np.vectorize(url_path_len)
result_1 = urlPathLength(url_array)
print(result_1)
print(result_1.shape[0])

[16 35 31 ... 42 45 41]
651191


In [8]:
def host_len(url):
  url_parse = urlparse(url)
  return len(url_parse.netloc)

hostLength = np.vectorize(host_len)
result = hostLength(url_array)
print(result)
print(result.shape[0])

[0 0 0 ... 0 0 0]
651191


In [9]:
def url_host_is_ip(url):
  url = urlparse(url)
  reg = r"^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$"
  if re.match(reg, url.netloc):
      return 1
  else:
      return 0

#print(url_host_is_ip('http://paypal.com.cgi-bin-websc5.b4d80a13c0a2116480.ee0r-cmd-login-submit-dispatch-'))
#print(url_host_is_ip("http://95.154.196.187/broser/6716804bc5a91f707a34479012dad47c/"))

hostIsIp = np.vectorize(url_host_is_ip)
result = hostIsIp(url_array)
print(result)
print(result.shape[0])

[0 0 0 ... 0 0 0]
651191


In [10]:
def number_of_digits(url):
  digits = [i for i in url if i.isdigit()]
  return len(digits)

numberDigits = np.vectorize(number_of_digits)
result_1 = numberDigits(url_array)
print(result_1)
print(result_1.shape[0])

[0 1 1 ... 3 0 0]
651191


In [11]:
def number_of_parameters(url):
  params = urlparse(url).query
  return 0 if params == '' else len(params.split('&'))

numberParams = np.vectorize(number_of_parameters)
result = numberParams(url_array)
print(result)
print(result.shape[0])

[0 0 0 ... 0 0 0]
651191


In [12]:
def number_of_subdirectories(url):
  d = urlparse(url).path.split('/')
  return len(d) - 1

numberSubDirectories = np.vectorize(number_of_subdirectories)
result = numberSubDirectories(url_array)
print(result)
print(result.shape[0])
#print(np.unique(result))
#print(np.where(result == 10))
#print(urlparse(url_array[608]).path)

[0 2 3 ... 4 2 3]
651191


In [13]:
def number_of_periods(url):
  return url.count('.')

numberPeriods = np.vectorize(number_of_periods)
result = numberPeriods(url_array)
print(result)
print(result.shape[0])
print(np.unique(result))

[2 2 2 ... 2 2 2]
651191
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 31 34 35 36 37 42]


In [14]:
def num_encoded_char(url):
  return url.count('%')

numberEncodedChar = np.vectorize(num_encoded_char)
result = numberEncodedChar(url_array)
print(result)
print(result.shape[0])
print(np.unique(result))

[0 0 0 ... 0 0 0]
651191
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  74  75  76  77  78  80  82  83  84  86  87  88  89  90  92  93  94
  95  96  98 100 101 102 104 106 107 108 110 112 113 114 116 118 119 120
 122 126 128 130 132 133 134 142 144 146 147 149 150 198 201 231]


In [15]:
def equal_count(url):
    return url.count('=')

numberEquals = np.vectorize(equal_count)
result = numberEquals(url_array)
print(result)
print(result.shape[0])
print(np.unique(result))

[0 0 0 ... 0 0 0]
651191
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 24
 26 28 29 34 39 51]


In [16]:
def count_http(url):
    return url.count('http')

numberHttp = np.vectorize(count_http)
result = numberHttp(url_array)
print(result)
print(result.shape[0])
print(np.unique(result))

[0 0 0 ... 0 0 0]
651191
[0 1 2 3 4 5 6 9]


In [25]:
def count_https(url):
    return url.count('https')

numberHttps = np.vectorize(count_https)
result = numberHttps(url_array)
print(result)
print(result.shape[0])
print(np.unique(result))

function = []
function.append(np.vectorize(count_https))
result_10 = function[0](url_array)
print(result_10)
print(result_10.shape[0])
print(np.unique(result == result_10))

[0 0 0 ... 0 0 0]
651191
[0 1 2 3 4 5]
[0 0 0 ... 0 0 0]
651191
[ True]


In [28]:
def www_count(url):
  """
    From the input URL this will return the number of 
    occurrences of the substring 'www'.
  """
  return url.count('www')

def at_count(url):
  """
    From the input URL this will return the number of 
    occurrences of the substring '@'.
  """
  return url.count('@')

def url_lenght(url):
  """
    From the input URL this will return the
    corresponding length
  """
  return len(url)

def url_path_len(url):
  """
    From the input URL this will return the length
    of the path where the resource is located.
  """
  url_parse = urlparse(url)
  return len(url_parse.path)

def host_len(url):
  """
     From the input URL this will return the length
     of the hostname.
  """
  url_parse = urlparse(url)
  return len(url_parse.netloc)

def url_host_is_ip(url):
  """
    From the input URL this will return 1 if the hostname
    contains the IP adress otherwise it returns 0 if it
    contains the domain name where the resource is located.
  """
  url = urlparse(url)
  reg = r"^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$"
  if re.match(reg, url.netloc):
      return 1
  else:
      return 0

def number_of_digits(url):
  """
    From the input URL this will return the number of 
    digits in the URL.
  """
  digits = [i for i in url if i.isdigit()]
  return len(digits)

def number_of_parameters(url):
  """
    From the input URL this will return the number of
    parameters contained in the URL.
  """
  params = urlparse(url).query
  return 0 if params == '' else len(params.split('&'))

def number_of_subdirectories(url):
  """
    From the input URL this will return the number of
    subdirectories in the path of the URL.
  """
  d = urlparse(url).path.split('/')
  return len(d) - 1

def number_of_periods(url):
  """
    From the input URL this will return the number of
    periods in the URL.
  """
  return url.count('.')

def num_encoded_char(url):
  """
    From the input URL this will return the number of
    encoded characters in the URL.
  """
  return url.count('%')

def equal_count(url):
  """
    From the input URL this will return the number of
    variable values passed from one form page to another.
  """
  return url.count('=')

def count_http(url):
  """
    From the input URL this will return the number of 
    occurrences of the substring 'http'.
  """
  return url.count('http')

def count_https(url):
  """
    From the input URL this will return the number of 
    occurrences of the substring 'https'.
  """
  return url.count('https')

features_extraction_functions = []
features_extraction_functions.append(np.vectorize(www_count))
features_extraction_functions.append(np.vectorize(at_count))
features_extraction_functions.append(np.vectorize(url_lenght))
features_extraction_functions.append(np.vectorize(url_path_len))
features_extraction_functions.append(np.vectorize(host_len))
features_extraction_functions.append(np.vectorize(url_host_is_ip))
features_extraction_functions.append(np.vectorize(number_of_digits))
features_extraction_functions.append(np.vectorize(number_of_parameters))
features_extraction_functions.append(np.vectorize(number_of_subdirectories))
features_extraction_functions.append(np.vectorize(number_of_periods))
features_extraction_functions.append(np.vectorize(num_encoded_char))
features_extraction_functions.append(np.vectorize(equal_count))
features_extraction_functions.append(np.vectorize(count_http))
features_extraction_functions.append(np.vectorize(count_https))