This notebook was run on a Google Colab TPU runtime. Here we extract features for 10,000 URLs, of which 5,000 are malicious and 5,000 are benign. (The saved run below loads the shortened "(short)" CSV, which contains 501 rows.)

In [None]:
# Mount Google Drive at /content/drive; the dataset CSV loaded later in this
# notebook is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install python-whois

Collecting python-whois
  Downloading python-whois-0.7.3.tar.gz (91 kB)
[K     |████████████████████████████████| 91 kB 3.6 MB/s 
Building wheels for collected packages: python-whois
  Building wheel for python-whois (setup.py) ... [?25l[?25hdone
  Created wheel for python-whois: filename=python_whois-0.7.3-py3-none-any.whl size=87720 sha256=9df555fd470f67fd0f29c1e7a74f300018e56bc4cd3306f472a6ed02e7e6671c
  Stored in directory: /root/.cache/pip/wheels/11/05/f7/895ce5a73665f77c8274a7d55e34fb3e6b4abbb9a7637e215b
Successfully built python-whois
Installing collected packages: python-whois
Successfully installed python-whois-0.7.3


In [None]:
!pip install tldextract



In [None]:
import numpy as np
import pandas as pd
import whois
import datetime
from os.path import splitext
import tldextract
from urllib.parse import urlparse

In [None]:
# NOTE(review): Drive is already mounted by the first cell of this notebook, so
# this second mount is redundant (the saved output reports "Drive already
# mounted"). `os` is not referenced in the cells visible here.
from google.colab import drive
import os
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Load the labelled URL dataset (columns: url, label).
# NOTE: this is the "(short)" CSV; the saved `dataset.shape` output further down
# reports 501 rows, not the 5000 phishing + 5000 legitimate URLs of the full set.
dataset = pd.read_csv("/content/drive/MyDrive/benign-phishing-url-classification-using-whois-and-lexical-features-master/Data/CleanedDataset/merged_whois_verified_urls(short).csv")

In [None]:
dataset.head()

Unnamed: 0,url,label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,1
1,serviciosbys.com/paypal.cgi.bin.get-into.herf....,1
2,mail.printakid.com/www.online.americanexpress....,1
3,thewhiskeydregs.com/wp-content/themes/widescre...,1
4,smilesvoegol.servebbs.org/voegol.php,1


url: phishing/legitimate URLs

label: 1 indicates url is Malicious, 0 indicates url is benign

In [None]:
dataset.shape  # (rows, columns); the saved run on the "(short)" file shows (501, 2)

(501, 2)

In [None]:
#function to perform whois on given url
def perform_whois(url):
    """Run a WHOIS lookup for ``url``.

    Returns whatever ``whois.whois`` returns on success, or ``False`` when the
    lookup raises any ``Exception`` (best-effort: failures are not re-raised).
    """
    try:
        return whois.whois(url)
    except Exception:
        return False

Extracting whois features from URLs

In [None]:
#function to fetch the website age in days using URL created_date
def get_registered_date_in_days(whois_result):
    """Return the domain age in whole days, based on the WHOIS creation date.

    Parameters
    ----------
    whois_result : object or False
        Result of ``perform_whois``: an object exposing ``creation_date``, or
        ``False`` when the lookup failed.

    Returns
    -------
    int
        ``(now - creation_date).days``, or ``-1`` when no usable creation date
        is available (failed lookup, missing value, string value, empty list,
        or a list without any datetime in it).
    """
    # A failed lookup (False) has no such attribute and falls through to -1.
    created_date = getattr(whois_result, 'creation_date', None)

    if isinstance(created_date, (list, tuple)):
        # Several timestamps may be reported; use the first real datetime so an
        # empty list or a leading string cannot crash the extraction loop.
        created_date = next(
            (entry for entry in created_date if isinstance(entry, datetime.datetime)),
            None,
        )

    if not isinstance(created_date, datetime.datetime):
        return -1

    # Match "now" to the timestamp's awareness: naive stays naive (tzinfo is
    # None), aware gets an aware "now", so the subtraction never raises.
    today_date = datetime.datetime.now(created_date.tzinfo)
    return (today_date - created_date).days

In [None]:
#function to fetch the website expiry date in days using URL expiration_date
def get_expiration_date_in_days(whois_result):
    """Return the whole days remaining until the WHOIS expiration date.

    Parameters
    ----------
    whois_result : object or False
        Result of ``perform_whois``: an object exposing ``expiration_date``, or
        ``False`` when the lookup failed.

    Returns
    -------
    int
        ``(expiration_date - now).days``; negative when the domain has already
        expired. ``-1`` is returned when no usable expiration date is available
        (failed lookup, missing value, string value, empty list, or a list
        without any datetime in it). Note that ``-1`` can therefore also be a
        genuine value for a domain that expired within the last day.
    """
    # A failed lookup (False) has no such attribute and falls through to -1.
    expiration_date = getattr(whois_result, 'expiration_date', None)

    if isinstance(expiration_date, (list, tuple)):
        # Several timestamps may be reported; use the first real datetime so an
        # empty list or a leading string cannot crash the extraction loop.
        expiration_date = next(
            (entry for entry in expiration_date if isinstance(entry, datetime.datetime)),
            None,
        )

    if not isinstance(expiration_date, datetime.datetime):
        return -1

    # Match "now" to the timestamp's awareness: naive stays naive (tzinfo is
    # None), aware gets an aware "now", so the subtraction never raises.
    today_date = datetime.datetime.now(expiration_date.tzinfo)
    return (expiration_date - today_date).days

In [None]:
#function to fetch the website's last updated date in days using URL updated_date
def get_updated_date_in_days(whois_result):
    """Return the whole days elapsed since the WHOIS record was last updated.

    Parameters
    ----------
    whois_result : object or False
        Result of ``perform_whois``: an object exposing ``updated_date``, or
        ``False`` when the lookup failed.

    Returns
    -------
    int
        ``(now - updated_date).days``, or ``-1`` when no usable update date is
        available (failed lookup, missing value, string value, empty list, or a
        list without any datetime in it).
    """
    # A failed lookup (False) has no such attribute and falls through to -1.
    updated_date = getattr(whois_result, 'updated_date', None)

    if isinstance(updated_date, (list, tuple)):
        # Several timestamps may be reported; use the first real datetime so an
        # empty list or a leading string cannot crash the extraction loop.
        updated_date = next(
            (entry for entry in updated_date if isinstance(entry, datetime.datetime)),
            None,
        )

    if not isinstance(updated_date, datetime.datetime):
        return -1

    # Match "now" to the timestamp's awareness: naive stays naive (tzinfo is
    # None), aware gets an aware "now", so the subtraction never raises.
    today_date = datetime.datetime.now(updated_date.tzinfo)
    return (today_date - updated_date).days

In [None]:
#dataset['url'][1601]

In [None]:
#perform_whois(dataset['url'][1])

In [None]:
# Disabled DNS-resolver experiment: the helper is wrapped in a string literal, so
# nothing is defined or executed here (the cell just echoes the string).
# NOTE(review): `dns` is not imported in the cells visible in this notebook.
"""def perform_dnsresolver(url):
    result = dns.resolver.query(url, 'A')
    print(type(result))
    for ipval in result:
        print('IP', ipval.to_text())"""

"def perform_dnsresolver(url):\n    result = dns.resolver.query(url, 'A')\n    print(type(result))\n    for ipval in result:\n        print('IP', ipval.to_text())"

In [None]:
"""perform_dnsresolver('google.com')"""

"perform_dnsresolver('google.com')"

In [None]:
"""sample =[]
def extract_all_features():
    for url in dataset['url']:
        sample.append(get_registered_date_in_days(url))
        #whois_resultset = get_whois_results(url)"""

"sample =[]\ndef extract_all_features():\n    for url in dataset['url']:\n        sample.append(get_registered_date_in_days(url))\n        #whois_resultset = get_whois_results(url)"

Extracting lexical features from URLs

In [None]:
def get_dot_count(url):
    """Return how many '.' characters appear in ``url``."""
    dots = url.count('.')
    return dots

In [None]:
def get_url_length(url):
    """Return the total number of characters in ``url``."""
    length = len(url)
    return length

In [None]:
def get_digit_count(url):
    """Return how many characters of ``url`` satisfy ``str.isdigit``."""
    digits = [ch for ch in url if ch.isdigit()]
    return len(digits)

In [None]:
def get_special_char_count(url):
    """Return how many special characters appear in ``url``.

    The counted characters are ``; + _ ? = & [ ]``.

    The earlier list contained the two-character token ``'+='``, which could
    never match because the URL is scanned one character at a time; as a
    result ``+`` was silently never counted. ``+`` and ``=`` are now counted
    as separate single characters.
    """
    # One entry per single character, since membership is tested per character.
    special_characters = frozenset(';+_?=&[]')
    return sum(1 for each_letter in url if each_letter in special_characters)

In [None]:
def get_hyphen_count(url):
    """Return how many '-' characters appear in ``url``."""
    hyphens = url.count('-')
    return hyphens

In [None]:
def get_double_slash(url):
    """Return the number of non-overlapping '//' occurrences in ``url``."""
    double_slashes = url.count('//')
    return double_slashes

In [None]:
def get_single_slash(url):
    """Return how many '/' characters appear in ``url`` (each slash of '//' counts)."""
    slashes = url.count('/')
    return slashes

In [None]:
def get_at_the_rate(url):
    """Return how many '@' characters appear in ``url``."""
    at_signs = url.count('@')
    return at_signs

In [None]:
def get_protocol(url):
    """Return 1 when the parsed scheme of ``url`` is exactly 'http', else 0.

    Any other scheme (including 'https') and URLs without a scheme yield 0.
    """
    scheme = urlparse(url).scheme
    return 1 if scheme == 'http' else 0

In [None]:
def get_protocol_count(url):
    """Return how many times 'http' or 'https' occurs in ``url``.

    Every 'https' also contains 'http', so the plain-http tally is the raw
    'http' count minus the 'https' count; the two tallies are then summed.
    """
    secure_hits = url.count('https')
    plain_hits = url.count('http') - secure_hits
    return plain_hits + secure_hits

In [None]:
# Per-URL feature columns; extract_all_features() appends one value per URL to each.
# WHOIS-derived features.
registered_date_in_days, expiration_date_in_days, updated_date_in_days = [], [], []
# Lexical character / substring counts.
dotCount, urlLength, digitCount = [], [], []
specialCharCount, hyphenCount = [], []
doubleSlashCount, singleSlashCount, atTheRateCount = [], [], []
# Scheme-related features.
protocol, protocolCount = [], []

In [None]:
def extract_all_features():
    """Compute WHOIS and lexical features for every URL in ``dataset['url']``.

    One value per URL is appended, in dataset order, to each module-level
    feature list. The lists are emptied first so that re-running the cell
    rebuilds them instead of appending a second copy of every row, which
    would leave the columns out of step with ``dataset``.

    A 1-based progress counter is printed for each URL. One WHOIS lookup is
    performed per URL via ``perform_whois``.
    """
    feature_lists = (
        registered_date_in_days, expiration_date_in_days, updated_date_in_days,
        dotCount, urlLength, digitCount, specialCharCount, hyphenCount,
        doubleSlashCount, singleSlashCount, atTheRateCount, protocol, protocolCount,
    )
    # Reset in place so the existing module-level list objects stay the same.
    for values in feature_lists:
        values.clear()

    for counter, url in enumerate(dataset['url'], start=1):
        print(counter)
        whois_result = perform_whois(url)
        # Extracting whois features from URLs
        registered_date_in_days.append(get_registered_date_in_days(whois_result))
        expiration_date_in_days.append(get_expiration_date_in_days(whois_result))
        updated_date_in_days.append(get_updated_date_in_days(whois_result))
        # Extracting lexical features from URLs
        dotCount.append(get_dot_count(url))
        urlLength.append(get_url_length(url))
        digitCount.append(get_digit_count(url))
        specialCharCount.append(get_special_char_count(url))
        hyphenCount.append(get_hyphen_count(url))
        doubleSlashCount.append(get_double_slash(url))
        singleSlashCount.append(get_single_slash(url))
        atTheRateCount.append(get_at_the_rate(url))
        protocol.append(get_protocol(url))
        protocolCount.append(get_protocol_count(url))

In [None]:
extract_all_features()

1
2
3
4
Error trying to connect to socket: closing socket
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
Error trying to connect to socket: closing socket
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
Error trying to connect to socket: closing socket
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
24

In [None]:
# Sanity check: every feature list should have one entry per dataset row.
_length_report = (
    ('Registered Date list length               : ', registered_date_in_days),
    ('Expiration Date list length               : ', expiration_date_in_days),
    ('Updation Date list length                 : ', updated_date_in_days),
    ('Dot Count list length                     : ', dotCount),
    ('URL Length list length                    : ', urlLength),
    ('Digit Count list length                   : ', digitCount),
    ('Special Character Count list length       : ', specialCharCount),
    ('Hyphen Count list length                  : ', hyphenCount),
    ('Double Slash Count list length            : ', doubleSlashCount),
    ('Single Slash Count list length            : ', singleSlashCount),
    ('At the Rate(@) Count list length          : ', atTheRateCount),
    ('ProtocolName Count list length            : ', protocol),
    ('Protocol Count list length                : ', protocolCount),
)
for _prefix, _values in _length_report:
    print(f'{_prefix}{len(_values)}')

Registered Date list length               : 501
Expiration Date list length               : 501
Updation Date list length                 : 501
Dot Count list length                     : 501
URL Length list length                    : 501
Digit Count list length                   : 501
Special Character Count list length       : 501
Hyphen Count list length                  : 501
Double Slash Count list length            : 501
Single Slash Count list length            : 501
At the Rate(@) Count list length          : 501
ProtocolName Count list length            : 501
Protocol Count list length                : 501


In [None]:
# Assemble the feature table in one step; dict order fixes the column order.
features_df = pd.DataFrame({
    'whois_regDate': registered_date_in_days,
    'whois_expDate': expiration_date_in_days,
    'whois_updatedDate': updated_date_in_days,
    "dot_count": dotCount,
    "url_len": urlLength,
    "digit_count": digitCount,
    "special_count": specialCharCount,
    "hyphen_count": hyphenCount,
    "double_slash": doubleSlashCount,
    "single_slash": singleSlashCount,
    "at_the_rate": atTheRateCount,
    "protocol": protocol,
    "protocol_count": protocolCount,
})

In [None]:
features_df.head()

Unnamed: 0,whois_regDate,whois_expDate,whois_updatedDate,dot_count,url_len,digit_count,special_count,hyphen_count,double_slash,single_slash,at_the_rate,protocol,protocol_count
0,742,-14,2,6,225,58,12,4,0,10,0,0,0
1,3859,158,205,7,177,47,0,1,0,11,0,0,0
2,6563,10,353,6,60,0,0,0,0,2,0,0,0
3,232,132,229,1,116,21,1,1,1,10,0,0,0
4,-1,-1,-1,3,36,0,0,0,0,1,0,0,0


In [None]:
features_df.shape

(501, 13)

In [None]:
# Persist the extracted feature table to Drive; index=False keeps the row index
# out of the CSV so only the feature columns are written.
features_df.to_csv("/content/drive/MyDrive/benign-phishing-url-classification-using-whois-and-lexical-features-master/Data/FeaturesDataset/features(short).csv",index=False)