In [1]:
pip install -r requirements.txt

Collecting python-whois
  Downloading python-whois-0.7.3.tar.gz (91 kB)
[K     |████████████████████████████████| 91 kB 4.1 MB/s 
[?25hCollecting pyquery
  Downloading pyquery-1.4.3-py3-none-any.whl (22 kB)
Collecting shodan
  Downloading shodan-1.27.0.tar.gz (52 kB)
[K     |████████████████████████████████| 52 kB 1.2 MB/s 
[?25hCollecting waybackpy
  Downloading waybackpy-3.0.6-py3-none-any.whl (34 kB)
Collecting cssselect>0.7.9
  Downloading cssselect-1.1.0-py2.py3-none-any.whl (16 kB)
Collecting click-plugins
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Collecting colorama
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Collecting XlsxWriter
  Downloading XlsxWriter-3.0.3-py3-none-any.whl (149 kB)
[K     |████████████████████████████████| 149 kB 31.4 MB/s 
Building wheels for collected packages: python-whois, shodan
  Building wheel for python-whois (setup.py) ... [?25l[?25hdone
  Created wheel for python-whois: filename=python_whois-0.7.3-py3-non

In [1]:
import pandas as pd

import Content_Based_Feature_Functions as cbf
import Host_Based_Feature_Functions as hbf
import Lexical_Based_Feature_Functions as lbf

from google.colab import files

In [2]:
# Read the labelled URL list from data.csv in the working directory.
# (Append `.iloc[0:100, :]` to the call below to work on a small sample.)
dataset = pd.read_csv(filepath_or_buffer='data.csv')
dataset


Unnamed: 0,url,label
0,diaryofagameaddict.com,bad
1,espdesign.com.au,bad
2,iamagameaddict.com,bad
3,kalantzis.net,bad
4,slightlyoffcenter.net,bad
...,...,...
420459,23.227.196.215/,bad
420460,apple-checker.org/,bad
420461,apple-iclods.org/,bad
420462,apple-uptoday.org/,bad


In [3]:
def apply_http(url):
    """Return ``url`` trimmed of surrounding whitespace and carrying a scheme.

    A URL that already begins with ``http://`` or ``https://`` (matched
    case-insensitively) is returned unchanged apart from the trimming; any
    other value gets ``http://`` prepended.

    Parameters
    ----------
    url : str
        Raw URL or bare host/path.

    Returns
    -------
    str
        The trimmed URL, guaranteed to start with an HTTP(S) scheme.
    """
    url = url.strip()
    # Match the full scheme separator: a plain 'http' prefix test would treat
    # hosts such as 'httpbin.org' as if they already had a scheme.
    if url.lower().startswith(('http://', 'https://')):
        return url
    return 'http://' + url
    
# Make sure every URL carries a scheme before any feature extraction runs.
dataset['url'] = dataset['url'].apply(apply_http)

# Encode the labels numerically: 'good' -> 0, 'bad' -> 1.
# (To keep only benign rows, filter first:
#  dataset = dataset[dataset['label'] == 'good'])
dataset = (
    dataset
    .replace(to_replace='good', value=0)
    .replace(to_replace='bad', value=1)
)

dataset

Unnamed: 0,url,label
0,http://diaryofagameaddict.com,1
1,http://espdesign.com.au,1
2,http://iamagameaddict.com,1
3,http://kalantzis.net,1
4,http://slightlyoffcenter.net,1
...,...,...
420459,http://23.227.196.215/,1
420460,http://apple-checker.org/,1
420461,http://apple-iclods.org/,1
420462,http://apple-uptoday.org/,1


In [4]:
# *** Content Based Features ***
def content_based_feature_extraction(dataset):
  """Add the content-based feature columns to ``dataset`` in place.

  The 'html' and 'ip' columns are computed from each row's 'url' value via
  ``cbf.get_html_page`` and ``cbf.get_ip``; every other column is computed
  by applying a ``cbf`` function to the 'html' value.

  Parameters
  ----------
  dataset : pandas.DataFrame
      Frame with a 'url' column. It is modified in place (26 columns added).

  Returns
  -------
  pandas.DataFrame
      The same ``dataset`` object, returned for consistency with the
      host-based and lexical extractors (this function used to return None).
  """
  # These two are derived from the URL; everything below is derived from 'html'.
  dataset['html'] = dataset['url'].apply(cbf.get_html_page)
  dataset['ip'] = dataset['url'].apply(cbf.get_ip)

  # (output column, cbf function applied to the 'html' value), in output order.
  html_features = (
    ('page_entropy', cbf.get_entropy),
    ('pyquery', cbf.get_pyquery),
    ('number_of_script_tags', cbf.number_of_script_tags),
    ('script_to_body_ratio', cbf.script_to_body_ratio),
    ('length_of_html', cbf.length_of_html),
    ('number_of_tokens', cbf.number_of_page_tokens),
    ('number_of_sentences', cbf.number_of_sentences),
    ('number_of_punctuations', cbf.number_of_punctuations),
    ('number_of_capitalizations', cbf.number_of_capitalizations),
    ('average_number_of_tokens_in_sentence', cbf.average_number_of_tokens_in_sentence),
    ('number_of_html_tags', cbf.number_of_html_tags),
    ('number_of_hidden_tags', cbf.number_of_hidden_tags),
    ('number_of_iframes', cbf.number_of_iframes),
    ('number_of_objects', cbf.number_of_objects),
    ('number_of_embeds', cbf.number_of_embeds),
    ('number_of_hyperlinks', cbf.number_of_hyperlinks),
    ('number_of_whitespaces', cbf.number_of_whitespaces),
    ('number_of_included_elements', cbf.number_of_included_elements),
    ('number_of_suspicious_elements', cbf.number_of_suspicious_elements),
    ('number_of_double_documents', cbf.number_of_double_documents),
    ('number_of_eval_functions', cbf.number_of_eval_functions),
    ('average_script_length', cbf.average_script_length),
    ('average_script_entropy', cbf.average_script_entropy),
    ('number_of_suspicious_functions', cbf.number_of_suspicious_functions),
  )
  for column, extractor in html_features:
    dataset[column] = dataset['html'].apply(extractor)

  return dataset
# display(dataset)

In [5]:
# *** Host Based Features ***
def host_based_feature_extraction(dataset):
  """Add the host-based feature columns to ``dataset`` in place and return it.

  Each new column is produced by applying one ``hbf`` function to every value
  of the 'url' column. Columns are added in the order listed below.
  """
  # (output column, hbf function applied to the URL), in output order.
  url_features = [
    ('num_subdomains', hbf.number_of_subdomains),
    ('creation_date', hbf.url_creation_date),
    ('expiration_date', hbf.url_expiration_date),
    ('recent_update', hbf.url_last_updated),
    ('age', hbf.url_age),
    ('lifespan', hbf.url_intended_life_span),
    ('life_remaining', hbf.url_life_remaining),
    ('registrar', hbf.url_registrar),
    ('registration_country', hbf.url_registration_country),
    ('host_country', hbf.url_host_country),
    ('open_ports', hbf.url_open_ports),
    ('num_open_ports', hbf.url_num_open_ports),
    # disabled: 'is_live' <- int(hbf.url_is_live(url))
    ('isp', hbf.url_isp),
    # disabled: 'connection_speed' <- hbf.url_connection_speed(url)
    ('first_seen', hbf.first_seen),
    ('last_seen', hbf.last_seen),
    ('days_since_last_seen', hbf.days_since_last_seen),
    ('days_since_first_seen', hbf.days_since_first_seen),
    # disabled: 'avg_update_frequency' <- hbf.average_update_frequency(url)
    # disabled: 'num_updates' <- hbf.number_of_updates(url)
    ('ttl', hbf.ttl_from_registration),
  ]
  for column, extractor in url_features:
    dataset[column] = dataset['url'].apply(extractor)
  return dataset

# display(dataset)
# display(dataset[dataset['registration_country'].isnull()])

In [6]:
# *** Lexical Based Features ***
def lexical_based_feature_extraction(dataset):
  """Add the lexical feature columns to ``dataset`` in place and return it.

  Each new column is produced by applying one ``lbf`` function to every value
  of the 'url' column. Results of the flag-style functions are passed through
  ``int`` before being stored.
  """
  def as_int(func):
    # Wrap ``func`` so that its result is coerced with int().
    return lambda value: int(func(value))

  # (output column, callable applied to the URL), in output order.
  url_features = [
    ('url_string_entropy', lbf.url_string_entropy),
    ('url_scheme', lbf.url_scheme),  # string-valued
    ('url_length', lbf.url_length),
    ('url_path_length', lbf.url_path_length),
    ('url_host_length', lbf.url_host_length),
    ('url_has_port_in_string', as_int(lbf.url_has_port_in_string)),
    ('number_of_digits', lbf.number_of_digits),
    ('number_of_parameters', lbf.number_of_parameters),
    ('number_of_fragments', lbf.number_of_fragments),
    ('is_encoded', as_int(lbf.is_encoded)),
    ('num_encoded_char', lbf.num_encoded_char),
    ('number_of_subdirectories', lbf.number_of_subdirectories),
    ('number_of_periods', lbf.number_of_periods),
    ('has_client_in_string', as_int(lbf.has_client_in_string)),
    ('has_admin_in_string', as_int(lbf.has_admin_in_string)),
    ('has_server_in_string', as_int(lbf.has_server_in_string)),
    ('has_login_in_string', as_int(lbf.has_login_in_string)),
    ('get_tld', lbf.get_tld),  # string-valued
  ]
  for column, extractor in url_features:
    dataset[column] = dataset['url'].apply(extractor)
  return dataset
  
#Final Dataset:
# display(dataset)

In [None]:
# Extract features in fixed-size batches; each batch is written to its own CSV
# and handed to the Colab download helper as soon as it is finished.
no_of_rows = 500
total_rows = dataset.shape[0]
# Ceiling division. The previous `int(total / no_of_rows) + 1` counted one
# extra (empty) batch whenever the row count was an exact multiple of the
# batch size.
no_of_batches = (total_rows + no_of_rows - 1) // no_of_rows
print(no_of_batches)

# Upper bound on the number of batches processed by one run of this cell.
max_batches = 200

# Never iterate past the last real batch: with a small dataset the fixed bound
# alone would keep producing (and downloading) empty CSV files.
for i in range(0, min(max_batches, no_of_batches)):
    start = no_of_rows*i
    # iloc slices are clamped at the end of the frame, so the final, shorter
    # batch needs no special case. reset_index() keeps the original row label
    # as an 'index' column.
    df = dataset.iloc[start:start+no_of_rows, :].reset_index()

    # Each extractor adds its columns to df in place.
    content_based_feature_extraction(df)
    host_based_feature_extraction(df)
    lexical_based_feature_extraction(df)

    # Drop intermediate columns that are not written to the output file.
    df = df.drop(['html', 'ip', 'pyquery', 'first_seen', 'last_seen'], axis=1)

    out_name = 'output-2-'+str(i)+'.csv'
    df.to_csv(out_name, encoding = 'utf-8-sig')
    files.download(out_name)


690
http://missouririverexp.com/
http://missouririverfutures.org/
http://missouririverproperties.com/
http://missouririverringnecks.net/
http://missouririversoap.blogspot.com/
http://missouririverwinetrail.com/
http://missouririverwinetrail.ticketleap.com/
http://missouriroute66.blogspot.com/
http://missouriroute66.blogspot.com/p/joplin.html
http://missourisportsmag.com/?p=10469
http://missourisportsmag.com/?p=10853
http://missourisportsmag.com/?p=14421
http://missourisportsmag.com/?p=1687
http://missourisportsmag.com/?p=18153
http://missourisportsmag.com/?p=21296
http://missourisportsmag.com/?p=21944
http://missourisportsmag.com/?p=23723
http://missourisportsmag.com/?p=23901
http://missourisportsmag.com/?p=23951
http://missourisportsmag.com/?p=28582
http://missourisportsmag.com/?p=29356
http://missourisportsmag.com/?p=30973
http://missourisportsmag.com/?p=31658
http://missourisportsmag.com/?p=5234
http://missourisportsmag.com/?p=819
http://missourisportsmag.com/?p=8440
http://missouri