# Feature Extraction 

Part One - Length Features

1.Importing Dependencies

In [1]:
import numpy as np
import pandas as pd

from urllib.parse import urlparse
from tld import get_tld
import os.path

2.Reading The Dataset

In [2]:
# Load the labelled URL dataset (columns seen in the preview: url, label, result).
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this drive layout exists. Consider a configurable data directory.
urldata = pd.read_csv("A:/MajorProject/Datasets/urlfinalapi.csv")

In [3]:
# Preview the first rows to confirm the CSV loaded with the expected columns.
urldata.head()

Unnamed: 0.1,Unnamed: 0,url,label,result
0,0,https://www.google.com,benign,0
1,1,https://www.youtube.com,benign,0
2,2,https://www.facebook.com,benign,0
3,3,https://www.baidu.com,benign,0
4,4,https://www.wikipedia.org,benign,0


In [5]:
# Remove the "Unnamed: ..." columns that the CSV preview shows alongside
# url/label/result (presumably row indexes saved by earlier to_csv calls).
# - `columns=` is used because pandas 2.0 no longer accepts `axis` as a
#   positional argument to DataFrame.drop.
# - Both unnamed columns are listed so a fresh Restart & Run All ends with the
#   three-column frame that the following info() output reports.
# - errors="ignore" keeps the cell safe to re-run once the columns are gone.
urldata = urldata.drop(columns=["Unnamed: 0.1", "Unnamed: 0"], errors="ignore")

In [6]:
# Summarise column dtypes, non-null counts and memory usage.
urldata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450176 entries, 0 to 450175
Data columns (total 3 columns):
url       450176 non-null object
label     450176 non-null object
result    450176 non-null int64
dtypes: int64(1), object(2)
memory usage: 6.9+ MB


In [7]:
# Look at the first rows again to check the remaining columns.
urldata.head()

Unnamed: 0,url,label,result
0,https://www.google.com,benign,0
1,https://www.youtube.com,benign,0
2,https://www.facebook.com,benign,0
3,https://www.baidu.com,benign,0
4,https://www.wikipedia.org,benign,0


In [9]:
# Inspect the last rows of the frame as well.
urldata.tail()

Unnamed: 0,url,label,result
450202,http://ecct-it.com/docmmmnn/aptgd/index.php,malicious,1
450203,http://faboleena.com/js/infortis/jquery/plugin...,malicious,1
450204,http://faboleena.com/js/infortis/jquery/plugin...,malicious,1
450205,http://atualizapj.com/,malicious,1
450206,http://writeassociate.com/test/Portal/inicio/I...,malicious,1


# Length Features 

1. Length of URL<br>
2. Length of Hostname<br>
3. Length of Path<br>
4. Length of First Directory<br>
5. Length of Top-Level Domain<br>

1.Length of URL

In [5]:
# Character count of the full URL; str() guards against non-string cells.
urldata['url_length'] = urldata['url'].map(lambda url: len(str(url)))

2.Length Of Hostname

In [6]:
#Helper that measures the network-location part of a URL
def hostname_length(url):
    """Return the number of characters in ``urlparse(url).netloc``.

    The whole ``netloc`` component is measured as parsed by
    ``urllib.parse.urlparse``; when the URL has no ``//`` introducer the
    component is empty and 0 is returned.
    """
    netloc = urlparse(url).netloc
    return len(netloc)

#Compute the hostname length for every URL in the dataset
urldata['hostname_length'] = urldata['url'].map(hostname_length)

3.Length Of Path

In [7]:
#Function to extract the path component of the URL (the caller takes its length)
def path_length(url):
    """Return the path component of ``url`` as a string.

    Despite the name, this returns the path itself rather than its length;
    callers apply ``len()`` to the result. A URL with no path yields ``""``.
    """
    parsed = urlparse(url)
    return parsed.path
     

#Compute the path length for every URL in the dataset
#(path_length() returns the path string, so len() is taken here)
urldata['path_length'] = urldata['url'].map(lambda url: len(path_length(url)))

4.Length Of First Directory

In [8]:
#Calculate the length of the first directory in the URL path
def fd_length(url):
    """Return the length of the first directory in the URL's path.

    The path is split on ``/`` and the second piece is measured: for a path
    such as ``/a/b`` the first piece is the empty string before the leading
    slash, so the second piece is the first directory (``a``).

    Returns 0 when the path has no second piece (e.g. the URL has no path).
    """
    segments = urlparse(url).path.split('/')
    # An explicit bounds check replaces the former bare ``except``, which
    # would also have hidden unrelated errors.
    if len(segments) < 2:
        return 0
    return len(segments[1])

#Compute the first-directory length for every URL in the dataset
urldata['fd_length'] = urldata['url'].map(fd_length)

In [9]:
# Show only the first ten rows: displaying the whole frame bloats the saved
# notebook without adding information.
urldata.head(10)

Unnamed: 0,url,label,result,url_length,hostname_length,path_length,fd_length
0,https://www.google.com,benign,0,22,14,0,0
1,https://www.youtube.com,benign,0,23,15,0,0
2,https://www.facebook.com,benign,0,24,16,0,0
3,https://www.baidu.com,benign,0,21,13,0,0
4,https://www.wikipedia.org,benign,0,25,17,0,0
5,https://www.reddit.com,benign,0,22,14,0,0
6,https://www.yahoo.com,benign,0,21,13,0,0
7,https://www.google.co.in,benign,0,24,16,0,0
8,https://www.qq.com,benign,0,18,10,0,0
9,https://www.amazon.com,benign,0,22,14,0,0


5.Length Of Top Level Domain (TLD)

In [10]:
# Extract the top-level domain of every URL with tld.get_tld.
# fail_silently=True is passed so unparseable URLs do not abort the run
# (presumably they come back as None; the next cell maps them to -1).
urldata['tld'] = urldata['url'].map(lambda url: get_tld(url, fail_silently=True))

In [11]:
def tld_length(tld):
    """Return the length of ``tld``, or -1 when it has no length.

    -1 is the sentinel for values that do not support ``len()`` (for example
    ``None``), so such rows can be found later with ``tld_length == -1``.
    """
    try:
        return len(tld)
    except TypeError:
        # Only "object has no len()" is expected here; a bare ``except``
        # would also hide unrelated failures.
        return -1

# Length of the extracted TLD per row (-1 where no TLD was found).
urldata['tld_length'] = urldata['tld'].map(tld_length)

In [12]:
# The raw TLD string is only needed to derive tld_length, so drop it.
# `columns=` is used because pandas 2.0 no longer accepts `axis` positionally.
urldata = urldata.drop(columns="tld")

In [13]:
# Rows for which no TLD could be determined (sentinel value -1).
urldata.loc[urldata['tld_length'].eq(-1)]

Unnamed: 0,url,label,result,url_length,hostname_length,path_length,fd_length,tld_length
345994,http://93.186.251.133/exchange/signup/login.php,malicious,1,47,14,26,8,-1
346012,http://37.60.238.238/~skylins3/,malicious,1,31,13,11,9,-1
346052,http://80.211.171.247/foxbit.exchange/%23signin/,malicious,1,48,14,27,15,-1
346144,http://80.211.156.45,malicious,1,20,13,0,0,-1
347192,http://129.121.2.196/~abcddcom/de/,malicious,1,34,13,14,9,-1
348240,http://80.211.155.92/nf/iTokenApp/--/chama.php...,malicious,1,58,13,26,2,-1
348282,http://65.39.182.150/~osif/US/www_usaa_come/in...,malicious,1,71,13,51,5,-1
348396,http://108.179.232.168/~mobilemo/postal/8d5ca7...,malicious,1,73,15,51,9,-1
348397,http://108.179.232.168/~mobilemo/postal/,malicious,1,40,15,18,9,-1
348424,http://54.164.181.135/Falabella/BancoFalabella...,malicious,1,50,14,29,9,-1


In [14]:
# Final check of the frame with all five length features attached.
urldata.head()

Unnamed: 0,url,label,result,url_length,hostname_length,path_length,fd_length,tld_length
0,https://www.google.com,benign,0,22,14,0,0,3
1,https://www.youtube.com,benign,0,23,15,0,0,3
2,https://www.facebook.com,benign,0,24,16,0,0,3
3,https://www.baidu.com,benign,0,21,13,0,0,3
4,https://www.wikipedia.org,benign,0,25,17,0,0,3


Writing Data To New CSV File

In [15]:
# Persist the frame with the new length features.
# NOTE(review): the row index is written too (to_csv default index=True), so
# re-reading this file produces an extra "Unnamed: 0" column; pass index=False
# once downstream notebooks no longer expect that column.
# NOTE(review): absolute, machine-specific output path.
urldata.to_csv("A:/MajorProject/FinalDatasets/url-lengthfeatures.csv")