In [2]:
import ipaddress
import re
import socket
import time
import urllib.request
from datetime import date, datetime
from urllib.parse import urlsplit

import requests
import whois
from bs4 import BeautifulSoup
from dateutil.parser import parse as date_parse
from googlesearch import search

# Resolve localhost:8080 once; the returned address list is discarded.
# NOTE(review): presumably a leftover name-resolution sanity check — confirm it is still needed.
socket.getaddrinfo('localhost', 8080)


def diff_month(d1, d2):
    """Return how many calendar months ``d1`` lies after ``d2``.

    Only the ``year`` and ``month`` attributes are read, so the day of the
    month is ignored. The result is negative when ``d1`` is earlier than ``d2``.
    """
    years_apart = d1.year - d2.year
    months_apart = d1.month - d2.month
    return 12 * years_apart + months_apart


# Seconds to wait on any single HTTP request before treating it as failed.
_HTTP_TIMEOUT = 30

# Hosts of known URL-shortening services. The pattern is anchored to the end of
# the host name (and to a label boundary at the front) so that, for example,
# "t.co" no longer matches inside "reddit.com".
_SHORTENER_HOST_RE = re.compile(
    r'(?:^|\.)(?:'
    r'bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
    r'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
    r'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
    r'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
    r'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
    r'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
    r'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|tr\.im|link\.zip\.net'
    r')$',
    re.IGNORECASE,
)


def _as_list(value):
    """Normalise a WHOIS field that may be None, a single value, or a list of values."""
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]


def generate_data_set(url,label):
    """Build one row of phishing-detection features for ``url``.

    The page is fetched, a WHOIS lookup is made for its domain, and a series of
    heuristics is evaluated. Every heuristic appends exactly one value
    (1, 0 or -1) to the row, so the row always has 25 feature values followed
    by ``label`` (26 items, matching the column list used by the caller).

    Args:
        url: URL or bare domain; ``http://`` is prepended when no scheme is present.
        label: class label appended unchanged as the last element of the row.

    Returns:
        The list of feature values plus ``label``, or ``-1`` when the row could
        not be built (for example the WHOIS lookup raised).

    Notes:
        * A page that cannot be downloaded is not fatal: the page-based
          features fall back to -1.
        * The page-rank, Alexa and Google lookups are best-effort; a failure
          yields the feature's fallback value instead of dropping the row.
        * ``KeyboardInterrupt`` is deliberately not swallowed, so a long crawl
          loop can be interrupted.
    """
    try:
        data_set = []

        # Require the full "scheme://" prefix so a host like "httpbin.org" is
        # not mistaken for a URL that already has a scheme.
        if not re.match(r"^https?://", url):
            url = "http://" + url

        try:
            response = requests.get(url, timeout=_HTTP_TIMEOUT)
            page_text = response.text
            soup = BeautifulSoup(page_text, 'html.parser')
        except Exception:
            # Page unavailable or unparsable: page-based features fall back to -1.
            response = None
            page_text = None
            soup = None

        domain = re.findall(r"://([^/]+)/?", url)[0]
        if domain.startswith("www."):
            domain = domain[len("www."):]

        # Host name without port or user-info, used by the URL-only checks.
        try:
            host = urlsplit(url).hostname or ""
        except ValueError:
            host = ""

        # A failing WHOIS lookup aborts the row (handled by the outer except).
        whois_response = whois.whois(domain)

        try:
            rank_checker_response = requests.post(
                "https://www.checkpagerank.net/index.php",
                {"name": domain},
                timeout=_HTTP_TIMEOUT,
            )
            global_rank = int(re.findall(
                r"Global Rank: ([0-9]+)", rank_checker_response.text)[0])
        except Exception:
            global_rank = -1

        # 1.having_IP_Address
        # Test the host, not the whole URL: a string starting with "http://"
        # can never parse as an IP address.
        try:
            ipaddress.ip_address(host)
            data_set.append(-1)
        except ValueError:
            data_set.append(1)

        # 2.URL_Length
        if len(url) < 54:
            data_set.append(1)
        elif len(url) <= 75:
            data_set.append(0)
        else:
            data_set.append(-1)

        # 3.Shortining_Service
        data_set.append(-1 if _SHORTENER_HOST_RE.search(host) else 1)

        # 4.having_At_Symbol
        data_set.append(-1 if "@" in url else 1)

        # 5.double_slash_redirecting
        # Position of the last "//"; the one belonging to the scheme sits at index <= 6.
        slash_positions = [m.start(0) for m in re.finditer('//', url)]
        data_set.append(-1 if slash_positions[-1] > 6 else 1)

        # 6.Prefix_Suffix
        # A dash in the host name; no longer depends on the URL ending in "/".
        data_set.append(-1 if "-" in host else 1)

        # 7.having_Sub_Domain
        # NOTE(review): dots are counted over the whole URL, so dots in the
        # path or in a leading "www." are included — confirm this is intended.
        dot_count = url.count(".")
        if dot_count == 1:
            data_set.append(1)
        elif dot_count == 2:
            data_set.append(0)
        else:
            data_set.append(-1)

        # 8.SSLfinal_State
        # NOTE(review): this only checks that a non-empty page was downloaded;
        # no certificate is inspected.
        data_set.append(1 if page_text else -1)

        # 9.Domain_registeration_length
        registration_length = 0
        try:
            # The WHOIS field may be a single datetime or a list of them.
            expiration_date = min(_as_list(whois_response.expiration_date))
            today = datetime.combine(date.today(), datetime.min.time())
            registration_length = abs((expiration_date - today).days)
            data_set.append(-1 if registration_length / 365 <= 1 else 1)
        except Exception:
            data_set.append(-1)

        # 10.Favicon
        # Only the first <link href=...> of a page that has a <head> is examined.
        first_link = None
        if soup is not None and soup.find_all('head'):
            first_link = soup.find('link', href=True)
        if first_link is None:
            # No page, no <head> or no <link>: still emit a value so the row stays aligned.
            data_set.append(-1)
        else:
            href = first_link['href']
            if url in href or href.count('.') == 1 or domain in href:
                data_set.append(1)
            else:
                data_set.append(-1)

        # 12. HTTPS_token
        data_set.append(1 if url.startswith("https://") else -1)

        # 14. URL_of_Anchor
        if soup is None:
            data_set.append(-1)
        else:
            anchors = soup.find_all('a', href=True)
            if not anchors:
                # No anchors at all: a single 1 (the original appended twice here).
                data_set.append(1)
            else:
                unsafe = 0
                for a in anchors:
                    target = a['href']
                    lowered = target.lower()
                    if ("#" in target or "javascript" in lowered or "mailto" in lowered
                            or not (url in target or domain in target)):
                        unsafe += 1
                percentage = unsafe / float(len(anchors)) * 100
                if percentage < 31.0:
                    data_set.append(1)
                elif percentage < 67.0:
                    data_set.append(0)
                else:
                    data_set.append(-1)

        # 15. Links_in_tags
        if soup is None:
            data_set.append(-1)
        else:
            resources = [tag['href'] for tag in soup.find_all('link', href=True)]
            resources += [tag['src'] for tag in soup.find_all('script', src=True)]
            if not resources:
                # No tags: a single 1, without reusing the anchor percentage from above.
                data_set.append(1)
            else:
                success = sum(
                    1 for ref in resources
                    if url in ref or domain in ref or ref.count('.') == 1)
                percentage = success / float(len(resources)) * 100
                # NOTE(review): a LOW share of same-site resources scores 1 here — confirm polarity.
                if percentage < 17.0:
                    data_set.append(1)
                elif percentage < 81.0:
                    data_set.append(0)
                else:
                    data_set.append(-1)

        # 17. Submitting_to_email
        if page_text is None:
            data_set.append(-1)
        elif re.search(r"mail\(\)|mailto:?", page_text):
            # Alternation, not a character class: "[mail\(\)|mailto:?]" matched
            # any page containing one of those single characters.
            data_set.append(-1)
        else:
            data_set.append(1)

        # 18. Abnormal_URL
        # NOTE(review): this compares the page HTML with the WHOIS result
        # object, which looks like it can never be equal — confirm the intended check.
        if page_text is None:
            data_set.append(-1)
        elif page_text == whois_response:
            data_set.append(1)
        else:
            data_set.append(-1)

        # 19. Redirect
        if response is None:
            data_set.append(-1)
        elif len(response.history) <= 1:
            data_set.append(-1)
        elif len(response.history) <= 4:
            data_set.append(0)
        else:
            data_set.append(1)

        # NOTE(review): features 20-23 score 1 when the pattern IS found and -1
        # when it is not; kept as in the original — confirm polarity.

        # 20. on_mouseover
        if page_text is None:
            data_set.append(-1)
        elif re.search("<script>.+onmouseover.+</script>", page_text):
            data_set.append(1)
        else:
            data_set.append(-1)

        # 21. RightClick
        if page_text is None:
            data_set.append(-1)
        elif re.search(r"event.button ?== ?2", page_text):
            data_set.append(1)
        else:
            data_set.append(-1)

        # 22. popUpWidnow
        if page_text is None:
            data_set.append(-1)
        elif re.search(r"alert\(", page_text):
            data_set.append(1)
        else:
            data_set.append(-1)

        # 23. Iframe
        # Match an actual <iframe ...> tag or a frameborder attribute; the old
        # character class matched almost any page.
        if page_text is None:
            data_set.append(-1)
        elif re.search(r"<iframe\b|frameborder", page_text, re.IGNORECASE):
            data_set.append(1)
        else:
            data_set.append(-1)

        # 24. age_of_domain
        # Uses the parsed WHOIS creation date; the former regex looked for HTML
        # markup that does not occur in WHOIS text, so it always yielded -1.
        if page_text is None:
            data_set.append(-1)
        else:
            try:
                registration_date = min(_as_list(whois_response.creation_date))
                if diff_month(date.today(), registration_date) <= 6:
                    data_set.append(-1)
                else:
                    data_set.append(1)
            except Exception:
                data_set.append(-1)

        # 25. DNSRecord
        # The WHOIS lookup above already succeeded, so the second, discarded
        # lookup is dropped; only the registration-length rule remains.
        data_set.append(-1 if registration_length / 365 <= 1 else 1)

        # 26. web_traffic
        # NOTE(review): confirm the Alexa data endpoint is still served;
        # any failure yields -1.
        try:
            with urllib.request.urlopen(
                    "http://data.alexa.com/data?cli=10&dat=s&url=" + url,
                    timeout=_HTTP_TIMEOUT) as alexa_reply:
                alexa_xml = alexa_reply.read()
            rank = int(BeautifulSoup(alexa_xml, "xml").find("REACH")['RANK'])
            data_set.append(1 if rank < 100000 else 0)
        except Exception:
            data_set.append(-1)

        # 27. Page_Rank
        data_set.append(1 if 0 < global_rank < 100000 else -1)

        # 28. Google_Index
        # Look for a first result instead of testing the truthiness of what
        # search() returns (a generator object is always truthy).
        try:
            indexed = next(iter(search(url, 5)), None) is not None
        except Exception:
            indexed = False
        data_set.append(1 if indexed else -1)

        data_set.append(label)

        return data_set
    except Exception:
        # Any other failure (e.g. WHOIS lookup error) drops the row.
        return -1
import pandas   
# Load the candidate legitimate domains.
# NOTE(review): read_csv takes the first line of the file as the header, which is
# then renamed to 'URLs' — if top-1m.txt has no header row, its first entry is lost.
data = pandas.read_csv("C:\\Users\\shovi\\.ipynb_checkpoints\\top-1m.txt", sep='\t' )
data.columns = ['URLs']

# Fixed-seed sample so the crawl is reproducible.
safe_url = data.sample(n=5000, random_state=12).reset_index(drop=True)

# Extract features for every sampled URL with label 1. Iterating over the column
# itself (instead of range(10000)) cannot run past the 5000 sampled rows.
dataset = []
for url in safe_url['URLs']:
    ret = generate_data_set(url, 1)
    if ret == -1:
        # Feature extraction failed for this URL; skip it.
        continue
    dataset.append(ret)

feature_names = ['Containing IP', 'length of url', 'Using Shortining', 'IS @ Symbol', '// redirecting', 'Prefix AND Suffix', 'Sub Domain', 'SSL', 'Domain Lifespan', 'IS Favicon', 'HTTPS', 'Anchor', 'tags Containing Links', 'Submit email', 'Is Abnormal', 'Redirect', 'on mouseover', 'RightClick', 'popUpWidnow', 'Iframe', 'domain age', 'DNSRecord', 'web traffic', 'Page Rank', 'Google Index', 'Result']
legitimate = pandas.DataFrame(dataset, columns=feature_names)

In [3]:
# Save the legitimate-site feature table; index=True also writes the row index as the first CSV column.
legitimate.to_csv('C:\\Users\\shovi\\.ipynb_checkpoints\\safe_websites.csv', index= True)

In [4]:
import pandas
# Load a saved feature table and preview its first rows. This rebinds `data`.
# NOTE(review): the previous cell writes safe_websites.csv, but this cell reads
# safe_websites.txt — confirm this is the intended file.
data=pandas.read_csv("C:\\Users\\shovi\\.ipynb_checkpoints\\safe_websites.txt")
data.head()

Unnamed: 0,index,Containing IP,length of url,Using Shortining,IS @ Symbol,// redirecting,Prefix AND Suffix,Sub Domain,SSL,Domain Lifespan,...,on mouseover,RightClick,popUpWidnow,Iframe,domain age,DNSRecord,web traffic,Page Rank,Google Index,Result
0,1,1,1,-1,1,1,1,1,1,-1,...,-1,-1,-1,1,-1,-1,0,-1,1,1
1,2,1,1,1,1,1,1,1,1,-1,...,-1,-1,-1,1,-1,-1,0,-1,1,1
2,3,1,1,1,1,1,1,1,1,-1,...,-1,-1,1,1,-1,-1,1,-1,1,1
3,4,1,1,1,1,1,1,1,1,-1,...,-1,-1,-1,1,-1,-1,0,-1,1,1
4,5,1,1,1,1,1,1,1,1,-1,...,-1,-1,-1,1,-1,-1,0,-1,1,1


In [10]:
# Display the frame loaded from safe_websites.txt (pandas truncates the middle rows and columns).
data

Unnamed: 0,index,Containing IP,length of url,Using Shortining,IS @ Symbol,// redirecting,Prefix AND Suffix,Sub Domain,SSL,Domain Lifespan,...,on mouseover,RightClick,popUpWidnow,Iframe,domain age,DNSRecord,web traffic,Page Rank,Google Index,Result
0,1,1,1,-1,1,1,1,1,1,-1,...,-1,-1,-1,1,-1,-1,0,-1,1,1
1,2,1,1,1,1,1,1,1,1,-1,...,-1,-1,-1,1,-1,-1,0,-1,1,1
2,3,1,1,1,1,1,1,1,1,-1,...,-1,-1,1,1,-1,-1,1,-1,1,1
3,4,1,1,1,1,1,1,1,1,-1,...,-1,-1,-1,1,-1,-1,0,-1,1,1
4,5,1,1,1,1,1,1,1,1,-1,...,-1,-1,-1,1,-1,-1,0,-1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6160,6161,-1,-1,-1,1,-1,-1,1,-1,-1,...,1,1,1,1,1,-1,0,-1,1,1
6161,6162,1,-1,1,1,1,-1,1,-1,-1,...,1,1,1,1,1,1,0,-1,1,1
6162,6163,-1,-1,1,1,1,-1,1,1,-1,...,1,1,1,1,1,1,0,-1,1,1
6163,6164,1,-1,1,1,1,-1,-1,1,1,...,1,1,1,1,1,1,0,-1,1,1


In [11]:
# Load the feature table stored in phish_1.txt (used below as the phishing set) and preview its first rows.
data2=pandas.read_csv("C:\\Users\\shovi\\.ipynb_checkpoints\\phish_1.txt")
data2.head()


Unnamed: 0,index,Containing IP,length of url,Using Shortining,IS @ Symbol,// redirecting,Prefix AND Suffix,Sub Domain,SSL,Domain Lifespan,...,on mouseover,RightClick,popUpWidnow,Iframe,domain age,DNSRecord,web traffic,Page Rank,Google Index,Result
0,1,-1,1,1,1,-1,-1,-1,-1,-1,...,1,1,1,1,-1,-1,-1,-1,1,-1
1,2,1,1,1,1,1,-1,0,1,-1,...,1,1,1,1,-1,-1,0,-1,1,-1
2,3,1,0,1,1,1,-1,-1,-1,-1,...,1,1,1,1,1,-1,1,-1,1,-1
3,4,1,0,1,1,1,-1,-1,-1,1,...,1,1,1,1,-1,-1,1,-1,1,-1
4,5,1,0,-1,1,1,-1,-1,-1,1,...,1,1,1,1,1,-1,-1,-1,1,-1


In [12]:
# Bind the phishing table to the name `Phishing` and display it.
# data2 is already a DataFrame (it comes from read_csv), so the wrapper adds no conversion.
Phishing=pandas.DataFrame(data2)
Phishing

Unnamed: 0,index,Containing IP,length of url,Using Shortining,IS @ Symbol,// redirecting,Prefix AND Suffix,Sub Domain,SSL,Domain Lifespan,...,on mouseover,RightClick,popUpWidnow,Iframe,domain age,DNSRecord,web traffic,Page Rank,Google Index,Result
0,1,-1,1,1,1,-1,-1,-1,-1,-1,...,1,1,1,1,-1,-1,-1,-1,1,-1
1,2,1,1,1,1,1,-1,0,1,-1,...,1,1,1,1,-1,-1,0,-1,1,-1
2,3,1,0,1,1,1,-1,-1,-1,-1,...,1,1,1,1,1,-1,1,-1,1,-1
3,4,1,0,1,1,1,-1,-1,-1,1,...,1,1,1,1,-1,-1,1,-1,1,-1
4,5,1,0,-1,1,1,-1,-1,-1,1,...,1,1,1,1,1,-1,-1,-1,1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,4894,-1,-1,1,1,-1,-1,1,-1,1,...,1,1,1,1,-1,1,1,-1,1,-1
4894,4895,-1,1,1,-1,-1,-1,1,-1,-1,...,-1,1,-1,1,1,1,1,1,1,-1
4895,4896,1,-1,1,1,1,-1,1,-1,-1,...,1,1,1,1,1,1,1,-1,1,-1
4896,4897,-1,-1,1,1,1,-1,-1,-1,1,...,-1,1,-1,1,1,1,1,-1,1,-1


In [13]:
# Bind the safe-site table to the name `safe` and display it.
# data is already a DataFrame (it comes from read_csv), so the wrapper adds no conversion.
safe=pandas.DataFrame(data)
safe

Unnamed: 0,index,Containing IP,length of url,Using Shortining,IS @ Symbol,// redirecting,Prefix AND Suffix,Sub Domain,SSL,Domain Lifespan,...,on mouseover,RightClick,popUpWidnow,Iframe,domain age,DNSRecord,web traffic,Page Rank,Google Index,Result
0,1,1,1,-1,1,1,1,1,1,-1,...,-1,-1,-1,1,-1,-1,0,-1,1,1
1,2,1,1,1,1,1,1,1,1,-1,...,-1,-1,-1,1,-1,-1,0,-1,1,1
2,3,1,1,1,1,1,1,1,1,-1,...,-1,-1,1,1,-1,-1,1,-1,1,1
3,4,1,1,1,1,1,1,1,1,-1,...,-1,-1,-1,1,-1,-1,0,-1,1,1
4,5,1,1,1,1,1,1,1,1,-1,...,-1,-1,-1,1,-1,-1,0,-1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6160,6161,-1,-1,-1,1,-1,-1,1,-1,-1,...,1,1,1,1,1,-1,0,-1,1,1
6161,6162,1,-1,1,1,1,-1,1,-1,-1,...,1,1,1,1,1,1,0,-1,1,1
6162,6163,-1,-1,1,1,1,-1,1,1,-1,...,1,1,1,1,1,1,0,-1,1,1
6163,6164,1,-1,1,1,1,-1,-1,1,1,...,1,1,1,1,1,1,0,-1,1,1


In [15]:
# Stack the safe rows on top of the phishing rows and renumber the row index from 0.
main=pandas.concat([safe,Phishing]).reset_index(drop=True)
main

Unnamed: 0,index,Containing IP,length of url,Using Shortining,IS @ Symbol,// redirecting,Prefix AND Suffix,Sub Domain,SSL,Domain Lifespan,...,on mouseover,RightClick,popUpWidnow,Iframe,domain age,DNSRecord,web traffic,Page Rank,Google Index,Result
0,1,1,1,-1,1,1,1,1,1,-1,...,-1,-1,-1,1,-1,-1,0,-1,1,1
1,2,1,1,1,1,1,1,1,1,-1,...,-1,-1,-1,1,-1,-1,0,-1,1,1
2,3,1,1,1,1,1,1,1,1,-1,...,-1,-1,1,1,-1,-1,1,-1,1,1
3,4,1,1,1,1,1,1,1,1,-1,...,-1,-1,-1,1,-1,-1,0,-1,1,1
4,5,1,1,1,1,1,1,1,1,-1,...,-1,-1,-1,1,-1,-1,0,-1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11058,4894,-1,-1,1,1,-1,-1,1,-1,1,...,1,1,1,1,-1,1,1,-1,1,-1
11059,4895,-1,1,1,-1,-1,-1,1,-1,-1,...,-1,1,-1,1,1,1,1,1,1,-1
11060,4896,1,-1,1,1,1,-1,1,-1,-1,...,1,1,1,1,1,1,1,-1,1,-1
11061,4897,-1,-1,1,1,1,-1,-1,-1,1,...,-1,1,-1,1,1,1,1,-1,1,-1


In [16]:
# Persist the combined table. to_csv writes the row index by default, which is why the
# frame reloaded from this file carries an extra "Unnamed: 0" column.
main.to_csv("C:\\Users\\shovi\\.ipynb_checkpoints\\main_dataset.txt")

In [17]:
# Reload the combined table from disk, replacing the in-memory frame bound to `main`.
main=pandas.read_csv("C:\\Users\\shovi\\.ipynb_checkpoints\\main_dataset.txt")

In [18]:
# Display the reloaded table; it now includes the "Unnamed: 0" column read back from the file.
main

Unnamed: 0.1,Unnamed: 0,index,Containing IP,length of url,Using Shortining,IS @ Symbol,// redirecting,Prefix AND Suffix,Sub Domain,SSL,...,on mouseover,RightClick,popUpWidnow,Iframe,domain age,DNSRecord,web traffic,Page Rank,Google Index,Result
0,0,1,1,1,-1,1,1,1,1,1,...,-1,-1,-1,1,-1,-1,0,-1,1,1
1,1,2,1,1,1,1,1,1,1,1,...,-1,-1,-1,1,-1,-1,0,-1,1,1
2,2,3,1,1,1,1,1,1,1,1,...,-1,-1,1,1,-1,-1,1,-1,1,1
3,3,4,1,1,1,1,1,1,1,1,...,-1,-1,-1,1,-1,-1,0,-1,1,1
4,4,5,1,1,1,1,1,1,1,1,...,-1,-1,-1,1,-1,-1,0,-1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11058,11058,4894,-1,-1,1,1,-1,-1,1,-1,...,1,1,1,1,-1,1,1,-1,1,-1
11059,11059,4895,-1,1,1,-1,-1,-1,1,-1,...,-1,1,-1,1,1,1,1,1,1,-1
11060,11060,4896,1,-1,1,1,1,-1,1,-1,...,1,1,1,1,1,1,1,-1,1,-1
11061,11061,4897,-1,-1,1,1,1,-1,-1,-1,...,-1,1,-1,1,1,1,1,-1,1,-1


In [19]:
# Remove the "Unnamed: 0" column from `main` in place; pop returns the removed column,
# which is what this cell displays. Re-running the cell raises KeyError because the
# column is already gone.
main.pop("Unnamed: 0")

0            0
1            1
2            2
3            3
4            4
         ...  
11058    11058
11059    11059
11060    11060
11061    11061
11062    11062
Name: Unnamed: 0, Length: 11063, dtype: int64

In [20]:
# Display the table after dropping "Unnamed: 0"; the "index" column is still present.
main

Unnamed: 0,index,Containing IP,length of url,Using Shortining,IS @ Symbol,// redirecting,Prefix AND Suffix,Sub Domain,SSL,Domain Lifespan,...,on mouseover,RightClick,popUpWidnow,Iframe,domain age,DNSRecord,web traffic,Page Rank,Google Index,Result
0,1,1,1,-1,1,1,1,1,1,-1,...,-1,-1,-1,1,-1,-1,0,-1,1,1
1,2,1,1,1,1,1,1,1,1,-1,...,-1,-1,-1,1,-1,-1,0,-1,1,1
2,3,1,1,1,1,1,1,1,1,-1,...,-1,-1,1,1,-1,-1,1,-1,1,1
3,4,1,1,1,1,1,1,1,1,-1,...,-1,-1,-1,1,-1,-1,0,-1,1,1
4,5,1,1,1,1,1,1,1,1,-1,...,-1,-1,-1,1,-1,-1,0,-1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11058,4894,-1,-1,1,1,-1,-1,1,-1,1,...,1,1,1,1,-1,1,1,-1,1,-1
11059,4895,-1,1,1,-1,-1,-1,1,-1,-1,...,-1,1,-1,1,1,1,1,1,1,-1
11060,4896,1,-1,1,1,1,-1,1,-1,-1,...,1,1,1,1,1,1,1,-1,1,-1
11061,4897,-1,-1,1,1,1,-1,-1,-1,1,...,-1,1,-1,1,1,1,1,-1,1,-1


In [21]:
# Remove the "index" column from `main` in place; pop returns the removed column,
# which is what this cell displays. Re-running the cell raises KeyError because the
# column is already gone.
main.pop("index")

0           1
1           2
2           3
3           4
4           5
         ... 
11058    4894
11059    4895
11060    4896
11061    4897
11062    4898
Name: index, Length: 11063, dtype: int64

In [22]:
# Display the final table: only the feature columns and "Result" remain.
main

Unnamed: 0,Containing IP,length of url,Using Shortining,IS @ Symbol,// redirecting,Prefix AND Suffix,Sub Domain,SSL,Domain Lifespan,IS Favicon,...,on mouseover,RightClick,popUpWidnow,Iframe,domain age,DNSRecord,web traffic,Page Rank,Google Index,Result
0,1,1,-1,1,1,1,1,1,-1,1,...,-1,-1,-1,1,-1,-1,0,-1,1,1
1,1,1,1,1,1,1,1,1,-1,1,...,-1,-1,-1,1,-1,-1,0,-1,1,1
2,1,1,1,1,1,1,1,1,-1,1,...,-1,-1,1,1,-1,-1,1,-1,1,1
3,1,1,1,1,1,1,1,1,-1,1,...,-1,-1,-1,1,-1,-1,0,-1,1,1
4,1,1,1,1,1,1,1,1,-1,1,...,-1,-1,-1,1,-1,-1,0,-1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11058,-1,-1,1,1,-1,-1,1,-1,1,1,...,1,1,1,1,-1,1,1,-1,1,-1
11059,-1,1,1,-1,-1,-1,1,-1,-1,-1,...,-1,1,-1,1,1,1,1,1,1,-1
11060,1,-1,1,1,1,-1,1,-1,-1,1,...,1,1,1,1,1,1,1,-1,1,-1
11061,-1,-1,1,1,1,-1,-1,-1,1,-1,...,-1,1,-1,1,1,1,1,-1,1,-1
