### **Scrapping Fake URL's**


In [1]:
from bs4 import BeautifulSoup as bs
import requests 
import re
import time
import pandas as pd

url = "https://db.aa419.org/fakebankslist.php"
source = requests.get(url)
soup = bs(source.content)

In [2]:
links = []
for x in range(0,2301,21):
    page_link = f"https://db.aa419.org/fakebankslist.php?start={x}"
    source = requests.get(page_link)
    soup = bs(source.content)
    table = soup.find(class_ = "ewTable")
    for i in table.find_all('a', attrs={'href': re.compile("^https://")}):
        links.append(i)

In [3]:
data = pd.DataFrame(links)
data.to_csv("fake_urls.csv",index = None)
data.shape

(1549, 1)

### Scrapping Valid Url's

In [4]:
url = "https://gist.githubusercontent.com/jgamblin/62fadd8aa321f7f6a482912a6a317ea3/raw/33c6752125188cfdacdeee3f4fd6e01909e50eef/urls.txt"
res = requests.get(url, allow_redirects=True)
with open("valid_urls.csv","wb") as file:
    file.write(res.content)

### Cleaning the Datasets

##### Valid_urls Dataset

In [7]:
df = pd.read_csv("valid_urls.csv",header=None)
valid_urls = df.sample(n = 1000, random_state = 0)
valid_urls.reset_index(drop=True, inplace=True)
valid_urls["label_0"] = 0
print(valid_urls.shape)
valid_urls.head()

(1000, 2)


Unnamed: 0,0,label_0
0,chouftv.ma,0
1,india.com,0
2,google.cl,0
3,merdeka.com,0
4,yandex.com.tr,0


##### **Fake_urls Datasets**

In [8]:
df = pd.read_csv("fake_urls.csv",header=None)
fake_urls = df.sample(n = 1000, random_state = 0)
fake_urls.reset_index(drop=True, inplace=True)
fake_urls["label_1"] = 1
print(fake_urls.shape)

(1000, 2)


In [9]:
fake_urls[0] = fake_urls[0].str.replace("https://www.","",regex=True)
fake_urls.head()

Unnamed: 0,0,label_1
0,merchantplc.com,1
1,creditsunionb.com,1
2,faitcurrencyconsultant.com,1
3,thompsonpcapitals.com,1
4,eliziumcrypto.com,1


### Concating The Dataset

In [10]:
df_new = pd.concat([valid_urls,fake_urls],axis = 0,ignore_index=True)
df_new = df_new.rename(columns={df_new.columns[0]: "urls"})
df_new.head()

Unnamed: 0,urls,label_0,label_1
0,chouftv.ma,0.0,
1,india.com,0.0,
2,google.cl,0.0,
3,merdeka.com,0.0,
4,yandex.com.tr,0.0,


### Cleaning the Dataset

In [11]:
df_new = df_new.fillna(0)

In [12]:
df_new["labels"] = df_new["label_0"] + df_new["label_1"]

In [13]:
df_new = df_new.drop(columns=["label_0","label_1"],axis= 1)

In [14]:
df_new.head()

Unnamed: 0,urls,labels
0,chouftv.ma,0.0
1,india.com,0.0
2,google.cl,0.0
3,merdeka.com,0.0
4,yandex.com.tr,0.0


In [15]:
df_new.tail()

Unnamed: 0,urls,labels
1995,itm3568365.com,1.0
1996,nanixxs.com,1.0
1997,cab-kh.com,1.0
1998,allicabnl.com,1.0
1999,livestockgarden.com,1.0


In [16]:
df_new["labels"] = df_new["labels"].astype(int)

In [17]:
df_new["labels"].value_counts()

0    1000
1    1000
Name: labels, dtype: int64

In [18]:
df_new.head()

Unnamed: 0,urls,labels
0,chouftv.ma,0
1,india.com,0
2,google.cl,0
3,merdeka.com,0
4,yandex.com.tr,0


In [19]:
df_new.tail()

Unnamed: 0,urls,labels
1995,itm3568365.com,1
1996,nanixxs.com,1
1997,cab-kh.com,1
1998,allicabnl.com,1
1999,livestockgarden.com,1


In [20]:
df_new.to_csv("valid&fake_urls.csv", index = False)