### **Scrapping Fake URL's**


In [51]:
from bs4 import BeautifulSoup as bs
import requests 
import re
import pandas as pd

url = "https://db.aa419.org/fakebankslist.php"
source = requests.get(url)
soup = bs(source.content)

In [31]:
links = []
for x in range(0,2001,21):
    page_link = f"https://db.aa419.org/fakebankslist.php?start={x}"
    source = requests.get(page_link)
    soup = bs(source.content)
    table = soup.find(class_ = "ewTable")
    for i in table.find_all('a', attrs={'href': re.compile("^https://")}):
        links.append(i)

In [32]:
data = pd.DataFrame(links)
data.to_csv("fake_urls.csv",index = None)

### Scrapping Valid Url's

In [37]:
url = "https://gist.githubusercontent.com/jgamblin/62fadd8aa321f7f6a482912a6a317ea3/raw/33c6752125188cfdacdeee3f4fd6e01909e50eef/urls.txt"
res = requests.get(url, allow_redirects=True)
with open("valid_urls.csv","wb") as file:
    file.write(res.content)

### Cleaning the Datasets

##### Valid_urls Dataset

In [281]:
df = pd.read_csv("valid_urls.csv",header=None)
valid_urls = df.sample(n = 600, random_state = 0)
valid_urls.reset_index(drop=True, inplace=True)
valid_urls["label_0"] = 0
print(valid_urls.shape)
valid_urls.head()

(600, 2)


Unnamed: 0,0,label_0
0,chouftv.ma,0
1,india.com,0
2,google.cl,0
3,merdeka.com,0
4,yandex.com.tr,0


##### **Fake_urls Datasets**

In [282]:
df = pd.read_csv("fake_urls.csv",header=None)
fake_urls = df.sample(n = 600, random_state = 0)
fake_urls.reset_index(drop=True, inplace=True)
fake_urls["label_1"] = 1
print(fake_urls.shape)

(600, 2)


In [283]:
fake_urls[0] = fake_urls[0].str.replace("https://www.","",regex=True)
fake_urls.head()

Unnamed: 0,0,label_1
0,craigmorehatchery.co.za,1
1,davex-energy.com,1
2,pedwillfoundation.com,1
3,swissoffshoreintel.com,1
4,elagancedachshundpuppies.com,1


### Concating The Dataset

In [284]:
df_new = pd.concat([valid_urls,fake_urls],axis = 0,ignore_index=True)
df_new = df_new.rename(columns={df_new.columns[0]: "urls"})
df_new.head()

Unnamed: 0,urls,label_0,label_1
0,chouftv.ma,0.0,
1,india.com,0.0,
2,google.cl,0.0,
3,merdeka.com,0.0,
4,yandex.com.tr,0.0,


### Cleaning the Dataset

In [285]:
df_new = df_new.fillna(0)

In [286]:
df_new["labels"] = df_new["label_0"] + df_new["label_1"]

In [287]:
df_new = df_new.drop(columns=["label_0","label_1"],axis= 1)

In [288]:
df_new.head()

Unnamed: 0,urls,labels
0,chouftv.ma,0.0
1,india.com,0.0
2,google.cl,0.0
3,merdeka.com,0.0
4,yandex.com.tr,0.0


In [289]:
df_new.tail()

Unnamed: 0,urls,labels
1195,cutitesisabii.com,1.0
1196,batlantique-togo.com,1.0
1197,allantecuservices.com,1.0
1198,daleonservices.com,1.0
1199,markhowardpartners.com,1.0


In [292]:
df_new["labels"] = df_new["labels"].astype(int)

In [293]:
df_new["labels"].value_counts()

0    600
1    600
Name: labels, dtype: int64

In [295]:
df_new.head()

Unnamed: 0,urls,labels
0,chouftv.ma,0
1,india.com,0
2,google.cl,0
3,merdeka.com,0
4,yandex.com.tr,0


In [294]:
df_new.to_csv("malicious_code_predict.csv")