Creation of first dataset

In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
#Read safe domains (from TRANCO list)
dom_safe = pd.read_csv('https://raw.githubusercontent.com/polmoya/tfmUOC/main/tranco_4KV6X.csv', sep=",", header=None, nrows=300000)
dom_safe.columns = ["index","dom_name"]
#Read phishing domains
#Rows with the wrong number of fields are skipped with a warning. on_bad_lines='warn' replaces the
#deprecated error_bad_lines=False (removed in pandas 2.0) and keeps the same skip-and-warn behaviour.
dom_phishing = pd.read_csv('https://raw.githubusercontent.com/polmoya/tfmUOC/main/phishing.csv', sep=",", header=None, on_bad_lines='warn', nrows=150000)
dom_phishing.columns = ["dom_name", "url", "2", "ip", "4", "5"]
#Read surbl domains
dom_surbl = pd.read_csv('https://raw.githubusercontent.com/polmoya/tfmUOC/main/surbl.csv', sep=",", header=None, nrows=150000)
dom_surbl.columns = ["dom_name", "ip"]
#Read spam domains (one domain per line)
dom_spam = pd.read_csv('https://raw.githubusercontent.com/polmoya/tfmUOC/main/spam_clean.txt', sep=" ", header=None, nrows=150000)
dom_spam.columns = ["dom_name"]



  exec(code_obj, self.user_global_ns, self.user_ns)
b'Skipping line 21175: expected 6 fields, saw 7\n'


In [None]:
def split_clean_domain(data):
  """Derive the second-level domain (sld) of every row and its fixed-width form.

  Two columns are added to ``data`` in place (existing callers rely on this):
    * ``sld``: lower-cased part of ``dom_name`` between its first dot and its
      tld, e.g. 'testhosteur.ecorcepower.com' -> 'ecorcepower' and
      'a.b.c.org' -> 'b.c'. For names with at most one dot it is the label
      before the tld ('Google.com' -> 'google'); a name without any dot
      gives an empty sld.
    * ``sld_clean``: ``sld`` with every character outside [-0-9a-zA-Z]
      replaced by '#', left-padded with '#' to a width of 63 for the
      one-hot encoding. Longer slds are not truncated.

  Input: dataframe with a string column ``dom_name``.
  Output: a copy of ``data`` without the rows whose ``sld_clean`` is longer
  than 63 characters. ``data`` itself keeps those rows, so a caller that
  ignores the return value still has to filter them out.
  """
  #Prepend '.' to names with at most one dot so that every name has a
  #(possibly empty) third-level part in front of the sld.
  names = data['dom_name'].apply(lambda x: '.' + x if x.count('.') <= 1 else x)
  #Drop the tld: keep everything before the last dot. n must be passed by
  #keyword, positional use was removed in pandas 2.0.
  without_tld = names.str.rsplit('.', n=1).str[0]
  #Split once at the first remaining dot: column 0 is the third-level label,
  #column 1 the separator that was found ('' if none), column 2 the sld.
  parts = without_tld.str.partition('.')
  #When no dot is left the whole remainder is the sld.
  sld = parts[2].where(parts[1] != '', parts[0]).str.lower()
  data['sld'] = sld
  #Clean sld for the one_hot_encoding
  data['sld_clean'] = (
      sld.str.replace(r'[^-0-9a-zA-Z]', '#', regex=True)
         .str.pad(63, side='left', fillchar='#')
  )
  #Rows with an sld that does not fit in 63 characters are not valid
  return data[data['sld_clean'].str.len() <= 63].copy()

In [None]:
def add_tag_column(data, dangerous):
  """Tag every row of ``data`` with the same label, in place.

  Input: data (dataframe), dangerous (label written to every row: 1 if the
  domains are dangerous, 0 if they are safe).
  Output: nothing is returned; ``data`` gains (or overwrites) the column
  "dangerous".
  """
  label_column = "dangerous"
  data[label_column] = dangerous

In [None]:
#Safe domains: remove the unused 'index' column, derive sld/sld_clean, keep slds of at most 63 chars
del dom_safe['index']
split_clean_domain(dom_safe)
dom_safe = dom_safe.loc[dom_safe['sld_clean'].str.len().le(63)]
#Phishing domains: only dom_name is needed, every other column is removed
del dom_phishing["url"], dom_phishing["2"], dom_phishing["ip"], dom_phishing["4"], dom_phishing["5"]
split_clean_domain(dom_phishing)
dom_phishing = dom_phishing.loc[dom_phishing['sld_clean'].str.len().le(63)]
#Surbl domains: the ip column is not needed
del dom_surbl['ip']
split_clean_domain(dom_surbl)
dom_surbl = dom_surbl.loc[dom_surbl['sld_clean'].str.len().le(63)]
#Spam domains: dom_name is the only column, nothing to remove
split_clean_domain(dom_spam)
dom_spam = dom_spam.loc[dom_spam['sld_clean'].str.len().le(63)]

In [None]:
#Stack the three dangerous sources into a single dataframe with a renumbered index
dom_malicious = pd.concat([dom_phishing, dom_surbl, dom_spam], ignore_index=True)
#Sanity check: the summed row count of the sources and the row count of the result
print(len(dom_phishing) + len(dom_surbl) + len(dom_spam))
print(len(dom_malicious))

349606
349606


In [None]:
def clean_dom_malicious(dom_malicious, dom_safe):
  """Remove duplicated and safe-listed domains from the malicious dataframe.

  Input: dom_malicious (dataframe with the malicious domains) and dom_safe
  (dataframe with the safe domains). Both have to be preprocessed:
  dom_malicious needs the columns ``sld`` and ``sld_clean``, dom_safe only
  needs ``sld_clean``.
  Output: new dataframe with the columns of dom_malicious, one row per
  distinct ``sld`` (first occurrence kept), without any row whose
  ``sld_clean`` is present in dom_safe. The index is renumbered 0..n-1.
  Neither input is modified.
  """
  deduped = dom_malicious.drop_duplicates(subset=['sld'])
  #Anti-join on sld_clean. A membership test avoids the intermediate merge, whose
  #rows multiply when dom_safe repeats an sld, and keeps columns that only exist
  #in dom_safe (e.g. its label) out of the result.
  in_safe = deduped['sld_clean'].isin(dom_safe['sld_clean'])
  return deduped[~in_safe].reset_index(drop=True)

In [None]:
#Drop repeated slds from dom_malicious and every sld that is also present in dom_safe
dom_malicious = clean_dom_malicious(dom_malicious, dom_safe)
#Check the number of rows left after the cleaning
print(dom_malicious.shape[0])

274628


In [None]:
#Preview the first rows of the cleaned malicious domains
dom_malicious.head()

Unnamed: 0,dom_name,sld,sld_clean
1,petranorris.com,petranorris,##############################################...
2,liceosanroque.cl,liceosanroque,##############################################...
3,primeone.org,primeone,##############################################...
4,testhosteur.ecorcepower.com,ecorcepower,##############################################...
5,winkwhitecollagen.com,winkwhitecollagen,##############################################...


In [None]:
#Add the label column "dangerous" to the dataframes: 0 for the safe domains, 1 for the malicious ones
add_tag_column(dom_safe, 0)
add_tag_column(dom_malicious, 1)

In [None]:
#Concat tagged domains (safe rows first, then malicious rows) with a renumbered index, and preview the result
dom_tagged = pd.concat([dom_safe, dom_malicious], ignore_index=True)
dom_tagged.head()

Unnamed: 0,dom_name,sld,sld_clean,dangerous
0,google.com,google,##############################################...,0
1,gtld-servers.net,gtld-servers,##############################################...,0
2,youtube.com,youtube,##############################################...,0
3,facebook.com,facebook,##############################################...,0
4,akamaiedge.net,akamaiedge,##############################################...,0


In [None]:
#Save the first dataset to clean_data.csv, without writing the dataframe index
dom_tagged.to_csv("clean_data.csv", index=False)

Creation of second dataset

In [None]:
#Read the second list of malicious domains ('|'-separated file whose first row holds the column names)
dom_malicious_1 = pd.read_csv('https://raw.githubusercontent.com/polmoya/tfmUOC/main/malicious_dom_1.csv', sep="|")

In [None]:
#Preview the raw columns of the second malicious list
dom_malicious_1.head()

Unnamed: 0,fqdn,type,registrar,regid
0,02298.bond,spam,Gname.com Pte. Ltd.,1923.0
1,07k.top,spam,Namecheap Inc.,1068.0
2,0dscnfij.cn,spam,,
3,0fu10.bond,spam,Gname.com Pte. Ltd.,1923.0
4,0pzwb.bond,spam,Gname.com Pte. Ltd.,1923.0


In [None]:
#Clean domain: keep only the domain name, under the column name split_clean_domain works on
dom_malicious_1.pop("type")
dom_malicious_1.pop("registrar")
dom_malicious_1.pop("regid")
dom_malicious_1 = dom_malicious_1.rename(columns={"fqdn": "dom_name"})
#Adds the columns sld and sld_clean in place
split_clean_domain(dom_malicious_1)
dom_malicious_1 = dom_malicious_1[dom_malicious_1['sld_clean'].str.len() <= 63] #delete slds > 63

In [None]:
#Check the number of rows before duplicates and safe-listed slds are removed
print(dom_malicious_1.shape[0])

408452


In [None]:
#Drop repeated slds from dom_malicious_1 and every sld that is also present in dom_safe
dom_malicious_1 = clean_dom_malicious(dom_malicious_1, dom_safe)
#Check the number of rows left after the cleaning
print(dom_malicious_1.shape[0])

343730


In [None]:
#Label the second malicious dataframe as dangerous (1)
add_tag_column(dom_malicious_1, 1)

In [None]:
#Concat tagged domains (safe rows first, then the second malicious list) with a renumbered index, and preview the result
dom_tagged_1 = pd.concat([dom_safe, dom_malicious_1], ignore_index=True)
dom_tagged_1.head()

Unnamed: 0,dom_name,sld,sld_clean,dangerous
0,google.com,google,##############################################...,0
1,gtld-servers.net,gtld-servers,##############################################...,0
2,youtube.com,youtube,##############################################...,0
3,facebook.com,facebook,##############################################...,0
4,akamaiedge.net,akamaiedge,##############################################...,0


In [None]:
#NOTE(review): shows the same preview as the end of the previous cell
dom_tagged_1.head()

Unnamed: 0,dom_name,sld,sld_clean,dangerous
0,google.com,google,##############################################...,0
1,gtld-servers.net,gtld-servers,##############################################...,0
2,youtube.com,youtube,##############################################...,0
3,facebook.com,facebook,##############################################...,0
4,akamaiedge.net,akamaiedge,##############################################...,0


In [None]:
#Save the second dataset to clean_data_1.csv, without writing the dataframe index
dom_tagged_1.to_csv("clean_data_1.csv", index=False)