# This notebook parses several publicly available blocklists and URL datasets and combines them into one large labeled file, `conglom-labeled.csv`, at the end. The logic is complete, but more lists could be added in the future. This work is current as of 5 April 2024.

In [32]:
from dotenv import load_dotenv
import os
import requests
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import random
import plotly.express as px
import matplotlib.pyplot as plt
from urllib.parse import urlparse

First, parse `yoyo.csv`, downloaded from https://pgl.yoyo.org/adservers/serverlist.php?hostformat=hosts&showintro=1&mimetype=plaintext 

In [33]:
# Load the yoyo ad-server list as a single 'URL' column.
# NOTE(review): read_csv treats the file's first line as the header by default,
# so if yoyo.csv has no header row its first entry is consumed here — confirm.
df = pd.read_csv('./datasets/yoyo.csv')
df.columns = ['URL']

# Strip the hosts-file loopback prefix, label every row as 'ads', and drop
# exact duplicate rows.
df = df.assign(
    URL=df['URL'].str.replace('127.0.0.1 ', '', regex=False),
    type='ads',
).drop_duplicates()

The yoyo list is labeled with `type` = "ads" and de-duplicated so that it can later be merged with the other datasets.

In [34]:
# Quick sanity check of the labeled frame.
# head() is printed explicitly: a notebook cell only displays its last
# expression, so a bare `df.head()` on the first line would be discarded.
print(df.head())
# info() prints its report directly; describe() is the displayed cell result.
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3757 entries, 0 to 3756
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   URL     3757 non-null   object
 1   type    3757 non-null   object
dtypes: object(2)
memory usage: 58.8+ KB


Unnamed: 0,URL,type
count,3757,3757
unique,3757,1
top,101com.com,ads
freq,1,3757


In [36]:
# Write the current `df` to the labeled yoyo CSV without the index column.
# NOTE(review): this assumes `df` is still the yoyo frame when the cell runs — confirm
# the execution order, since later cells rebind `df`.
df.to_csv(path_or_buf='./datasets/yoyo-labeled.csv', index=False)

Next, the EasyPrivacy tracking list from https://easylist.to/easylist/easyprivacy.txt (saved locally as `easylist-tracking.csv`).

In [43]:
# Step 1: read the filter list line by line and keep only rules of the form
# ``||example.com^``.
filtered_urls = []
# An explicit encoding keeps the parse independent of the platform's default locale.
with open('./datasets/easylist-tracking.csv', 'r', encoding='utf-8') as file:
    for line in file:
        # Strip the line terminator first so that a final line without a
        # trailing newline is still recognised.
        rule = line.rstrip('\n')
        if rule.startswith('||') and rule.endswith('^'):
            # Drop the leading '||' and the trailing '^' to leave the bare pattern.
            filtered_urls.append(rule[2:-1])

# Step 2: build a labeled, de-duplicated DataFrame. pandas is imported once in
# the notebook's first cell, so it is not re-imported here.
df = pd.DataFrame(filtered_urls, columns=['URL'])
df['type'] = 'tracking'
df = df.drop_duplicates()
print(df.head())

# Step 3: persist the labeled list and show summary statistics as the cell result.
df.to_csv('./datasets/easylist-tracking-labeled.csv', index=False)
df.describe()

            URL      type
0      00px.net  tracking
1     1cros.net  tracking
2      2cnt.net  tracking
3  2l6ddsmnm.de  tracking
4  2smt6mfgo.de  tracking


Unnamed: 0,URL,type
count,39177,39177
unique,39177,1
top,00px.net,tracking
freq,1,39177


Next, the EasyList ads list from https://ublockorigin.pages.dev/thirdparties/easylist.txt

In [44]:
# Keep only rules of the form ``||example.com^`` from the EasyList ads file.
filtered_urls = []
# An explicit encoding keeps the parse independent of the platform's default locale.
with open('./datasets/easylist-ads.csv', 'r', encoding='utf-8') as file:
    for line in file:
        # Strip the line terminator first so that a final line without a
        # trailing newline is still recognised.
        rule = line.rstrip('\n')
        if rule.startswith('||') and rule.endswith('^'):
            # Drop the leading '||' and the trailing '^' to leave the bare pattern.
            filtered_urls.append(rule[2:-1])

# Build a labeled, de-duplicated DataFrame. pandas is imported once in the
# notebook's first cell, so it is not re-imported here.
df = pd.DataFrame(filtered_urls, columns=['URL'])
df['type'] = 'ads'
df = df.drop_duplicates()
print(df.head())

# Persist the labeled list and show summary statistics as the cell result.
df.to_csv('./datasets/easylist-ads-labeled.csv', index=False)
df.describe()

              URL type
0  0008d6ba2e.com  ads
1  0024ad98dd.com  ads
2  0083334e84.com  ads
3  00d3ed994e.com  ads
4  00d84987c0.com  ads


Unnamed: 0,URL,type
count,38732,38732
unique,38732,1
top,0008d6ba2e.com,ads
freq,1,38732


Next, the malware list from https://malware-filter.gitlab.io/urlhaus-filter/urlhaus-filter-ag-online.txt

In [49]:
# Keep only rules of the form ``||example.com$all`` from the malware filter file.
filtered_urls = []
# An explicit encoding keeps the parse independent of the platform's default locale.
with open('./datasets/malware.csv', 'r', encoding='utf-8') as file:
    for line in file:
        # Strip the line terminator first so that a final line without a
        # trailing newline is still recognised.
        rule = line.rstrip('\n')
        if rule.startswith('||') and rule.endswith('$all'):
            # Drop the leading '||' and the trailing '$all' (4 characters).
            filtered_urls.append(rule[2:-4])

# Build a labeled, de-duplicated DataFrame. pandas is imported once in the
# notebook's first cell, so it is not re-imported here.
df = pd.DataFrame(filtered_urls, columns=['URL'])
df['type'] = 'malware'
df = df.drop_duplicates()
print(df.head())

# Persist the labeled list and show summary statistics as the cell result.
df.to_csv('./datasets/malware-labeled.csv', index=False)
df.describe()

                                          URL     type
0                             1ecosolution.it  malware
1  988skins.com/admin/view/stylesheet/50k.png  malware
2                            a.alie3ksgee.com  malware
3                            a0920080.xsph.ru  malware
4                   abissnet.net/se12y5vm.zip  malware


Unnamed: 0,URL,type
count,701,701
unique,701,1
top,1ecosolution.it,malware
freq,1,701


In [51]:
# Labeled source files to merge; each is read for its 'URL' and 'type' columns.
labeled_csv_paths = [
    './datasets/easylist-ads-labeled.csv',
    './datasets/easylist-tracking-labeled.csv',
    './datasets/malicious-phish.csv',
    './datasets/malware-labeled.csv',
    './datasets/yoyo-labeled.csv',
]

dfs = []
for filename in labeled_csv_paths:
    # Restrict each frame to the two shared columns so they line up when stacked.
    df = pd.read_csv(filename, usecols=['URL', 'type'])
    dfs.append(df)

# Stack all sources under a fresh 0..n-1 index and write the combined file.
# NOTE(review): nothing is de-duplicated after the concat, so a URL that occurs in
# several sources (or repeatedly within one) is kept multiple times — confirm
# this is intended.
combined_df = pd.concat(dfs, ignore_index=True)
combined_df.to_csv('./datasets/conglom-labeled.csv', index=False)

# Read the file back so the preview reflects what was actually written to disk.
df_conglom = pd.read_csv('./datasets/conglom-labeled.csv')
print(df_conglom.head())

              URL type
0  0008d6ba2e.com  ads
1  0024ad98dd.com  ads
2  0083334e84.com  ads
3  00d3ed994e.com  ads
4  00d84987c0.com  ads


In [52]:
# Structural overview (row count, columns, dtypes, memory) of the combined frame.
# describe() is intentionally not called here: a cell only displays its last
# expression, so a bare call before info() would be discarded, and the summary
# statistics are printed by the following cell.
df_conglom.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 733558 entries, 0 to 733557
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   URL     733558 non-null  object
 1   type    733558 non-null  object
dtypes: object(2)
memory usage: 11.2+ MB


In [53]:
# Summary statistics (count / unique / top / freq) for the combined frame.
conglom_summary = df_conglom.describe()
print(conglom_summary)

                                                      URL    type
count                                              733558  733558
unique                                             722296       6
top     http://style.org.hc360.com/css/detail/mysite/s...  benign
freq                                                  180  428103


In [54]:
# Number of rows per label in the combined dataset.
df_conglom.value_counts(subset='type')

type
benign        428103
defacement     96457
phishing       94111
ads            42489
tracking       39177
malware        33221
Name: count, dtype: int64