In [None]:
# Find ranks of alexa 100k domains with their A records (IP address/es) and save it into a file.
import csv
import pandas as pd
# Snapshot date; the three parts are joined into the suffix shared by every file below.
year = "2020"
mon = "jan"
day = "01"
date_tag = day+"_"+mon+"_"+year

# Inputs
#   - alexa_domains_with_rank_<date>.csv: domains with their ranks. Downloaded from
#     DACS object storage: traces/Feeds/<year>/Alexa/Alexa_<year>-12-31.tar.gz
#   - openintel_alexa_domain_a_record_<date>.csv: A records, joined on `domain`.
# Output
#   - openintel_alexa_resolved_with_rank_100k_<date>.csv: the left join of both.
ranked_domains_path = "../data/alexa-1m/alexa_domains_with_rank_" + date_tag + ".csv"
a_records_path = "../data/alexa-1m/openintel_alexa_domain_a_record_" + date_tag + ".csv"
joined_path = "../data/alexa-1m/openintel_alexa_resolved_with_rank_100k_" + date_tag + ".csv"

df1_orig = pd.read_csv(ranked_domains_path)
# Keep only the first 100k rows, based on
# https://attentioninsight.com/what-is-alexa-rank-and-its-value/
df1 = df1_orig.iloc[:100000]
df2 = pd.read_csv(a_records_path)

# Left join: every kept domain stays in the output, with or without an A record.
df3 = pd.merge(df1, df2, how='left', on='domain')
df3.to_csv(joined_path, index=False)

In [None]:
# Find ranks of alexa 100k domains with their A records (IP address/es) and save it into a file.
# After TMA. Note: Code ran in Aruba.
import tarfile
import csv
import pandas as pd
import io

# Snapshot date used in the A-record input name and in the output name.
year = "2022"
mon = "dec"
day = "01"

# 1. Read the ranked domain list (rank, domain) from the .tar.gz archive.
# 2. Left-join it with the OpenINTEL A records on `domain`.
# The ranked list is downloaded from DACS object storage:
# traces/Feeds/<year>/Alexa/Alexa_<year>-12-31.tar.gz

# Path to the .tar.gz archive holding the ranked list.
# NOTE(review): the archive name is pinned to 01_jun_2022 while every other file
# in this cell uses day/mon/year (01_dec_2022) -- confirm this is intended.
archive_path = '../data/alexa-1m/alexa_domains_with_rank_01_jun_2022.tar.gz'

# Reset first: without this, an archive with no readable CSV would either raise a
# NameError or silently reuse a `df1_orig` left in the kernel by an earlier cell.
df1_orig = None

with tarfile.open(archive_path, 'r:gz') as tar:
    for member in tar.getmembers():
        if member.name.endswith('.csv'):
            # Read the CSV member straight from the archive, without extracting to disk.
            file = tar.extractfile(member)
            if file is not None:
                # The file has no header row; assign the column names while reading.
                # If the archive holds several CSVs, the last one read wins (as before).
                df1_orig = pd.read_csv(io.TextIOWrapper(file, encoding='utf-8'), names=["rank", "domain"], header=None)

if df1_orig is None:
    raise FileNotFoundError(f"No readable .csv member found in {archive_path}")

df1 = df1_orig[0:100000] # Choose only 100k based on https://attentioninsight.com/what-is-alexa-rank-and-its-value/
df2 = pd.read_csv("../data/alexa-1m/openintel_alexa_domain_a_record_"+day+"_"+mon+"_"+year+".csv")
# Left join keeps every one of the 100k domains, with or without an A record.
df3 = df1.merge(df2, how='left', on='domain')
df3.to_csv("../data/alexa-1m/openintel_alexa_resolved_with_rank_100k_"+day+"_"+mon+"_"+year+".csv", index=False)

In [None]:
# Find the number of matched IP addresses 

# Compare the ip addresses with the protected prefixes using pytricia loop
import pandas as pd
import ipaddress
import pytricia


# Snapshot date; these three strings are concatenated into the names of the
# prefix file, the resolved-domains file and the covered-IP output file below.
year = "2021"
mon = "jan"
day = "01"


def build_prefix_trees(prefix_list):
    """Build two Patricia tries from CIDR prefixes: one for IPv4, one for IPv6.

    Args:
        prefix_list: Iterable of prefix strings (e.g. "192.0.2.0/24",
            "2001:db8::/32") that ``ipaddress.ip_network`` can parse.

    Returns:
        A ``(pt_v4, pt_v6)`` tuple of ``pytricia.PyTricia`` objects. Each
        prefix is stored as a key mapped to ``True`` in the trie that matches
        its IP version.

    Raises:
        ValueError: Propagated from ``ipaddress.ip_network`` when a prefix is
            malformed or has host bits set.
    """
    pt_v4 = pytricia.PyTricia()
    # PyTricia defaults to 32-bit keys; an IPv6 trie must be created 128 bits
    # wide, otherwise IPv6 prefixes/addresses do not fit in it.
    pt_v6 = pytricia.PyTricia(128)

    for prefix in prefix_list:
        # Parse only to learn the IP version; the original string is the key.
        target = pt_v4 if ipaddress.ip_network(prefix).version == 4 else pt_v6
        target[prefix] = True

    return pt_v4, pt_v6

def count_covered_ips(ip_list, pt_v4, pt_v6):
    """Return how many addresses in ``ip_list`` have a truthy entry in their trie.

    Each address is parsed with ``ipaddress.ip_address`` to determine its
    version, then looked up (as the original string) in ``pt_v4`` or ``pt_v6``.
    """
    trie_for_version = {4: pt_v4, 6: pt_v6}
    return sum(
        1
        for address in ip_list
        if trie_for_version[ipaddress.ip_address(address).version].get(address)
    )


def get_covered_ips(ip_list, pt_v4, pt_v6):
    """Return, in input order, the addresses that have a truthy entry in their trie.

    The IP version of each address (via ``ipaddress.ip_address``) selects
    whether ``pt_v4`` or ``pt_v6`` is queried; the lookup key is the original
    address value.
    """
    trie_for_version = {4: pt_v4, 6: pt_v6}
    return [
        address
        for address in ip_list
        if trie_for_version[ipaddress.ip_address(address).version].get(address)
    ]



# Protected prefixes of all five scrubbers.
prefixes_df = pd.read_csv("../data/after_tma/customers_prefixes_scrubber_all_"+day+"_"+mon+"_"+year+".csv")
protected_prefixes = prefixes_df["prefix"].tolist()

# OpenINTEL-resolved Alexa 100k domains with their IP addresses.
df = pd.read_csv("../data/after_tma/openintel_alexa_resolved_with_rank_100k_"+day+"_"+mon+"_"+year+".csv")
# Only `ip4_address` is used below, so drop rows only when that column is
# missing; a bare dropna() would also discard rows with a valid address but a
# NaN in some unrelated column.
df_cleaned = df.dropna(subset=["ip4_address"])

# Each distinct address is checked once, even if many domains share it.
alexa_ip_addresses = df_cleaned["ip4_address"].unique()

# Build separate prefix trees for IPv4 and IPv6
pt_v4, pt_v6 = build_prefix_trees(protected_prefixes)

# Find the covered IPs; the count is derived from the list so the tries are
# walked only once (count_covered_ips applies the same test and would agree).
covered_ips = get_covered_ips(alexa_ip_addresses, pt_v4, pt_v6)
covered_count = len(covered_ips)

# Save the covered IPs, one per line, in a .txt file.
covered_ips_filename = "openintel_alexa_scrubber_covered_ip_100k_"+day+"_"+mon+"_"+year+".txt"
with open("../data/after_tma/" + covered_ips_filename, "w", encoding="utf-8") as f:
    f.writelines(f"{ip}\n" for ip in covered_ips)

# Report the name of the file that was actually written (day_mon_year suffix).
print(f"Number of IPs protected by the five scrubber's protected prefixes in {day} {mon} {year} is: {covered_count} and IPs saved in {covered_ips_filename}")

In [None]:
# Join the scrubber-covered Alexa 100k IP addresses with their domains and ranks, and save the result into a file.
# After TMA
import csv
import pandas as pd

# Snapshot date used in the names of both inputs and of the output.
year = "2020"
mon = "jan"
day = "01"

# Load the scrubber-covered IP addresses, one address per line.
with open("../data/after_tma/openintel_alexa_scrubber_covered_ip_100k_"+day+"_"+mon+"_"+year+".txt", "r", encoding="utf-8") as f:
    covered_ips = [line.strip("\n") for line in f]

# Single-column frame named ip4_address so it can be joined on that column.
df1 = pd.DataFrame(covered_ips, columns=['ip4_address'])

df2 = pd.read_csv("../data/after_tma/openintel_alexa_resolved_with_rank_100k_"+day+"_"+mon+"_"+year+".csv")

# Inner join keeps only the resolved rows whose IP address is in the covered list.
result = df1.merge(df2, on='ip4_address', how='inner')
result.to_csv("../data/after_tma/openintel_alexa_ip_domains_ranks_100k_"+day+"_"+mon+"_"+year+".csv", index=False)

# `result` has one row per (IP, domain) pair, so a domain with several covered
# A records spans several rows; count distinct domains, not rows.
# Assumes the resolved file carries a `domain` column (the rank-join cells merge on it).
print("%s number of domains are protected in %s %s %s. \n" %(result["domain"].nunique(), day, mon, year))


In [None]:
# Display `result` as this cell's output; it holds whatever the most recently
# executed join cell assigned to it.
result

In [None]:
# Join the scrubber-covered Alexa 100k IP addresses with their domains and ranks, and save the result into a file (year-only file naming).
import csv


# NOTE(review): this cell defines neither `year` nor `pd`; it uses whatever an
# earlier executed cell left in the kernel -- confirm the intended year before running.

# Load the scrubber-covered IP addresses, one address per line.
with open("../data/alexa-1m/openintel_alexa_scrubber_covered_ip_100k_"+year+".txt", "r", encoding="utf-8") as f:
    covered_ips = [line.strip("\n") for line in f]

# Single-column frame named ip4_address so it can be joined on that column.
df1 = pd.DataFrame(covered_ips, columns=['ip4_address'])

df2 = pd.read_csv("../data/alexa-1m/openintel_alexa_resolved_with_rank_100k_"+year+".csv")

# Inner join keeps only the resolved rows whose IP address is in the covered list.
result = df1.merge(df2, on='ip4_address', how='inner')
result.to_csv("../data/alexa-1m/openintel_alexa_ip_domains_ranks_100k_"+year+".csv", index=False)

# `result` has one row per (IP, domain) pair, so a domain with several covered
# A records spans several rows; count distinct domains, not rows.
# Assumes the resolved file carries a `domain` column (the rank-join cells merge on it).
print("%s number of domains are protected. \n" %(result["domain"].nunique(),))


In [None]:
import pandas as pd
# Load the saved IP/domain/rank join for 2022 (year-only file name under alexa-1m).
df = pd.read_csv("../data/alexa-1m/openintel_alexa_ip_domains_ranks_100k_2022.csv")

In [None]:
# After TMA
import pandas as pd
# Load the saved IP/domain/rank join for 01 Jan 2020 from the after_tma directory.
df = pd.read_csv("../data/after_tma/openintel_alexa_ip_domains_ranks_100k_01_jan_2020.csv")

In [None]:
# Display `df` as this cell's output; its contents depend on which read cell ran last.
df