# 1. Import packages

In [1]:
import copy
import numpy as np
import os
import pandas as pd
import time

from datetime import timedelta

# 2. Read a sample CSV

## 2.1. Change to inputs directory

In [2]:
print(os.getcwd())
input_path_s = "/input/"
input_path_s = os.getcwd() + input_path_s
output_path_s = "/output/"
output_path_s = os.getcwd() + output_path_s
os.chdir(input_path_s)
print(os.getcwd())

/Users/immanuel/Documents/NEU/05_Spring_2022/CS7675_ResAppr/tasks/MyLab/task16_easylist
/Users/immanuel/Documents/NEU/05_Spring_2022/CS7675_ResAppr/tasks/MyLab/task16_easylist/input


## 2.2. Read CSV: 2022-08-09.csv

In [3]:
csv_df = pd.read_csv("2022-08-09.csv")

In [4]:
csv_df["initial_destination_IP_or_hostname"]

0          p1cluster.cxense.com
1          p1cluster.cxense.com
2          p1cluster.cxense.com
3          p1cluster.cxense.com
4          p1cluster.cxense.com
                  ...          
56570    accounts.pinterest.com
56571    accounts.pinterest.com
56572    accounts.pinterest.com
56573    accounts.pinterest.com
56574    accounts.pinterest.com
Name: initial_destination_IP_or_hostname, Length: 56575, dtype: object

In [5]:
init_dest_l = csv_df["initial_destination_IP_or_hostname"].values.tolist()

## 2.3. Read TXT: measurement_ids_kicho.txt

In [6]:
measurement_df = pd.read_csv("measurement_ids_kicho.txt", on_bad_lines="skip", header=None)

In [7]:
measurement_df

Unnamed: 0,0,1,2,3
0,de,39835,www.instagram.com,42506263.0
1,de,39835,twitter.com,42506264.0
2,de,39835,s0.2mdn.net,42506265.0
3,de,39835,www.clarity.ms,42506266.0
4,de,39835,www.filmix.ac,42506267.0
...,...,...,...,...
494256,cz,44489,cdn.speedcurve.com,43832951.0
494257,cz,44489,img.kurzy.cz,43832953.0
494258,cz,44489,report.prozeny.cz,43832954.0
494259,cz,44489,pro.mapy.cz,43832955.0


In [8]:
measurement_l = measurement_df[2].values.tolist()

## 2.4. Read TXT: easyprivacy.txt

In [9]:
easyprivacy_df = pd.read_csv("easyprivacy.txt", on_bad_lines="skip")

In [10]:
easyprivacy_df

Unnamed: 0,[Adblock Plus 1.1]
0,! Version: 202208230442
1,! Title: EasyPrivacy
2,! Last modified: 23 Aug 2022 04:42 UTC
3,! Expires: 4 days (update frequency)
4,! Homepage: https://easylist.to/
...,...
27046,! ---------- Vietnamese ----------
27047,!
27048,!
27049,! ---------- Anti-Adblock ----------


# 3. Create a parser list

In [11]:
easyprivacy_l = easyprivacy_df["[Adblock Plus 1.1]"].values.tolist()

In [12]:
parser_l = [parser for parser in easyprivacy_l if parser[0] != "!"]

# 4. Segmentization

## 4.1. Function

In [13]:
def show_segment(hostname_l):
    '''
    parameter: hostname list
    output: a list of segments from a hostname
    does: Parse from the second last dot to the end.
        For example, "a.thumbs.redditmedia.com" becomes "redditmedia.com"
    '''
    segment_l = []
    for hostname in hostname_l:
        dot_count_i = hostname.count(".")
        if dot_count_i <= 2:
            segment_l.append(hostname)
        else:
            last_dot_index_i = hostname.rfind(".")
            second_last_dot_index_i = hostname[:last_dot_index_i].rfind(".")
            segment_l.append(hostname[second_last_dot_index_i+1:])
    return segment_l

## 4.2. Segment List

In [14]:
segment_csv_l = show_segment(init_dest_l)
segment_txt_l = show_segment(measurement_l)

In [15]:
print(len(init_dest_l))
print(len(segment_csv_l))

56575
56575


In [16]:
print(len(measurement_l))
print(len(segment_txt_l))

494261
494261


# 5. Parse: Pass to the parser to determine if they are a tracker or not.

## 5.1. CSV: 2022-08-09.csv

In [17]:
start_time = time.time()
is_tracker_csv_l = []
for segment in segment_csv_l:
    is_tracker = False
    for parser in parser_l:
        if segment in parser:
            is_tracker = True
    is_tracker_csv_l.append(is_tracker)
elapsed_time = time.time() - start_time
print(str(timedelta(seconds=elapsed_time)))

0:01:28.643251


In [18]:
print(len(segment_csv_l))
print(len(is_tracker_csv_l))
print(np.count_nonzero(np.array(is_tracker_csv_l)))
print(f"{round(np.count_nonzero(np.array(is_tracker_csv_l)) / len(is_tracker_csv_l) * 100, 5)}% are trackers")

56575
56575
11968
21.15422% are trackers


## 5.2. TXT: measurement_ids_kicho.txt

In [19]:
start_time = time.time()
is_tracker_txt_l = []
for segment in segment_txt_l:
    is_tracker = False
    for parser in parser_l:
        if segment in parser:
            is_tracker = True
    is_tracker_txt_l.append(is_tracker)
elapsed_time = time.time() - start_time
print(str(timedelta(seconds=elapsed_time)))

0:12:39.969826


In [20]:
print(len(segment_txt_l))
print(len(is_tracker_txt_l))
print(np.count_nonzero(np.array(is_tracker_txt_l)))
print(f"{round(np.count_nonzero(np.array(is_tracker_txt_l)) / len(is_tracker_txt_l) * 100, 5)}% are trackers")

494261
494261
93033
18.82265% are trackers


# 6. Save

In [27]:
os.chdir(output_path_s)

## 6.1. CSV: 2022-08-09.csv to 2022-08-09-tracker.csv

In [22]:
original_csv_df = copy.deepcopy(csv_df)

In [24]:
csv_df['is_tracker'] = is_tracker_csv_l

In [26]:
csv_df.to_csv(f"2022-08-09-tracker.csv")

## 6.2. TXT: measurement_ids_kicho_tracker.txt

In [28]:
measurement_df.columns

Int64Index([0, 1, 2, 3], dtype='int64')

In [29]:
original_measurement_df = copy.deepcopy(measurement_df)

In [30]:
measurement_df[4] = is_tracker_txt_l

In [None]:
measurement_df.to_csv(f"measurement_ids_kicho_tracker.txt", header=None)