# 1. Import packages

In [1]:
import glob
import json
import os
import pandas as pd

# 2. Locate the input log files

## 2.1. Change to inputs directory

In [2]:
# Remember the project root the first time this cell runs. The cell ends with
# a chdir, so deriving the root from os.getcwd() again on a re-run would nest
# "/inputs/logs/" inside the input directory itself.
if "base_path_s" not in globals():
    base_path_s = os.getcwd()
print(base_path_s)

# Both paths keep their trailing "/" because later cells append the country
# code to them with plain string concatenation.
input_path_s = base_path_s + "/inputs/logs/"
output_path_s = base_path_s + "/outputs/logs/"

os.chdir(input_path_s)
print(os.getcwd())

/Users/immanuel/Documents/NEU/05_Spring_2022/CS7675_ResAppr/tasks/MyLab/task15_EU_BrightData
/Users/immanuel/Documents/NEU/05_Spring_2022/CS7675_ResAppr/tasks/MyLab/task15_EU_BrightData/inputs/logs


## 2.2. Country List

In [3]:
# Each immediate sub-directory of the input directory is one country.
# Walk input_path_s explicitly instead of '.', because later cells chdir
# elsewhere and a re-run would otherwise list the wrong directory.
# Sorted so the processing order (and the row order of statistics.csv) is reproducible.
country_l = sorted(next(os.walk(input_path_s))[1])

# 3. Functions

## 3.1. Read all txt log files and collect them in a dictionary

In [4]:
def produce_log_dict(min_size_bytes=500):
    '''
    parameter: min_size_bytes - a log file must be larger than this many bytes
               to be read; smaller files are recorded as empty logs (default 500)
    output: log dictionary {log name: DataFrame}. A non-empty log is a DataFrame
            with a single column named 0 holding one raw log line per row;
            an empty log is an empty DataFrame
    does: Read all txt log files of the current working directory whose name
          starts with a digit and collect them in a dictionary
    '''
    log_d = dict()

    for file in glob.glob("*.txt"):

        # Get the txt filename without the extension
        txt_filename = os.path.splitext(file)[0]

        # Only files whose name starts with a digit are treated as logs
        if not txt_filename[:1].isdigit():
            continue

        # If there is a digit at the end of the filename, then remove it
        # (only a single trailing digit is stripped).
        if txt_filename[-1].isdigit():
            txt_filename = txt_filename[:-1]

        # Keep the first file seen for a log name and report any later one
        if txt_filename in log_d:
            print(f"DUPLICATE LOG FILE: {txt_filename}")
            continue

        if os.path.getsize(file) > min_size_bytes:
            # Read the lines directly instead of pd.read_csv(sep="\n"): newer
            # pandas releases reject "\n" as a separator, and CSV parsing can
            # turn numeric or NA-looking lines into non-string cells.
            # Undecodable bytes are replaced rather than aborting the whole run.
            with open(file, encoding="utf-8", errors="replace") as log_file:
                line_l = [line.rstrip("\r\n") for line in log_file if line.strip()]
            log_d[txt_filename] = pd.DataFrame({0: line_l})
        else:
            log_d[txt_filename] = pd.DataFrame()

    print(f"The length of log is: {len(log_d)}")
    return log_d

## 3.2. Extract the FQDNs from request URLs

In [5]:
def produce_fqdn(request_series):
    '''
    parameter: Series (or any iterable) of raw log lines
    output: A tuple (fqdn_set, is_error): the set of distinct hosts found in the
            request lines, and a flag that is True when at least one
            non-request line ends with "<unknown>"
    does: Extract the host part of every line that starts with
          "{'request url':" and collect the distinct hosts.
          Entries that are not strings (e.g. NaN or numbers) are ignored.
    '''
    request_prefix_s = "{'request url':"
    fqdn_set = set()
    is_error = False

    # Loop through each request query
    for req in request_series:
        # Non-string cells cannot be sliced like a log line; skip them
        # instead of failing with a TypeError.
        if not isinstance(req, str):
            continue

        if req.startswith(request_prefix_s):
            # Assumes the value looks like 'scheme://host/...', followed by a
            # comma: +2 skips the "//" and -1 drops the closing quote that
            # precedes the first comma.
            slash_loc_i = req.find("/")
            comma_loc_i = req.find(",")
            url = req[slash_loc_i+2:comma_loc_i-1]

            # Cut the path off, if there is one
            slash_loc_i = url.find("/")
            if slash_loc_i > 0:
                url = url[:slash_loc_i]

            # Cut the query string off, if there is one
            question_mark_loc_i = url.find("?")
            if question_mark_loc_i > 0:
                url = url[:question_mark_loc_i]

            # Only keep values that contain a dot
            if "." in url:
                fqdn_set.add(url)
        elif req.endswith("<unknown>"):
            # A non-request line ending with <unknown> marks the log as erroneous
            is_error = True

    return fqdn_set, is_error

## 3.3. Collect and Run

In [6]:
def collect_and_run():
    '''
    parameter: none
    output: Six parallel lists with one entry per log: country, target domain,
            AS number, number of distinct FQDNs, is_error flag, is_empty flag
    does: Load every log of the current working directory with
          produce_log_dict, parse each log name as
          "<AS number>.<2-letter country>.<target domain>" and summarise the
          log content with produce_fqdn
    '''
    log_d = produce_log_dict()
    record_l = []

    for log_name, resp_df in log_d.items():
        # The log name is sliced at fixed offsets relative to the first dot
        first_dot_i = log_name.find(".")
        as_number_i = int(log_name[:first_dot_i])
        country_s = log_name[first_dot_i + 1:first_dot_i + 3]
        domain_s = log_name[first_dot_i + 4:]

        is_empty = resp_df.empty
        if is_empty:
            fqdn_set, is_error = set(), False
        else:
            # The log lines live in the DataFrame column named 0
            fqdn_set, is_error = produce_fqdn(resp_df[0])

        record_l.append((country_s, domain_s, as_number_i, len(fqdn_set), is_error, is_empty))

    # Turn the per-log records into six parallel lists
    if record_l:
        column_l = [list(column) for column in zip(*record_l)]
    else:
        column_l = [[] for _ in range(6)]

    return tuple(column_l)

## 3.4. Save

In [7]:
def save(country, country_l, domain_l, as_number_l, n_fqdn_response_l, is_error_l, is_empty_l):
    '''
    parameter: country string, followed by the six parallel per-log lists
               (country, target domain, AS number, number of FQDNs, is_error, is_empty)
    output: none; writes "<country>.csv" into the country's output directory
    does: Build a DataFrame from the lists, create the country's output
          directory if it is missing, change into it and save the DataFrame as CSV
    '''
    header_l = ["country", "target_domain", "as_number", "#fqdn_in_response", "is_error", "is_empty"]
    row_l = list(zip(country_l, domain_l, as_number_l, n_fqdn_response_l, is_error_l, is_empty_l))
    report_df = pd.DataFrame(row_l, columns=header_l)

    target_dir_s = output_path_s + country

    # Create the output directory on first use
    if not os.path.exists(target_dir_s):
        os.makedirs(target_dir_s)
        print("The new directory is created!")

    # The CSV is written relative to the working directory, so change into it first
    os.chdir(target_dir_s)
    print(os.getcwd())

    report_df.to_csv(f"{country}.csv", index=False)

# 4. Run

In [9]:
for country in country_l:
    print(f"=== === === {country} === === ===")
    # collect_and_run reads the logs of the current working directory
    country_input_path_s = input_path_s + country
    os.chdir(country_input_path_s)
    print(os.getcwd())

    # Use a distinct name for the per-log country column: rebinding
    # `country_l` here would clobber the list of country directories that
    # the statistics section iterates over afterwards.
    (row_country_l, domain_l, as_number_l,
     n_fqdn_response_l, is_error_l, is_empty_l) = collect_and_run()
    save(country, row_country_l, domain_l, as_number_l, n_fqdn_response_l, is_error_l, is_empty_l)

=== === === cz === === ===
/Users/immanuel/Documents/NEU/05_Spring_2022/CS7675_ResAppr/tasks/MyLab/task15_EU_BrightData/inputs/logs/cz
The length of log is: 5200
The new directory is created!
/Users/immanuel/Documents/NEU/05_Spring_2022/CS7675_ResAppr/tasks/MyLab/task15_EU_BrightData/outputs/logs/cz


# 5. Statistics

In [16]:
# Accumulators for the per-country summary table; the statistics loop appends
# one entry per country to each of them.
concise_country_l, n_query_l, n_response_l = [], [], []
response_rate_l, error_rate_l, empty_rate_l = [], [], []

In [17]:
for country in country_l:
    print(f"=== === === {country} === === ===")
    # The per-country CSV is read relative to the working directory
    country_output_path_s = output_path_s + country
    os.chdir(country_output_path_s)
    print(os.getcwd())

    df = pd.read_csv(f"{country}.csv")
    n_query_i = df["country"].shape[0]

    # Count the rows on each side of the three per-log indicators
    has_fqdn_mask = df["#fqdn_in_response"] != 0
    is_response_i = has_fqdn_mask.sum()
    is_not_response_i = (df["#fqdn_in_response"] == 0).sum()
    is_error_i = (df["is_error"] == True).sum()
    is_not_error_i = (df["is_error"] == False).sum()
    is_empty_i = (df["is_empty"] == True).sum()
    is_not_empty_i = (df["is_empty"] == False).sum()

    # Sanity check: both sides of every indicator must add up to the row count
    if is_response_i + is_not_response_i != n_query_i:
        print("response row count is incorrect!")
    if is_error_i + is_not_error_i != n_query_i:
        print("error row count is incorrect!")
    if is_empty_i + is_not_empty_i != n_query_i:
        print("empty row count is incorrect!")

    concise_country_l.append(country)
    n_query_l.append(n_query_i)
    # Total number of FQDNs over all logs of the country
    n_response_l.append(sum(df["#fqdn_in_response"]))

    if n_query_i != 0:
        # Rates are stored as percentage strings with five decimals
        response_rate_l.append(f"{is_response_i / n_query_i * 100:.5f} %")
        error_rate_l.append(f"{is_error_i / n_query_i * 100:.5f} %")
        empty_rate_l.append(f"{is_empty_i / n_query_i * 100:.5f} %")
    else:
        # No rows at all: leave the rates blank instead of dividing by zero
        response_rate_l.append("")
        error_rate_l.append("")
        empty_rate_l.append("")

=== === === sk === === ===
/Users/immanuel/Documents/NEU/05_Spring_2022/CS7675_ResAppr/tasks/MyLab/task15_EU_BrightData/outputs/logs/sk
=== === === se === === ===
/Users/immanuel/Documents/NEU/05_Spring_2022/CS7675_ResAppr/tasks/MyLab/task15_EU_BrightData/outputs/logs/se
=== === === pl === === ===
/Users/immanuel/Documents/NEU/05_Spring_2022/CS7675_ResAppr/tasks/MyLab/task15_EU_BrightData/outputs/logs/pl
=== === === mt === === ===
/Users/immanuel/Documents/NEU/05_Spring_2022/CS7675_ResAppr/tasks/MyLab/task15_EU_BrightData/outputs/logs/mt
=== === === be === === ===
/Users/immanuel/Documents/NEU/05_Spring_2022/CS7675_ResAppr/tasks/MyLab/task15_EU_BrightData/outputs/logs/be
=== === === gr === === ===
/Users/immanuel/Documents/NEU/05_Spring_2022/CS7675_ResAppr/tasks/MyLab/task15_EU_BrightData/outputs/logs/gr
=== === === ee === === ===
/Users/immanuel/Documents/NEU/05_Spring_2022/CS7675_ResAppr/tasks/MyLab/task15_EU_BrightData/outputs/logs/ee
=== === === lv === === ===
/Users/immanuel/Docum

In [18]:
# Assemble one summary row per country and save the table in the output root
col_l = [
    "country",
    "response_rate",
    "error_rate",
    "empty_rate",
    "number_of_queries",
    "number_of_responses",
]
stat_row_l = list(zip(concise_country_l, response_rate_l, error_rate_l, empty_rate_l, n_query_l, n_response_l))
df = pd.DataFrame(stat_row_l, columns=col_l)

# The CSV is written relative to the working directory
os.chdir(output_path_s)
print(os.getcwd())

df.to_csv("statistics.csv", index=False)

/Users/immanuel/Documents/NEU/05_Spring_2022/CS7675_ResAppr/tasks/MyLab/task15_EU_BrightData/outputs/logs
