In [1]:
print("!!! Importing Basic Libraries !!!")
 

import pandas as pd
import os 
from ftplib import FTP
from datetime import datetime
import re
import gzip


print("!!! Import Completed !!!")

!!! Importing Basic Libraries !!!
!!! Import Completed !!!


In [7]:
##============== Downloading the File ================##

# FTP server and file path
ftp_server = "ita.ee.lbl.gov"
ftp_path = "/traces/calgary_access_log.gz"
local_filename = "calgary_access_log.gz"



downloads_dir = os.path.join(os.path.expanduser("~"), "Downloads")
local_filename = os.path.join(downloads_dir, "calgary_access_log.gz")


# Connect to the FTP server
ftp = FTP(ftp_server)
ftp.login()

# Download the file
with open(local_filename, "wb") as f:
    ftp.retrbinary(f"RETR {ftp_path}", f.write)

ftp.quit()
print("Download complete: calgary_access_log.gz")
print(local_filename)


Download complete: calgary_access_log.gz
C:\Users\99248\Downloads\calgary_access_log.gz


In [9]:
# Load the plain text log file
with open(r'C:\Users\99248\Downloads\calgary_access_log (1)', 'rt', errors='ignore') as f:
    lines = f.readlines()


# Parse each line
entries = []
for line in lines:
    match = re.match(r'(\S+) \S+ \S+ \[(.*?)\] "GET (\S+) HTTP/\d\.\d" (\d{3}) (\S+)', line)
    if not match:
        continue
    host, timestamp_str, filename, http_code, byte_str = match.groups()
    try:
        timestamp = datetime.strptime(timestamp_str, '%d/%b/%Y:%H:%M:%S %z')
    except ValueError:
        continue
    bytes_sent = int(byte_str) if byte_str.isdigit() else None
    ext = filename.split('.')[-1] if '.' in filename else ''
    entries.append([host, timestamp, filename, http_code, bytes_sent, ext])

# Creating a DataFrame
df = pd.DataFrame(entries, columns=['host', 'timestamp', 'filename', 'http_code', 'bytes', 'file_extension'])

# Show the first few rows
print(df.head())
print(df.shape)
df.dtypes


    host                  timestamp    filename http_code    bytes  \
0  local  1994-10-24 13:41:41-06:00  index.html       200    150.0   
1  local  1994-10-24 13:41:41-06:00       1.gif       200   1210.0   
2  local  1994-10-24 13:43:13-06:00  index.html       200   3185.0   
3  local  1994-10-24 13:43:14-06:00       2.gif       200   2555.0   
4  local  1994-10-24 13:43:15-06:00       3.gif       200  36403.0   

  file_extension  
0           html  
1            gif  
2           html  
3            gif  
4            gif  
(721478, 6)


host               object
timestamp          object
filename           object
http_code          object
bytes             float64
file_extension     object
dtype: object

In [8]:
# Q1: Count of total log records
def total_log_records(df):
    return len(df)

# Q2: Count of unique hosts
def unique_hosts(df):
    return df['host'].nunique()

# Q3: Date-wise unique filename counts
def date_wise_unique_filenames(df):
    df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True)
    df['date'] = df['timestamp'].dt.strftime('%d-%b-%Y')
    return df.groupby('date')['filename'].nunique().to_dict()


# Q4: Number of 404 response codes
def count_404_responses(df):
    return df[df['http_code'] == '404'].shape[0]

# Q5: Top 15 filenames with 404 responses
def top_15_filenames_404(df):
    return df[df['http_code'] == '404']['filename'].value_counts().head(15).to_dict()

# Q6: Top 15 file extensions with 404 responses
def top_15_extensions_404(df):
    return df[df['http_code'] == '404']['file_extension'].value_counts().head(15).to_dict()

# Q7: Total bandwidth transferred per day for July 1995
def bandwidth_per_day_july_1995(df):
    df_july_1995 = df[(df['timestamp'].dt.month == 7) & (df['timestamp'].dt.year == 1995)]
    df_july_1995 = df_july_1995.dropna(subset=['bytes'])
    df_july_1995['date'] = df_july_1995['timestamp'].dt.strftime('%d-%b-%Y')
    return df_july_1995.groupby('date')['bytes'].sum().to_dict()

# Q8: Hourly request distribution
def hourly_request_distribution(df):
    df['hour'] = df['timestamp'].dt.hour
    return df['hour'].value_counts().sort_index().to_dict()

# Q9: Top 10 most requested filenames
def top_10_filenames(df):
    return df['filename'].value_counts().head(10).to_dict()

# Q10: HTTP response code distribution
def http_code_distribution(df):
    return df['http_code'].value_counts().to_dict()

# Execute the functions and print the results
print("Q1: Total log records:",total_log_records(df))
print("Q2: Unique hosts:", unique_hosts(df))
print("Q3: Date-wise unique filename counts:", date_wise_unique_filenames(df))
print("Q4: Number of 404 response codes:", count_404_responses(df))
print("Q5: Top 15 filenames with 404 responses:", top_15_filenames_404(df))
print("Q6: Top 15 file extensions with 404 responses:", top_15_extensions_404(df))
print("Q7: Total bandwidth transferred per day for July 1995:", bandwidth_per_day_july_1995(df))
print("Q8: Hourly request distribution:", hourly_request_distribution(df))
print("Q9: Top 10 most requested filenames:", top_10_filenames(df))
print("Q10: HTTP response code distribution:", http_code_distribution(df))


Q1: Total log records: 721478
Q2: Unique hosts: 2
Q3: Date-wise unique filename counts: {'01-Apr-1995': 407, '01-Aug-1995': 663, '01-Dec-1994': 243, '01-Feb-1995': 571, '01-Jan-1995': 82, '01-Jul-1995': 405, '01-Jun-1995': 569, '01-Mar-1995': 519, '01-May-1995': 432, '01-Nov-1994': 436, '01-Oct-1995': 582, '01-Sep-1995': 445, '02-Apr-1995': 392, '02-Aug-1995': 785, '02-Dec-1994': 355, '02-Feb-1995': 619, '02-Jan-1995': 128, '02-Jul-1995': 362, '02-Jun-1995': 546, '02-Mar-1995': 676, '02-May-1995': 693, '02-Nov-1994': 422, '02-Oct-1995': 839, '02-Sep-1995': 306, '03-Apr-1995': 813, '03-Aug-1995': 659, '03-Dec-1994': 151, '03-Feb-1995': 566, '03-Jan-1995': 257, '03-Jul-1995': 460, '03-Jun-1995': 396, '03-Mar-1995': 502, '03-May-1995': 565, '03-Nov-1994': 450, '03-Oct-1995': 833, '03-Sep-1995': 207, '04-Apr-1995': 862, '04-Aug-1995': 739, '04-Dec-1994': 208, '04-Feb-1995': 450, '04-Jan-1995': 313, '04-Jul-1995': 490, '04-Jun-1995': 323, '04-Mar-1995': 364, '04-May-1995': 700, '04-Nov-1994

Q9: Top 10 most requested filenames: {'index.html': 139179, '3.gif': 24001, '2.gif': 23590, '4.gif': 8014, '244.gif': 5147, '5.html': 5004, '4097.gif': 4874, '8870.jpg': 4492, '6733.gif': 4278, '8472.gif': 3843}
Q10: HTTP response code distribution: {'200': 565511, '304': 97560, '302': 30221, '404': 23348, '403': 4737, '401': 46, '500': 42, '400': 13}
