In [1]:
import pandas as pd
import csv
import http

In [16]:
# Load the fetch log (fetch_wsj.csv) and derive the fetch statistics.
df = pd.read_csv("fetch_wsj.csv")

# Every row of the log is counted as one attempted fetch.
fetches_attempted = len(df.index)

# A status below 300 counts as a success. Everything from 300 upwards counts
# as a failure; the boundary is inclusive so that a status of exactly 300 is
# not silently dropped from both counters.
fetches_succeeded = df[df["Status"] < 300].shape[0]
fetches_failed = df[df["Status"] >= 300].shape[0]

# Number of logged rows per status code, as a plain {status: count} dict
# (same .to_dict() form the other cells use for their groupby summaries).
status_codes = df.groupby("Status").size().to_dict()

In [17]:
# Load the visit log (visit_wsj.csv); the first row is the header.
with open("visit_wsj.csv", "r", encoding="UTF-8") as f:
    df = pd.read_csv(f, header=0)

# Total number of outlinks found, summed over every logged row.
total_urls_extracted = df['#Outlinks_found'].sum()

# Bucket rows by the value in the Size_KB column.
# NOTE(review): the edges treat 1000 as the 1MB boundary and assume Size_KB
# really is in kilobytes -- confirm against the crawler that writes the file.
size_bins = [0, 1, 10, 100, 1000, float('inf')]
size_labels = ['less than 1KB', '1KB to less than 10KB', '10KB to less than 100KB', '100KB to less than 1MB', '1MB or greater']
# right=False makes every bin left-closed / right-open: [0, 1), [1, 10), ...
# That is what the labels promise ("X to less than Y"). With the default
# right-closed bins a size of exactly 1 would be labelled 'less than 1KB'
# and a size of 0 would fall outside every bin.
df['Size Category'] = pd.cut(df['Size_KB'], bins=size_bins, labels=size_labels, right=False)

# observed=False keeps buckets with zero rows in the summary, independent of
# the pandas version's default for categorical groupers.
size_categories = df.groupby('Size Category', observed=False).size().to_dict()

# Number of logged rows per Content-Type value.
content_types = df.groupby('Content-Type').size().to_dict()

print("Total URLs Extracted:", total_urls_extracted)
print("Size Categories:", size_categories)
print("Content Types:", content_types)

Total URLs Extracted: 3520895
Size Categories: {'less than 1KB': 16, '1KB to less than 10KB': 160, '10KB to less than 100KB': 101, '100KB to less than 1MB': 11982, '1MB or greater': 823}
Content Types: {'image/jpeg': 10, 'image/png': 194, 'text/html': 12879}


In [24]:
# Load the URL log (urls_wsj.csv); the first row is the header.
with open("urls_wsj.csv", mode="r", encoding="UTF-8") as urls_file:
    df = pd.read_csv(urls_file, header=0)

# Row count of the URL log.
# NOTE(review): this equals the number of unique URLs only if the file holds
# one row per URL -- confirm against the crawler that writes it.
unique_extracted = df.shape[0]

# Rows whose Status is 'OK' are tallied as "within", rows whose Status is
# 'N_OK' as "outside".
unique_within = (df['Status'] == 'OK').sum()
unique_outside = (df['Status'] == 'N_OK').sum()

In [31]:

# Write statistics to file
#
# Everything that can fail (column lookups, missing variables) is evaluated
# BEFORE the report is opened in "w" mode, so an error can no longer leave a
# truncated CrawlReport_wsj.txt behind.

# Outgoing-URL counts, taken from the frame currently bound to `df`.
report_unique_urls = df['URL'].nunique()
if 'is_news_site' in df.columns:
    within_mask = df['is_news_site'] == True
    outside_mask = df['is_news_site'] == False
else:
    # NOTE(review): the cell that loads urls_wsj.csv classifies rows through
    # Status == 'OK' / 'N_OK' rather than an is_news_site column; fall back
    # to that encoding when the boolean column is absent -- confirm which
    # one the CSV actually carries.
    within_mask = df['Status'].eq('OK')
    outside_mask = df['Status'].eq('N_OK')
report_unique_within = df.loc[within_mask, 'URL'].nunique()
report_unique_outside = df.loc[outside_mask, 'URL'].nunique()

report_lines = [
    # General information
    "Name: Reem Almijmaj\n",
    "USC ID: 3217747723\n",
    "News site crawled: wsj.com\n",
    "Number of threads: 16\n\n",
    # Outgoing URLs statistics
    "Outgoing URLs:\n",
    "==============\n",
    f"Total URLs extracted: {total_urls_extracted}\n",
    f"# unique URLs extracted: {report_unique_urls}\n",
    f"# unique URLs within News Site: {report_unique_within}\n",
    f"# unique URLs outside News Site: {report_unique_outside}\n\n",
    # File size statistics
    "File Sizes:\n",
    "===========\n",
]

# (report line template, key in size_categories); a bucket that is missing
# from the summary is reported as 0.
size_rows = [
    ("< 1KB: {}\n", 'less than 1KB'),
    ("1KB ~ <10KB: {}\n", '1KB to less than 10KB'),
    ("10KB ~ <100KB: {}\n", '10KB to less than 100KB'),
    ("100KB ~ <1MB: {}\n", '100KB to less than 1MB'),
    (">= 1MB: {}\n\n", '1MB or greater'),
]
report_lines.extend(
    template.format(size_categories.get(key, 0)) for template, key in size_rows
)

# Content types statistics, sorted by content-type name.
report_lines.append("Content Types:\n")
report_lines.append("==============\n")
report_lines.extend(
    "{}: {}\n".format(content, count)
    for content, count in sorted(content_types.items())
)

# Explicit encoding, matching how the input CSVs are opened.
with open("CrawlReport_wsj.txt", "w", encoding="UTF-8") as f:
    f.writelines(report_lines)