In [None]:
import pandas as pd
import tqdm.notebook as tqdm
from pandarallel import pandarallel

In [None]:
import glob

# Load every rankings shard into one frame. The files are read with
# header=None, so their first line is treated as data and the column names
# are supplied here.
# glob.glob() returns paths in arbitrary order; sorting them keeps the row
# order (and the index regenerated by ignore_index=True) stable across runs.
rankings = pd.concat(
    (
        pd.read_csv(
            path,
            header=None,
            names=["pageURL", "pageRank", "avgDuration"],
        )
        for path in sorted(glob.glob("data/AMPLab/1node/rankings/*.csv"))
    ),
    axis=0,
    ignore_index=True,
)

In [None]:
# Print a summary of the loaded rankings frame (columns, dtypes, non-null counts).
rankings.info()

In [None]:
# Query 1: keep only the pages whose pageRank exceeds 100, then show how many
# rows matched.
query_1 = rankings[rankings["pageRank"] > 100]
len(query_1)

In [None]:
# Load the first 200 uservisits shards into one frame, parsing visitDate as a
# datetime column. The files are read with header=None, so the column names
# are supplied here.
# glob.glob() returns paths in arbitrary order, so the list is sorted BEFORE
# slicing: otherwise the 200 files picked (and hence every downstream result)
# could differ between runs and machines.
uservisits = pd.concat(
    (
        pd.read_csv(
            path,
            header=None,
            names=[
                "sourceIP",
                "destinationURL",
                "visitDate",
                "adRevenue",
                "UserAgent",
                "cCode",
                "lCode",
                "searchWord",
                "duration",
            ],
            parse_dates=["visitDate"],
        )
        for path in sorted(glob.glob("data/AMPLab/1node/uservisits/*.csv"))[:200]
    ),
    axis=0,
    ignore_index=True,
)

In [None]:
%%time
# Query 2 (serial): total adRevenue per source-IP prefix, where the prefix is
# the first 7 characters of sourceIP.
ip_prefix = uservisits["sourceIP"].str[:7]
query_2 = uservisits.groupby(ip_prefix)["adRevenue"].sum()
del ip_prefix  # keep the notebook namespace unchanged
query_2

In [None]:
# Initialise pandarallel before any `parallel_*` call below is used;
# progress_bar=True requests progress bars while the parallel work runs.
pandarallel.initialize(progress_bar=True)

In [None]:
%%time

# Query 2 (parallel): same aggregation as the serial version, with the
# per-group sum evaluated by pandarallel workers.
# pandarallel documents `parallel_apply` on groupby objects; it does not
# document a `DataFrame.parallel_groupby`, so the grouping itself is a regular
# `groupby`. Requires `pandarallel.initialize()` to have been run first.
# Only the adRevenue column is grouped, so just that column is handed to the
# per-group function; the grouping key is still the 7-character sourceIP
# prefix taken from the full frame.
query_2 = (
    uservisits[["adRevenue"]]
    .groupby(uservisits.sourceIP.str.slice(0, 7))
    .parallel_apply(lambda visits: visits.adRevenue.sum())
    .rename("adRevenue")  # match the name of the serial result
)
query_2

In [None]:
# Preview of the join used by query 3: every rankings row is left-joined to
# the visits of its page, then only rows whose visitDate lies strictly between
# 1980-01-01 and 1980-04-01 are kept.
pd.merge(
    rankings,
    uservisits,
    how="left",
    left_on="pageURL",
    right_on="destinationURL",
).query("visitDate > '1980-01-01' and visitDate < '1980-04-01'")

In [None]:
# Query 3: for visits strictly between 1980-01-01 and 1980-04-01, compute the
# mean pageRank and total adRevenue per sourceIP and keep the ten source IPs
# with the highest revenue.
# The date filter is applied to uservisits BEFORE the join. Filtering on
# visitDate after a left join discards every unmatched rankings row anyway
# (their visitDate is missing), so an inner join on the pre-filtered visits
# yields the same rows while never materialising the full join.
query_3 = (
    rankings
    .merge(
        uservisits.query("visitDate > '1980-01-01' and visitDate < '1980-04-01'"),
        left_on="pageURL",
        right_on="destinationURL",
        how="inner",
    )
    .groupby("sourceIP")
    .agg({"pageRank": "mean", "adRevenue": "sum"})
    .sort_values("adRevenue", ascending=False)
    .head(10)
)
query_3

In [None]:
# Load the crawl files with each parsed line in a single `pageSource` column.
# NOTE(review): the separator looks like a deliberate placeholder that is not
# expected to occur in the data, so that lines are never split — confirm.
# Multi-character separators are treated as regular expressions and need the
# python engine, hence engine="python".
# glob.glob() returns paths in arbitrary order; sorting them keeps the row
# order (and the index regenerated by ignore_index=True) stable across runs.
crawl = pd.concat(
    (
        pd.read_table(
            path,
            sep="dasdsadsa",
            header=None,
            engine="python",
            names=["pageSource"],
        )
        for path in sorted(glob.glob("data/AMPLab/tiny/crawl/*.csv"))
    ),
    axis=0,
    ignore_index=True,
)

In [None]:
import re
# Raw string so that `\s` reaches the regex engine verbatim instead of being
# parsed as an (invalid) Python string escape.
url_regex = re.compile(r"(?P<url>https?://[^\s]+)")

def extract_url(line):
    """Return the first http(s) URL found in ``line``.

    Parameters
    ----------
    line : str
        Text to scan. Non-string values (for example the NaN pandas uses for
        missing cells, or None) are accepted and treated as "no URL".

    Returns
    -------
    str or None
        The first substring starting with ``http://`` or ``https://`` and
        running up to the next whitespace character, or None if there is no
        such substring.
    """
    # Missing cells arrive as float NaN / None; re.search would raise on them.
    if not isinstance(line, str):
        return None
    match = url_regex.search(line)
    return match.group("url") if match else None
# Store the first URL found on each crawl line in a new `destPage` column.
crawl["destPage"] = crawl["pageSource"].apply(extract_url)

In [None]:
# Same URL extraction, run through pandarallel's parallel_apply, followed by a
# count of how often each extracted URL occurs.
crawl["pageSource"].parallel_apply(extract_url).value_counts()

In [None]:
# Number of crawl rows per destPage, largest groups first. With
# as_index=False the counts come back in a column named "size".
crawl.groupby("destPage", as_index=False).size().sort_values(
    by="size",
    ascending=False,
)