In [1]:
import warnings

# Blanket filter: every warning raised after this line is suppressed.
warnings.filterwarnings("ignore")
import os
import pandas as pd
# janitor is never referenced by name in the visible cells; presumably it is
# imported because it registers the DataFrame methods used below
# (rename_column, select_columns, remove_columns) -- TODO confirm.
import janitor
# sidetable is never referenced by name either; presumably it is imported
# because it registers the `.stb` accessor used below -- TODO confirm.
import sidetable

# Show up to 100 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 100)

# Project-local helper; called below with the final table and an output path
# under ../tabs.
from utils.data_utils import pandas_to_tex

# Relative paths. DATAPATH is the root that the input CSV paths are joined to;
# FIGSAVEDIR is not used in the visible cells.
FIGSAVEDIR = "../figs"
DATAPATH = "../adult-data"

In [2]:
# (fold cell) Load and prep web browsing data
# output = df with one row per web visit, stacked from the web_mobile,
# web_desktop and web exports for 2022-06-01 .. 2022-06-30

# Columns read from the web_mobile and web_desktop files (despite the name,
# this list is used for both of them).
usecols_web_desktop = [
    "caseid",
    "category",
    "private_domain",
    "visit_duration",
    "visit_time_local",
]
# Columns read from the web file, which names the time and duration columns
# differently; they are renamed below to match the other two files.
usecols = [
    "caseid",
    "category",
    "private_domain",
    "page_duration",
    "session_start_time",
]

df = pd.concat(
    [
        # Get web_mobile
        pd.read_csv(
            os.path.join(
                DATAPATH, "output/realityMine_web_mobile_2022-06-01_2022-06-30.csv"
            ),
            usecols=usecols_web_desktop,
            low_memory=False,
        ),
        # Get web_desktop
        pd.read_csv(
            os.path.join(
                DATAPATH, "output/realityMine_web_desktop_2022-06-01_2022-06-30.csv"
            ),
            usecols=usecols_web_desktop,
            low_memory=False,
        ),
        # Get web
        (
            pd.read_csv(
                os.path.join(
                    DATAPATH, "output/realityMine_web_2022-06-01_2022-06-30.csv"
                ),
                usecols=usecols,
                low_memory=False,
            )
            # Renaming columns to be consistent w/ web_mobile & web_desktop
            .rename_column("session_start_time", "visit_time_local")
            .rename_column("page_duration", "visit_duration")
        ),
    ],
    # Each file arrives with its own 0..n-1 index. Renumber the rows so that
    # labels are unique in the combined frame (otherwise df.loc[0] would
    # return one row per input file).
    ignore_index=True,
)
print("output = df with web browses")
print(f"{len(df)=:,}")
df.head()

output = df with web browses
len(df)=6,297,382


Unnamed: 0,caseid,private_domain,category,visit_time_local,visit_duration
0,205323077,google.com,Search Engines and Portals,2022-05-31 23:52:37,2
1,205323077,coupons.com,"Business, Shopping",2022-06-01 01:07:35,457
2,205323077,google.com,Business,2022-06-01 01:15:12,55
3,205323077,coupons.com,"Business, Shopping",2022-06-01 01:16:07,2225
4,205323077,google.com,Search Engines and Portals,2022-06-01 04:38:10,10


## Most common categories

In [3]:
# Frequency table of the category labels (sidetable), cut to its first ten
# rows; the labels themselves are kept in a list for the loops further down.
category_freq = df.stb.freq(["category"])
df_top10cat = category_freq.head(10)
top10cat = list(df_top10cat["category"])
df_top10cat

Unnamed: 0,category,count,percent,cumulative_count,cumulative_percent
0,Business,720930,12.958792,720930,12.958792
1,Search Engines and Portals,670710,12.056082,1391640,25.014874
2,Chat and Instant Messaging,613394,11.025821,2005034,36.040696
3,"Business, Social Networking",443281,7.968022,2448315,44.008718
4,Shopping,328300,5.901227,2776615,49.909945
5,"Business, Information Technology",284428,5.112623,3061043,55.022568
6,"Entertainment, Streaming Media",261165,4.694468,3322208,59.717036
7,"News and Media, Search Engines and Portals",231157,4.155071,3553365,63.872107
8,"Business, Education",181106,3.255399,3734471,67.127506
9,"Business, Shopping",135431,2.434386,3869902,69.561893


## Example domains for the top 10 categories

In [4]:
# For every top-10 category, build a display string with its (up to) three
# most frequently visited domains, e.g. "(a.com, b.com, c.com)".
# sites_column ends up with one entry per category, in top10cat order.
sites_column = []
for cat in top10cat:
    # Select rows with a boolean mask rather than an f-string query: a
    # category label containing a single quote would otherwise produce an
    # invalid query expression.
    _df = (
        df.loc[df["category"] == cat]
        .groupby("private_domain")
        .size()
        .reset_index()
        # After reset_index the unnamed size column is labelled 0.
        .sort_values(0, ascending=False)
        .head(3)
    )
    sites = ", ".join(_df["private_domain"].tolist())
    sites_column.append("(" + sites + ")")

In [5]:
# Add the example-domain strings as a new column; sites_column was built in
# the same order as the rows of df_top10cat.
df_top10cat = df_top10cat.assign(examples=sites_column)
df_top10cat

Unnamed: 0,category,count,percent,cumulative_count,cumulative_percent,examples
0,Business,720930,12.958792,720930,12.958792,"(decipherinc.com, samplicio.us, privatelink.de)"
1,Search Engines and Portals,670710,12.056082,1391640,25.014874,"(google.com, google.co.uk, yahoo.com)"
2,Chat and Instant Messaging,613394,11.025821,2005034,36.040696,"(google.com, yahoo.com, live.com)"
3,"Business, Social Networking",443281,7.968022,2448315,44.008718,"(facebook.com, facebook.co, soocial.com)"
4,Shopping,328300,5.901227,2776615,49.909945,"(amazon.com, ebay.com, walmart.com)"
5,"Business, Information Technology",284428,5.112623,3061043,55.022568,"(clarity.ms, sentry.io, inboxdollars.com)"
6,"Entertainment, Streaming Media",261165,4.694468,3322208,59.717036,"(youtube.com, hulu.com, netflix.com)"
7,"News and Media, Search Engines and Portals",231157,4.155071,3553365,63.872107,"(bing.com, att.net)"
8,"Business, Education",181106,3.255399,3734471,67.127506,"(yougov.com, google.com, prolific.co)"
9,"Business, Shopping",135431,2.434386,3869902,69.561893,"(amazon.com, rakuten.com, instacart.com)"


## Duration on sites

In [6]:
# Total visit duration per respondent (caseid) and category, in long format
# on the full caseid x category grid: every pair that never occurs in df gets
# a duration of 0, so later per-category statistics cover all respondents.
df_ind = (
    df.groupby(["caseid", "category"])["visit_duration"]
    .sum()
    # ==================================================
    # Imputing zeroes if ind. never visited category x
    # unstack/stack works directly on the grouped sums; no second aggregation
    # (pivot_table's default mean) is needed because each pair is unique.
    .unstack("category", fill_value=0)
    .stack()
    # Index levels are already named caseid and category, so no rename needed.
    .reset_index(name="visit_duration")
)
print(len(df_ind))
df_ind.head(3)

439992


Unnamed: 0,caseid,category,visit_duration
0,47541,"Abortion, Education",0
1,47541,Adult,0
2,47541,"Adult, Business",0


In [7]:
# Summary statistics of per-respondent visit time for each top-10 category.
# Result: one row per category (in top10cat order) with mean, std, min, the
# percentiles listed in ntiles, max, and the category label.
ntiles = [0.25, 0.5, 0.75, 0.9, 0.95]

# visit_duration / 3600 -> hours (assumes visit_duration is in seconds --
# TODO confirm against the data dictionary). Computed once for all categories.
df_hours = df_ind.assign(visit_hours=lambda df_: df_["visit_duration"] / 3600)

# Collect the per-category rows in a list and concatenate once, instead of
# growing a DataFrame inside the loop. ignore_index gives the rows unique
# labels 0..n-1 (previously every row had label 0).
df_describe = pd.concat(
    [
        # Boolean mask instead of an f-string query, so category labels that
        # contain quotes cannot break the selection.
        df_hours.loc[df_hours["category"] == cat, ["visit_hours"]]
        .describe(percentiles=ntiles)
        # Transpose so the statistics become columns of a single row.
        .T.reset_index(drop=True)
        .assign(category=cat)
        .drop(columns=["count"])
        for cat in top10cat
    ],
    ignore_index=True,
)

df_describe

Unnamed: 0,mean,std,min,25%,50%,75%,90%,95%,max,category
0,4.628415,9.864044,0.0,0.415,1.611944,4.782014,11.336389,18.575917,140.503333,Business
0,3.970651,8.742947,0.0,0.172292,1.275278,4.479931,10.023722,15.177278,139.885556,Search Engines and Portals
0,5.903662,21.319802,0.0,0.0,0.205833,5.333056,16.426972,25.121514,492.304444,Chat and Instant Messaging
0,5.027452,16.644659,0.0,0.000556,0.200833,2.540833,11.395222,23.155208,245.533333,"Business, Social Networking"
0,3.22868,6.970937,0.0,0.094583,0.799167,3.185764,8.167917,14.433125,78.976389,Shopping
0,1.674523,4.667188,0.0,0.060903,0.426528,1.598125,3.888722,6.420347,87.757778,"Business, Information Technology"
0,5.461475,28.728515,0.0,0.0,0.134444,1.380556,9.631389,20.468764,617.479167,"Entertainment, Streaming Media"
0,1.297584,4.296336,0.0,0.0,0.0,0.182083,3.885167,6.948514,61.792778,"News and Media, Search Engines and Portals"
0,1.69042,4.710243,0.0,0.123056,0.577222,1.732361,3.544222,6.277292,104.547778,"Business, Education"
0,1.231626,2.970126,0.0,0.031806,0.301667,1.061944,2.996861,5.101806,32.187222,"Business, Shopping"


## Combine frequencies, example domains, and duration statistics

In [8]:
# Join the duration statistics onto the frequency table (exactly one match
# per category is enforced by validate), round numeric columns to one
# decimal, and render the two count columns as strings with thousands
# separators before exporting the table.
top10cat_stats = df_top10cat.merge(
    df_describe, how="left", on="category", validate="1:1"
).round(1)
df_top10cat = top10cat_stats.assign(
    **{
        count_col: top10cat_stats[count_col].map("{:,}".format)
        for count_col in ("count", "cumulative_count")
    }
)
pandas_to_tex(df_top10cat, "../tabs/top10cat_description")
df_top10cat

Unnamed: 0,category,count,percent,cumulative_count,cumulative_percent,examples,mean,std,min,25%,50%,75%,90%,95%,max
0,Business,720930,13.0,720930,13.0,"(decipherinc.com, samplicio.us, privatelink.de)",4.6,9.9,0.0,0.4,1.6,4.8,11.3,18.6,140.5
1,Search Engines and Portals,670710,12.1,1391640,25.0,"(google.com, google.co.uk, yahoo.com)",4.0,8.7,0.0,0.2,1.3,4.5,10.0,15.2,139.9
2,Chat and Instant Messaging,613394,11.0,2005034,36.0,"(google.com, yahoo.com, live.com)",5.9,21.3,0.0,0.0,0.2,5.3,16.4,25.1,492.3
3,"Business, Social Networking",443281,8.0,2448315,44.0,"(facebook.com, facebook.co, soocial.com)",5.0,16.6,0.0,0.0,0.2,2.5,11.4,23.2,245.5
4,Shopping,328300,5.9,2776615,49.9,"(amazon.com, ebay.com, walmart.com)",3.2,7.0,0.0,0.1,0.8,3.2,8.2,14.4,79.0
5,"Business, Information Technology",284428,5.1,3061043,55.0,"(clarity.ms, sentry.io, inboxdollars.com)",1.7,4.7,0.0,0.1,0.4,1.6,3.9,6.4,87.8
6,"Entertainment, Streaming Media",261165,4.7,3322208,59.7,"(youtube.com, hulu.com, netflix.com)",5.5,28.7,0.0,0.0,0.1,1.4,9.6,20.5,617.5
7,"News and Media, Search Engines and Portals",231157,4.2,3553365,63.9,"(bing.com, att.net)",1.3,4.3,0.0,0.0,0.0,0.2,3.9,6.9,61.8
8,"Business, Education",181106,3.3,3734471,67.1,"(yougov.com, google.com, prolific.co)",1.7,4.7,0.0,0.1,0.6,1.7,3.5,6.3,104.5
9,"Business, Shopping",135431,2.4,3869902,69.6,"(amazon.com, rakuten.com, instacart.com)",1.2,3.0,0.0,0.0,0.3,1.1,3.0,5.1,32.2
