In [1]:
import os
import pandas as pd
import janitor
import numpy as np

pd.set_option("display.max_columns", 100)

DATAPATH = "../adult-data"

In [2]:
# (fold cell) Load and prep web browsing data
# output = df with visits to adult sites
usecols_web_desktop = [
    "caseid",
    "category",
    "private_domain",
    "visit_duration",
    "visit_time_local",
]
usecols = [
    "caseid",
    "category",
    "private_domain",
    "page_duration",
    "session_start_time",
]

df = pd.concat(
    [
        # Get web_mobile
        pd.read_csv(
            os.path.join(
                DATAPATH, "output/realityMine_web_mobile_2022-06-01_2022-06-30.csv"
            ),
            usecols=usecols_web_desktop,
            low_memory=False,
        ),
        # Get web_desktop
        pd.read_csv(
            os.path.join(
                DATAPATH, "output/realityMine_web_desktop_2022-06-01_2022-06-30.csv"
            ),
            usecols=usecols_web_desktop,
            low_memory=False,
        ),
        # Get web
        (
            pd.read_csv(
                os.path.join(
                    DATAPATH, "output/realityMine_web_2022-06-01_2022-06-30.csv"
                ),
                usecols=usecols,
                low_memory=False,
            )
            # Renaming columns to be consistent w/ web_mobile & web_desktop
            .rename_column("session_start_time", "visit_time_local").rename_column(
                "page_duration", "visit_duration"
            )
        ),
    ]
)
print("output = df with web browses")
print(f"{len(df)=:,}")
df.head()

output = df with web browses
len(df)=6,297,382


Unnamed: 0,caseid,private_domain,category,visit_time_local,visit_duration
0,205323077,google.com,Search Engines and Portals,2022-05-31 23:52:37,2
1,205323077,coupons.com,"Business, Shopping",2022-06-01 01:07:35,457
2,205323077,google.com,Business,2022-06-01 01:15:12,55
3,205323077,coupons.com,"Business, Shopping",2022-06-01 01:16:07,2225
4,205323077,google.com,Search Engines and Portals,2022-06-01 04:38:10,10


In [3]:
# (fold cell) Load the 1200 individual metadata
df_ind_metadata = (
    pd.read_csv(os.path.join(DATAPATH, "SOOD0001_OUTPUT.csv"))
    .case_when(
        lambda df: df.pid7 == 1,
        "D",
        lambda df: df.pid7 == 2,
        "D",
        lambda df: df.pid7 == 3,
        "D",
        lambda df: df.pid7 == 4,
        "I",
        lambda df: df.pid7 == 5,
        "R",
        lambda df: df.pid7 == 6,
        "R",
        lambda df: df.pid7 == 7,
        "R",
        np.nan,
        column_name="party",
    )
    .assign(
        caseid=lambda df: df.caseid.apply(int),
        birthyr=lambda df: df.birthyr.apply(int),
    )
)
print(f"{len(df_ind_metadata)=}")
df_ind_metadata.head(3)

len(df_ind_metadata)=1200


  return method(self._obj, *args, **kwargs)


Unnamed: 0,caseid,birthyr,gender,race,educ,pid3,pid7,presvote20post,inputstate,region,party
0,200661421,1963,2,1,4,3,3,1,39,2,D
1,200686597,1992,2,6,5,5,8,-1,48,3,
2,200953869,1959,2,1,5,2,7,2,42,1,R


In [4]:
# (fold cell) Retrieve porn sites from piedomains classification
df_vt_categories = (
    pd.read_csv("../data/vt_domain_categories.csv")
    # ===================================================
    # Get forcepoint porn
    .assign(
        forcepoint_porn=lambda df_: (df_["forcepoint_threatseeker"] == "sex")
        | (df_["forcepoint_threatseeker"] == "adult content")
    ).assign(
        forcepoint_adult=lambda df_: df_["forcepoint_threatseeker"] == "adult content"
    )
    # ===================================================
    # Get bitdefender porn
    .assign(bitdefender_porn=lambda df_: df_["bitdefender"] == "porn")
    # ===================================================
    # Get alphamountain_ai porn
    .assign(
        alphamountain_porn=lambda df_: (
            df_["alphamountain_ai"].str.contains("porn", case=False)
        )
        | (df_["alphamountain_ai"].str.contains("adult", case=False))
        | (df_["alphamountain_ai"].str.contains("mature", case=False))
    )
)
df_vt_categories.head()

Unnamed: 0,domain,forcepoint_threatseeker,xcitium_verdict_cloud,bitdefender,alphamountain_ai,sophos,comodo_valkyrie_verdict,dr_web,webroot,trendmicro,prebytes,websense_threatseeker,yandex_safebrowsing,forcepoint_porn,forcepoint_adult,bitdefender_porn,alphamountain_porn
0,007james.com,entertainment,media sharing,entertainment,,,,,,,,,,False,False,False,False
1,0123movie.net,,media sharing,entertainment,"Piracy/Plagiarism, Video/Multimedia",video hosting,,,,,,,,False,False,False,False
2,0410690.com,,,,,,,,,,,,,False,False,False,False
3,042jam.com,media file download,,radiomusic,Entertainment,,media sharing,,,,,,,False,False,False,False
4,042nobs.com,,,,"Audio, Entertainment",,,,,,,,,False,False,False,False


## Forcepoint

In [5]:
pornsites = df_vt_categories.query("forcepoint_porn")["domain"].unique().tolist()
pornsites[:5]

['0dayporno.com',
 '16honeys.com',
 '18cuteteen.com',
 '18exgfs.com',
 '1ashemaletube.com']

In [6]:
# (fold cell) Get individual level total visit durations to adult and non-adult sites
df_ind = (
    df_ind_metadata
    # Get total adult site visits
    .merge(
        (
            df.query("private_domain in @pornsites")
            .groupby("caseid")
            .size()
            .reset_index()
            .rename_column(0, "visits_adult")
        ),
        how="left",
        on="caseid",
        validate="1:1",
    )
    # Get total non-adult site visits
    .merge(
        (
            df.query("private_domain not in @pornsites")
            .groupby("caseid")
            .size()
            .reset_index()
            .rename_column(0, "visits_nonadult")
        ),
        how="left",
        on="caseid",
        validate="1:1",
    )
    # If adult or non-adult (exclusive) is nan, impute as zero
    .assign(
        visits_adult=lambda df: np.where(
            pd.isna(df.visits_adult) & ~pd.isna(df.visits_nonadult), 0, df.visits_adult
        )
    )
    .assign(
        visits_nonadult=lambda df: np.where(
            ~pd.isna(df.visits_adult) & pd.isna(df.visits_nonadult),
            0,
            df.visits_nonadult,
        )
    )  # If adult or non-adult (exclusive) is nan, impute as zero
    .assign(
        visits_adult=lambda df: np.where(
            pd.isna(df.visits_adult) & ~pd.isna(df.visits_nonadult), 0, df.visits_adult
        )
    )
    .assign(
        visits_nonadult=lambda df: np.where(
            ~pd.isna(df.visits_adult) & pd.isna(df.visits_nonadult),
            0,
            df.visits_nonadult,
        )
    )
    # Get total visits
    .assign(
        visits=lambda df: df.visits_adult + df.visits_nonadult,
        prop_adult_visits=lambda df: 100 * (df.visits_adult / df.visits),
    )
    .assign(rep=lambda df: np.where(df.party == "R", 1, 0))
    # Get adult visits duration
    .merge(
        (
            df.query("private_domain in @pornsites")
            .groupby("caseid")["visit_duration"]
            .sum()
            .reset_index()
            .rename_column("visit_duration", "duration_adult")
        ),
        how="left",
        on="caseid",
        validate="1:1",
    )
    # Get non-adult visits duration
    .merge(
        (
            df.query("private_domain not in @pornsites")
            .groupby("caseid")["visit_duration"]
            .sum()
            .reset_index()
            .rename_column("visit_duration", "duration_nonadult")
        ),
        how="left",
        on="caseid",
        validate="1:1",
    )
    # If adult or non-adult (exclusive) duration is nan, impute as zero
    .assign(
        duration_adult=lambda df: np.where(
            pd.isna(df.duration_adult) & ~pd.isna(df.duration_nonadult),
            0,
            df.duration_adult,
        )
    )
    .assign(
        duration_nonadult=lambda df: np.where(
            ~pd.isna(df.duration_adult) & pd.isna(df.duration_nonadult),
            0,
            df.duration_nonadult,
        )
    )
    # Get total duration
    .assign(
        duration=lambda df: df.duration_adult + df.duration_nonadult,
        prop_adult_duration=lambda df: 100 * (df.duration_adult / df.duration),
    )
    # Define rep dummy
    .case_when(
        lambda df: df.party == "R",
        1,
        lambda df: df.party == "D",
        0,
        np.nan,
        column_name="rep",
    )
    # Define dummy for visiting adultsites
    .case_when(
        lambda df: df.visits_adult > 0,
        1,
        lambda df: df.visits_adult == 0,
        0,
        np.nan,
        column_name="yes_visit_adults",
    )
    # Collapse race into 5 groups
    .case_when(lambda df: df.race >= 5, 5, lambda df: df.race, column_name="race2")
    # Collapse birthyr into cohorts (e.g. 1960s, 1970s,)
    .assign(cohort=lambda df: df.birthyr.round(decimals=-1))
    # Age
    .assign(age=lambda df: 2022 - df.birthyr + 1)
    .assign(age2=lambda df: df.age**2)
    # Collapse educ to 4 groups
    .case_when(
        lambda df: df.educ == 1,
        1,  # did not grad HS
        lambda df: df.educ == 2,
        2,  # HS
        lambda df: df.educ == 3,
        3,  # some college
        lambda df: df.educ >= 4,
        4,  # College grad
        np.nan,
        column_name="educ2",
    )
    .assign(educ2=lambda df: df.educ2.apply(int))
)
assert df_ind.prop_adult_visits.min() >= 0
assert df_ind.prop_adult_visits.max() <= 100
df_ind

  return method(self._obj, *args, **kwargs)
  return method(self._obj, *args, **kwargs)
  return method(self._obj, *args, **kwargs)
  return method(self._obj, *args, **kwargs)


Unnamed: 0,caseid,birthyr,gender,race,educ,pid3,pid7,presvote20post,inputstate,region,party,visits_adult,visits_nonadult,visits,prop_adult_visits,rep,duration_adult,duration_nonadult,duration,prop_adult_duration,yes_visit_adults,race2,cohort,age,age2,educ2
0,200661421,1963,2,1,4,3,3,1,39,2,D,0.0,1728.0,1728.0,0.000000,0.0,0.0,77642.0,77642.0,0.000000,0.0,1,1960,60,3600,4
1,200686597,1992,2,6,5,5,8,-1,48,3,,246.0,3991.0,4237.0,5.805995,,25962.0,234186.0,260148.0,9.979704,1.0,5,1990,31,961,4
2,200953869,1959,2,1,5,2,7,2,42,1,R,0.0,20.0,20.0,0.000000,1.0,0.0,1182.0,1182.0,0.000000,0.0,1,1960,64,4096,4
3,201302005,1966,2,2,3,5,8,1,12,3,,20.0,527.0,547.0,3.656307,,6166.0,23855.0,30021.0,20.538956,1.0,2,1970,57,3249,3
4,201590505,1977,1,4,5,3,3,1,6,4,D,3.0,11083.0,11086.0,0.027061,0.0,78.0,383083.0,383161.0,0.020357,1.0,4,1980,46,2116,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,1802885,1935,1,1,6,4,5,2,12,3,R,,,,,1.0,,,,,,1,1940,88,7744,4
1196,1809723,1943,1,1,2,1,1,1,51,3,D,0.0,9805.0,9805.0,0.000000,0.0,0.0,1272613.0,1272613.0,0.000000,0.0,1,1940,80,6400,2
1197,1827351,1947,1,1,5,3,4,1,31,2,I,0.0,543.0,543.0,0.000000,,0.0,23929.0,23929.0,0.000000,0.0,1,1950,76,5776,4
1198,1924249,1980,1,7,4,2,7,2,6,4,R,56.0,21505.0,21561.0,0.259728,1.0,988.0,310982.0,311970.0,0.316697,1.0,5,1980,43,1849,4


In [7]:
df_ind.to_csv("../data/individual_browsing_data_forcepoint.csv", index=False)

## Bitdefenders

In [8]:
pornsites = df_vt_categories.query("bitdefender_porn")["domain"].unique().tolist()
pornsites[:5]

['0dayporno.com', '100x.com', '12thblog.com', '16honeys.com', '18exgfs.com']

In [9]:
# (fold cell) Get individual level total visit durations to adult and non-adult sites
df_ind = (
    df_ind_metadata
    # Get total adult site visits
    .merge(
        (
            df.query("private_domain in @pornsites")
            .groupby("caseid")
            .size()
            .reset_index()
            .rename_column(0, "visits_adult")
        ),
        how="left",
        on="caseid",
        validate="1:1",
    )
    # Get total non-adult site visits
    .merge(
        (
            df.query("private_domain not in @pornsites")
            .groupby("caseid")
            .size()
            .reset_index()
            .rename_column(0, "visits_nonadult")
        ),
        how="left",
        on="caseid",
        validate="1:1",
    )
    # If adult or non-adult (exclusive) is nan, impute as zero
    .assign(
        visits_adult=lambda df: np.where(
            pd.isna(df.visits_adult) & ~pd.isna(df.visits_nonadult), 0, df.visits_adult
        )
    )
    .assign(
        visits_nonadult=lambda df: np.where(
            ~pd.isna(df.visits_adult) & pd.isna(df.visits_nonadult),
            0,
            df.visits_nonadult,
        )
    )  # If adult or non-adult (exclusive) is nan, impute as zero
    .assign(
        visits_adult=lambda df: np.where(
            pd.isna(df.visits_adult) & ~pd.isna(df.visits_nonadult), 0, df.visits_adult
        )
    )
    .assign(
        visits_nonadult=lambda df: np.where(
            ~pd.isna(df.visits_adult) & pd.isna(df.visits_nonadult),
            0,
            df.visits_nonadult,
        )
    )
    # Get total visits
    .assign(
        visits=lambda df: df.visits_adult + df.visits_nonadult,
        prop_adult_visits=lambda df: 100 * (df.visits_adult / df.visits),
    )
    .assign(rep=lambda df: np.where(df.party == "R", 1, 0))
    # Get adult visits duration
    .merge(
        (
            df.query("private_domain in @pornsites")
            .groupby("caseid")["visit_duration"]
            .sum()
            .reset_index()
            .rename_column("visit_duration", "duration_adult")
        ),
        how="left",
        on="caseid",
        validate="1:1",
    )
    # Get non-adult visits duration
    .merge(
        (
            df.query("private_domain not in @pornsites")
            .groupby("caseid")["visit_duration"]
            .sum()
            .reset_index()
            .rename_column("visit_duration", "duration_nonadult")
        ),
        how="left",
        on="caseid",
        validate="1:1",
    )
    # If adult or non-adult (exclusive) duration is nan, impute as zero
    .assign(
        duration_adult=lambda df: np.where(
            pd.isna(df.duration_adult) & ~pd.isna(df.duration_nonadult),
            0,
            df.duration_adult,
        )
    )
    .assign(
        duration_nonadult=lambda df: np.where(
            ~pd.isna(df.duration_adult) & pd.isna(df.duration_nonadult),
            0,
            df.duration_nonadult,
        )
    )
    # Get total duration
    .assign(
        duration=lambda df: df.duration_adult + df.duration_nonadult,
        prop_adult_duration=lambda df: 100 * (df.duration_adult / df.duration),
    )
    # Define rep dummy
    .case_when(
        lambda df: df.party == "R",
        1,
        lambda df: df.party == "D",
        0,
        np.nan,
        column_name="rep",
    )
    # Define dummy for visiting adultsites
    .case_when(
        lambda df: df.visits_adult > 0,
        1,
        lambda df: df.visits_adult == 0,
        0,
        np.nan,
        column_name="yes_visit_adults",
    )
    # Collapse race into 5 groups
    .case_when(lambda df: df.race >= 5, 5, lambda df: df.race, column_name="race2")
    # Collapse birthyr into cohorts (e.g. 1960s, 1970s,)
    .assign(cohort=lambda df: df.birthyr.round(decimals=-1))
    # Age
    .assign(age=lambda df: 2022 - df.birthyr + 1)
    .assign(age2=lambda df: df.age**2)
    # Collapse educ to 4 groups
    .case_when(
        lambda df: df.educ == 1,
        1,  # did not grad HS
        lambda df: df.educ == 2,
        2,  # HS
        lambda df: df.educ == 3,
        3,  # some college
        lambda df: df.educ >= 4,
        4,  # College grad
        np.nan,
        column_name="educ2",
    )
    .assign(educ2=lambda df: df.educ2.apply(int))
)
assert df_ind.prop_adult_visits.min() >= 0
assert df_ind.prop_adult_visits.max() <= 100
df_ind

  return method(self._obj, *args, **kwargs)
  return method(self._obj, *args, **kwargs)
  return method(self._obj, *args, **kwargs)
  return method(self._obj, *args, **kwargs)


Unnamed: 0,caseid,birthyr,gender,race,educ,pid3,pid7,presvote20post,inputstate,region,party,visits_adult,visits_nonadult,visits,prop_adult_visits,rep,duration_adult,duration_nonadult,duration,prop_adult_duration,yes_visit_adults,race2,cohort,age,age2,educ2
0,200661421,1963,2,1,4,3,3,1,39,2,D,0.0,1728.0,1728.0,0.000000,0.0,0.0,77642.0,77642.0,0.000000,0.0,1,1960,60,3600,4
1,200686597,1992,2,6,5,5,8,-1,48,3,,254.0,3983.0,4237.0,5.994808,,26106.0,234042.0,260148.0,10.035057,1.0,5,1990,31,961,4
2,200953869,1959,2,1,5,2,7,2,42,1,R,0.0,20.0,20.0,0.000000,1.0,0.0,1182.0,1182.0,0.000000,0.0,1,1960,64,4096,4
3,201302005,1966,2,2,3,5,8,1,12,3,,20.0,527.0,547.0,3.656307,,6166.0,23855.0,30021.0,20.538956,1.0,2,1970,57,3249,3
4,201590505,1977,1,4,5,3,3,1,6,4,D,0.0,11086.0,11086.0,0.000000,0.0,0.0,383161.0,383161.0,0.000000,0.0,4,1980,46,2116,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,1802885,1935,1,1,6,4,5,2,12,3,R,,,,,1.0,,,,,,1,1940,88,7744,4
1196,1809723,1943,1,1,2,1,1,1,51,3,D,0.0,9805.0,9805.0,0.000000,0.0,0.0,1272613.0,1272613.0,0.000000,0.0,1,1940,80,6400,2
1197,1827351,1947,1,1,5,3,4,1,31,2,I,0.0,543.0,543.0,0.000000,,0.0,23929.0,23929.0,0.000000,0.0,1,1950,76,5776,4
1198,1924249,1980,1,7,4,2,7,2,6,4,R,30.0,21531.0,21561.0,0.139140,1.0,618.0,311352.0,311970.0,0.198096,1.0,5,1980,43,1849,4


In [10]:
df_ind.to_csv("../data/individual_browsing_data_bitdefender.csv", index=False)

## alphaMountain

In [11]:
pornsites = df_vt_categories.query("alphamountain_porn")["domain"].unique().tolist()
pornsites[:5]

['12thblog.com', '2girls1cup.ca', '3gpking.pro', '3movs.com', '4archive.org']

In [12]:
# (fold cell) Get individual level total visit durations to adult and non-adult sites
df_ind = (
    df_ind_metadata
    # Get total adult site visits
    .merge(
        (
            df.query("private_domain in @pornsites")
            .groupby("caseid")
            .size()
            .reset_index()
            .rename_column(0, "visits_adult")
        ),
        how="left",
        on="caseid",
        validate="1:1",
    )
    # Get total non-adult site visits
    .merge(
        (
            df.query("private_domain not in @pornsites")
            .groupby("caseid")
            .size()
            .reset_index()
            .rename_column(0, "visits_nonadult")
        ),
        how="left",
        on="caseid",
        validate="1:1",
    )
    # If adult or non-adult (exclusive) is nan, impute as zero
    .assign(
        visits_adult=lambda df: np.where(
            pd.isna(df.visits_adult) & ~pd.isna(df.visits_nonadult), 0, df.visits_adult
        )
    )
    .assign(
        visits_nonadult=lambda df: np.where(
            ~pd.isna(df.visits_adult) & pd.isna(df.visits_nonadult),
            0,
            df.visits_nonadult,
        )
    )  # If adult or non-adult (exclusive) is nan, impute as zero
    .assign(
        visits_adult=lambda df: np.where(
            pd.isna(df.visits_adult) & ~pd.isna(df.visits_nonadult), 0, df.visits_adult
        )
    )
    .assign(
        visits_nonadult=lambda df: np.where(
            ~pd.isna(df.visits_adult) & pd.isna(df.visits_nonadult),
            0,
            df.visits_nonadult,
        )
    )
    # Get total visits
    .assign(
        visits=lambda df: df.visits_adult + df.visits_nonadult,
        prop_adult_visits=lambda df: 100 * (df.visits_adult / df.visits),
    )
    .assign(rep=lambda df: np.where(df.party == "R", 1, 0))
    # Get adult visits duration
    .merge(
        (
            df.query("private_domain in @pornsites")
            .groupby("caseid")["visit_duration"]
            .sum()
            .reset_index()
            .rename_column("visit_duration", "duration_adult")
        ),
        how="left",
        on="caseid",
        validate="1:1",
    )
    # Get non-adult visits duration
    .merge(
        (
            df.query("private_domain not in @pornsites")
            .groupby("caseid")["visit_duration"]
            .sum()
            .reset_index()
            .rename_column("visit_duration", "duration_nonadult")
        ),
        how="left",
        on="caseid",
        validate="1:1",
    )
    # If adult or non-adult (exclusive) duration is nan, impute as zero
    .assign(
        duration_adult=lambda df: np.where(
            pd.isna(df.duration_adult) & ~pd.isna(df.duration_nonadult),
            0,
            df.duration_adult,
        )
    )
    .assign(
        duration_nonadult=lambda df: np.where(
            ~pd.isna(df.duration_adult) & pd.isna(df.duration_nonadult),
            0,
            df.duration_nonadult,
        )
    )
    # Get total duration
    .assign(
        duration=lambda df: df.duration_adult + df.duration_nonadult,
        prop_adult_duration=lambda df: 100 * (df.duration_adult / df.duration),
    )
    # Define rep dummy
    .case_when(
        lambda df: df.party == "R",
        1,
        lambda df: df.party == "D",
        0,
        np.nan,
        column_name="rep",
    )
    # Define dummy for visiting adultsites
    .case_when(
        lambda df: df.visits_adult > 0,
        1,
        lambda df: df.visits_adult == 0,
        0,
        np.nan,
        column_name="yes_visit_adults",
    )
    # Collapse race into 5 groups
    .case_when(lambda df: df.race >= 5, 5, lambda df: df.race, column_name="race2")
    # Collapse birthyr into cohorts (e.g. 1960s, 1970s,)
    .assign(cohort=lambda df: df.birthyr.round(decimals=-1))
    # Age
    .assign(age=lambda df: 2022 - df.birthyr + 1)
    .assign(age2=lambda df: df.age**2)
    # Collapse educ to 4 groups
    .case_when(
        lambda df: df.educ == 1,
        1,  # did not grad HS
        lambda df: df.educ == 2,
        2,  # HS
        lambda df: df.educ == 3,
        3,  # some college
        lambda df: df.educ >= 4,
        4,  # College grad
        np.nan,
        column_name="educ2",
    )
    .assign(educ2=lambda df: df.educ2.apply(int))
)
assert df_ind.prop_adult_visits.min() >= 0
assert df_ind.prop_adult_visits.max() <= 100
df_ind

  return method(self._obj, *args, **kwargs)
  return method(self._obj, *args, **kwargs)
  return method(self._obj, *args, **kwargs)
  return method(self._obj, *args, **kwargs)


Unnamed: 0,caseid,birthyr,gender,race,educ,pid3,pid7,presvote20post,inputstate,region,party,visits_adult,visits_nonadult,visits,prop_adult_visits,rep,duration_adult,duration_nonadult,duration,prop_adult_duration,yes_visit_adults,race2,cohort,age,age2,educ2
0,200661421,1963,2,1,4,3,3,1,39,2,D,0.0,1728.0,1728.0,0.000000,0.0,0.0,77642.0,77642.0,0.000000,0.0,1,1960,60,3600,4
1,200686597,1992,2,6,5,5,8,-1,48,3,,246.0,3991.0,4237.0,5.805995,,25962.0,234186.0,260148.0,9.979704,1.0,5,1990,31,961,4
2,200953869,1959,2,1,5,2,7,2,42,1,R,0.0,20.0,20.0,0.000000,1.0,0.0,1182.0,1182.0,0.000000,0.0,1,1960,64,4096,4
3,201302005,1966,2,2,3,5,8,1,12,3,,0.0,547.0,547.0,0.000000,,0.0,30021.0,30021.0,0.000000,0.0,2,1970,57,3249,3
4,201590505,1977,1,4,5,3,3,1,6,4,D,3.0,11083.0,11086.0,0.027061,0.0,78.0,383083.0,383161.0,0.020357,1.0,4,1980,46,2116,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,1802885,1935,1,1,6,4,5,2,12,3,R,,,,,1.0,,,,,,1,1940,88,7744,4
1196,1809723,1943,1,1,2,1,1,1,51,3,D,0.0,9805.0,9805.0,0.000000,0.0,0.0,1272613.0,1272613.0,0.000000,0.0,1,1940,80,6400,2
1197,1827351,1947,1,1,5,3,4,1,31,2,I,0.0,543.0,543.0,0.000000,,0.0,23929.0,23929.0,0.000000,0.0,1,1950,76,5776,4
1198,1924249,1980,1,7,4,2,7,2,6,4,R,38.0,21523.0,21561.0,0.176244,1.0,760.0,311210.0,311970.0,0.243613,1.0,5,1980,43,1849,4


In [13]:
df_ind.to_csv("../data/individual_browsing_data_alphamountain.csv", index=False)