In [1]:
import numpy as np
import pandas as pd

In [2]:
train_set = pd.read_csv("data/train_users.csv")
test_set = pd.read_csv("data/test_users.csv")
test_set["country_destination"] = "NDF"
df = pd.concat([train_set, test_set], axis=0)

In [3]:
def replace_with_nan(df, column_name, nan):
    df[column_name] = df[column_name].replace(nan, np.nan)


def split_date_column(df, column_name, date_format=None):
    if date_format:
        df[column_name] = pd.to_datetime(df[column_name], format=date_format)
    else:
        df[column_name] = pd.to_datetime(df[column_name])

    col_index = df.columns.get_loc(column_name)
    df.insert(col_index + 1, f"{column_name}_year", df[column_name].dt.year)
    df.insert(col_index + 2, f"{column_name}_month", df[column_name].dt.month)
    df.insert(col_index + 3, f"{column_name}_day", df[column_name].dt.day)
    df.drop(columns=[column_name], inplace=True)
    return df


def count_nonzeros(column):
    column = column.fillna(0)
    column = np.where(column > 1, 1, column)
    return np.sum(column)

In [4]:
replace_with_nan(df, "gender", "-unknown-")
replace_with_nan(df, "first_affiliate_tracked", "untracked")
replace_with_nan(df, "first_device_type", "Other/Unknown")
replace_with_nan(df, "first_browser", "-unknown-")

In [5]:
df = split_date_column(df, "date_account_created")
df = split_date_column(df, "timestamp_first_active", date_format="%Y%m%d%H%M%S")
# df = split_date_column(df, "date_first_booking")
df.drop(columns=["date_first_booking"], inplace=True)

In [6]:
# bins = list(np.arange(0, 120, 5)) + [np.int64(120), np.int64(200)]
# labels = [f"{bins[i]}-{bins[i+1]-1}" for i in range(len(bins) - 2)] + [f"{bins[-2]}+"]

# Maybe took age by year of birth by mistake
age_temp = np.where(df["age"] >= 1915, 2015 - df["age"], df["age"])
# Kick out brats and undeads
age_temp = np.where((age_temp < 13) | (age_temp > 120), np.nan, age_temp)

# df["age"] = pd.cut(df["age"], bins=bins, labels=labels, right=False)
df = df.rename(columns={"age": "age_group"})
df["age_group"] = age_temp // 5

In [7]:
countries = pd.read_csv("data/countries.csv")
countries

Unnamed: 0,country_destination,lat_destination,lng_destination,distance_km,destination_km2,destination_language,language_levenshtein_distance
0,AU,-26.853388,133.27516,15297.744,7741220.0,eng,0.0
1,CA,62.393303,-96.818146,2828.1333,9984670.0,eng,0.0
2,DE,51.165707,10.452764,7879.568,357022.0,deu,72.61
3,ES,39.896027,-2.487694,7730.724,505370.0,spa,92.25
4,FR,46.232193,2.209667,7682.945,643801.0,fra,92.06
5,GB,54.63322,-3.432277,6883.659,243610.0,eng,0.0
6,IT,41.87399,12.564167,8636.631,301340.0,ita,89.4
7,NL,52.133057,5.29525,7524.3203,41543.0,nld,63.22
8,PT,39.553444,-7.839319,7355.2534,92090.0,por,95.45
9,US,36.966427,-95.84403,0.0,9826675.0,eng,0.0


In [8]:
df = pd.merge(
    df,
    countries[
        [
            "country_destination",
            "distance_km",
            "destination_km2",
            "language_levenshtein_distance",
        ]
    ].rename(
        columns={
            "distance_km": "destination_distance_km",
            "destination_km2": "destination_area",
            "language_levenshtein_distance": "destination_language_levenshtein_distance",
        }
    ),
    on="country_destination",
    how="left",
)

columns = [col for col in df.columns if col != "country_destination"] + [
    "country_destination"
]
df = df[columns]

In [9]:
df = df.astype(
    {
        "gender": "category",
        "signup_method": "category",
        "signup_flow": "category",
        "language": "category",
        "affiliate_channel": "category",
        "affiliate_provider": "category",
        "first_affiliate_tracked": "category",
        "signup_app": "category",
        "first_device_type": "category",
        "first_browser": "category",
        "country_destination": "category",
    }
)

In [10]:
df

Unnamed: 0,id,date_account_created_year,date_account_created_month,date_account_created_day,timestamp_first_active_year,timestamp_first_active_month,timestamp_first_active_day,gender,age_group,signup_method,...,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,destination_distance_km,destination_area,destination_language_levenshtein_distance,country_destination
0,gxn3p5htnn,2010,6,28,2009,3,19,,,facebook,...,direct,direct,,Web,Mac Desktop,Chrome,,,,NDF
1,820tgsjxq7,2011,5,25,2009,5,23,MALE,7.0,facebook,...,seo,google,,Web,Mac Desktop,Chrome,,,,NDF
2,4ft3gnwmtx,2010,9,28,2009,6,9,FEMALE,11.0,basic,...,direct,direct,,Web,Windows Desktop,IE,0.0,9826675.0,0.0,US
3,bjjt8pjhuk,2011,12,5,2009,10,31,FEMALE,8.0,facebook,...,direct,direct,,Web,Mac Desktop,Firefox,,,,other
4,87mebub9p4,2010,9,14,2009,12,8,,8.0,basic,...,direct,direct,,Web,Mac Desktop,Chrome,0.0,9826675.0,0.0,US
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275542,cv0na2lf5a,2014,9,30,2014,9,30,,6.0,basic,...,direct,direct,,Web,Windows Desktop,IE,,,,NDF
275543,zp8xfonng8,2014,9,30,2014,9,30,,,basic,...,direct,direct,,Android,Android Phone,,,,,NDF
275544,fa6260ziny,2014,9,30,2014,9,30,,,basic,...,direct,direct,linked,Web,Windows Desktop,Firefox,,,,NDF
275545,87k0fy4ugm,2014,9,30,2014,9,30,,,basic,...,sem-brand,google,omg,Web,Mac Desktop,Safari,,,,NDF


In [11]:
df = pd.get_dummies(
    df,
    columns=[
        col
        for col in df.columns
        if (col not in ["id", "country_destination"])
        and (col in df.select_dtypes(include=["category"]).columns)
    ],
    drop_first=False,
    dummy_na=True,
)

In [12]:
session = pd.read_feather("data/preprocessed/sessions_grouped")
session

Unnamed: 0,user_id,count,action_10,action_11,action_12,action_15,action_BLANK,action_about_us,action_accept_decline,action_account,...,device_type_Chromebook,device_type_Linux Desktop,device_type_Mac Desktop,device_type_Opera Phone,device_type_Tablet,device_type_Windows Desktop,device_type_Windows Phone,device_type_iPad Tablet,device_type_iPhone,device_type_iPodtouch
0,00023iyk9l,40,0,0,0,0,1,0,0,0,...,0,0,36,0,0,0,0,0,4,0
1,0010k6l0om,63,0,0,0,0,0,0,0,0,...,0,0,63,0,0,0,0,0,0,0
2,001wyh0pz8,90,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0028jgx1x1,31,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,002qnbzfs5,789,9,0,0,0,7,0,0,0,...,0,0,0,0,0,0,0,0,775,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135478,zzxox7jnrx,89,0,0,0,0,0,0,0,0,...,0,0,0,0,0,89,0,0,0,0
135479,zzy7t0y9cm,8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,8,0,0,0,0
135480,zzysuoqg6x,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,3,0,0,0,0
135481,zzywmcn0jv,51,0,0,0,0,2,0,0,0,...,0,0,0,0,0,51,0,0,0,0


In [13]:
cols_for_booking = [
    "action_apply_coupon_click_success",
    "action_apply_reservation",
    "action_booking",
    "action_change_availability",
    "action_change_currency",
    "action_coupon_code_click",
    "action_pay",
    "action_payment_methods",
    "action_print_confirmation",
    "action_rate",
    "action_receipt",
    "action_recent_reservations",
    "action_detail_apply_coupon",
    "action_detail_apply_coupon_click",
    "action_detail_apply_coupon_click_success",
    "action_detail_apply_coupon_error",
    "action_detail_booking",
    "action_detail_book_it",
    "action_detail_change_availability",
    "action_detail_change_or_alter",
    "action_detail_create_payment_instrument",
    "action_detail_modify_reservations",
]

cols_for_non_english = [
    "action_ajax_google_translate",
    "action_ajax_google_translate_description",
    "action_ajax_google_translate_reviews",
    "action_change_currency",
    "action_country_options",
    "action_south-america",
    "action_southern-europe",
    "action_spoken_languages",
    "action_detail_translate_listing_reviews",
    "action_detail_translations",
    "action_languages_multiselect",
    "action_spoken_languages",
    "action_detail_user_languages",
]

session["booking"] = session[cols_for_booking].sum(axis=1, skipna=True)
session["not_so_english"] = session[cols_for_non_english].sum(axis=1, skipna=True)

In [14]:
session

Unnamed: 0,user_id,count,action_10,action_11,action_12,action_15,action_BLANK,action_about_us,action_accept_decline,action_account,...,device_type_Mac Desktop,device_type_Opera Phone,device_type_Tablet,device_type_Windows Desktop,device_type_Windows Phone,device_type_iPad Tablet,device_type_iPhone,device_type_iPodtouch,booking,not_so_english
0,00023iyk9l,40,0,0,0,0,1,0,0,0,...,36,0,0,0,0,0,4,0,0,0
1,0010k6l0om,63,0,0,0,0,0,0,0,0,...,63,0,0,0,0,0,0,0,0,0
2,001wyh0pz8,90,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0028jgx1x1,31,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,002qnbzfs5,789,9,0,0,0,7,0,0,0,...,0,0,0,0,0,0,775,0,17,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135478,zzxox7jnrx,89,0,0,0,0,0,0,0,0,...,0,0,0,89,0,0,0,0,0,0
135479,zzy7t0y9cm,8,0,0,0,0,0,0,0,0,...,0,0,0,8,0,0,0,0,0,0
135480,zzysuoqg6x,3,0,0,0,0,0,0,0,0,...,0,0,0,3,0,0,0,0,0,0
135481,zzywmcn0jv,51,0,0,0,0,2,0,0,0,...,0,0,0,51,0,0,0,0,0,0


In [15]:
df = df.merge(session, how="left", left_on="id", right_on="user_id")
df.drop(columns=["user_id"], inplace=True)

In [16]:
nonzero_counts = df.drop(columns=["id", "country_destination"]).apply(
    count_nonzeros, axis=0
)
low_frequency_cols = nonzero_counts[nonzero_counts <= 10].index
print("Columns to remove:", len(low_frequency_cols))

Columns to remove: 142


In [17]:
df = df.drop(columns=low_frequency_cols)
for col in df.columns:
    if col not in ["country_destination"]:
        df[col].fillna(-42424242)
df

Unnamed: 0,id,date_account_created_year,date_account_created_month,date_account_created_day,timestamp_first_active_year,timestamp_first_active_month,timestamp_first_active_day,age_group,destination_distance_km,destination_area,...,device_type_Linux Desktop,device_type_Mac Desktop,device_type_Tablet,device_type_Windows Desktop,device_type_Windows Phone,device_type_iPad Tablet,device_type_iPhone,device_type_iPodtouch,booking,not_so_english
0,gxn3p5htnn,2010,6,28,2009,3,19,,,,...,,,,,,,,,,
1,820tgsjxq7,2011,5,25,2009,5,23,7.0,,,...,,,,,,,,,,
2,4ft3gnwmtx,2010,9,28,2009,6,9,11.0,0.0,9826675.0,...,,,,,,,,,,
3,bjjt8pjhuk,2011,12,5,2009,10,31,8.0,,,...,,,,,,,,,,
4,87mebub9p4,2010,9,14,2009,12,8,8.0,0.0,9826675.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275542,cv0na2lf5a,2014,9,30,2014,9,30,6.0,,,...,0.0,0.0,0.0,89.0,0.0,0.0,4.0,0.0,0.0,1.0
275543,zp8xfonng8,2014,9,30,2014,9,30,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
275544,fa6260ziny,2014,9,30,2014,9,30,,,,...,0.0,0.0,0.0,78.0,0.0,0.0,0.0,0.0,0.0,0.0
275545,87k0fy4ugm,2014,9,30,2014,9,30,,,,...,0.0,11.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0


In [18]:
test_idx = df["id"].isin(test_set["id"])
train_data = df[~test_idx]
test_data = df[test_idx].drop(columns=["country_destination"])

train_data.to_feather("data/preprocessed/train_data")
test_data.to_feather("data/preprocessed/test_data")