In [9]:
import pandas as pd
import numpy as np

In [10]:
# Load data/train_users.csv; the cells below transform `df` step by step.
df = pd.read_csv("data/train_users.csv")

In [11]:
def split_date_column(df, column_name, date_format=None):
    """Replace a date column with separate year, month and day columns.

    The column is parsed with ``pd.to_datetime`` and swapped for three new
    columns named ``<column_name>_year``, ``<column_name>_month`` and
    ``<column_name>_day``, placed at the position the original column
    occupied.

    The input frame is left untouched: a new DataFrame is returned, so the
    calling cell can be re-run safely and other references to ``df`` do not
    change behind the caller's back.

    Args:
        df: DataFrame containing ``column_name``.
        column_name: Name of the column holding the dates to split.
        date_format: Optional ``strftime``-style format passed to
            ``pd.to_datetime``. Any falsy value (``None``, ``""``) lets
            pandas infer the format.

    Returns:
        A new DataFrame with ``column_name`` removed and the three component
        columns inserted in its place. Missing dates (NaT) produce NaN
        components, which makes those component columns float-typed.

    Raises:
        KeyError: If ``column_name`` is not a column of ``df``.
        ValueError: If a value cannot be parsed, or if one of the generated
            column names already exists in ``df``.
    """
    # `format=None` is pandas' default, so a single call covers both cases.
    parsed = pd.to_datetime(df[column_name], format=date_format or None)

    position = df.columns.get_loc(column_name)
    # `drop` returns a new frame, so the inserts below never touch the input.
    result = df.drop(columns=[column_name])
    for offset, part in enumerate(("year", "month", "day")):
        result.insert(
            position + offset, f"{column_name}_{part}", getattr(parsed.dt, part)
        )
    return result

In [12]:
# Split every date-like column into year/month/day parts. Only
# `timestamp_first_active` needs an explicit format; the others are parsed
# with pandas' default inference.
for date_col, fmt in (
    ("date_account_created", None),
    ("timestamp_first_active", "%Y%m%d%H%M%S"),
    ("date_first_booking", None),
):
    df = split_date_column(df, date_col, date_format=fmt)

In [13]:
# Bin edges: 5-wide steps from 0 up to 120, then one catch-all bin [120, 200).
bins = [*np.arange(0, 120, 5), np.int64(120), np.int64(200)]
# Labels such as "0-4", "5-9", ..., "115-119", followed by "120+".
labels = [f"{low}-{high - 1}" for low, high in zip(bins[:-2], bins[1:-1])]
labels.append(f"{bins[-2]}+")

# Values of 1915 or more are converted as 2015 - value (presumably birth years
# entered in the age field -- confirm against the data source).
looks_like_birth_year = df["age"] >= 1915
age_temp = np.where(looks_like_birth_year, 2015 - df["age"], df["age"])

# Anything outside 13..120 after that conversion is treated as missing.
implausible_age = (age_temp < 13) | (age_temp > 120)
df["age"] = np.where(implausible_age, np.nan, age_temp)

# right=False makes each bin left-closed, so [0, 5) matches the "0-4" label.
df["age"] = pd.cut(df["age"], bins=bins, labels=labels, right=False)

In [14]:
# Load the per-destination lookup table and display it for reference.
countries = pd.read_csv("data/countries.csv")
countries

Unnamed: 0,country_destination,lat_destination,lng_destination,distance_km,destination_km2,destination_language,language_levenshtein_distance
0,AU,-26.853388,133.27516,15297.744,7741220.0,eng,0.0
1,CA,62.393303,-96.818146,2828.1333,9984670.0,eng,0.0
2,DE,51.165707,10.452764,7879.568,357022.0,deu,72.61
3,ES,39.896027,-2.487694,7730.724,505370.0,spa,92.25
4,FR,46.232193,2.209667,7682.945,643801.0,fra,92.06
5,GB,54.63322,-3.432277,6883.659,243610.0,eng,0.0
6,IT,41.87399,12.564167,8636.631,301340.0,ita,89.4
7,NL,52.133057,5.29525,7524.3203,41543.0,nld,63.22
8,PT,39.553444,-7.839319,7355.2534,92090.0,por,95.45
9,US,36.966427,-95.84403,0.0,9826675.0,eng,0.0


In [15]:
# Destination attributes to attach to each user row: source column -> new name.
destination_renames = {
    "distance_km": "destination_distance_km",
    "destination_km2": "destination_area",
    "language_levenshtein_distance": "destination_language_levenshtein_distance",
}
destination_features = countries[
    ["country_destination", *destination_renames]
].rename(columns=destination_renames)

# Left join keeps every user row; destinations that have no row in `countries`
# (e.g. "NDF" and "other") end up with NaN in the three new columns.
df = df.merge(destination_features, on="country_destination", how="left")

# Move `country_destination` to the last column position.
columns = df.columns[df.columns != "country_destination"].tolist()
columns.append("country_destination")
df = df[columns]

In [16]:
# Convert the columns listed below to pandas' `category` dtype.
# ('age' is not in the list: it is already categorical after `pd.cut`.)
categorical_columns = [
    "gender",
    "signup_method",
    "signup_flow",
    "language",
    "affiliate_channel",
    "affiliate_provider",
    "first_affiliate_tracked",
    "signup_app",
    "first_device_type",
    "first_browser",
    "country_destination",
]
df = df.astype({name: "category" for name in categorical_columns})

In [17]:
# Display the prepared frame (pandas abbreviates long/wide frames when rendering).
df

Unnamed: 0,id,date_account_created_year,date_account_created_month,date_account_created_day,timestamp_first_active_year,timestamp_first_active_month,timestamp_first_active_day,date_first_booking_year,date_first_booking_month,date_first_booking_day,...,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,destination_distance_km,destination_area,destination_language_levenshtein_distance,country_destination
0,gxn3p5htnn,2010,6,28,2009,3,19,,,,...,direct,direct,untracked,Web,Mac Desktop,Chrome,,,,NDF
1,820tgsjxq7,2011,5,25,2009,5,23,,,,...,seo,google,untracked,Web,Mac Desktop,Chrome,,,,NDF
2,4ft3gnwmtx,2010,9,28,2009,6,9,2010.0,8.0,2.0,...,direct,direct,untracked,Web,Windows Desktop,IE,0.0,9826675.0,0.0,US
3,bjjt8pjhuk,2011,12,5,2009,10,31,2012.0,9.0,8.0,...,direct,direct,untracked,Web,Mac Desktop,Firefox,,,,other
4,87mebub9p4,2010,9,14,2009,12,8,2010.0,2.0,18.0,...,direct,direct,untracked,Web,Mac Desktop,Chrome,0.0,9826675.0,0.0,US
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213446,zxodksqpep,2014,6,30,2014,6,30,,,,...,sem-brand,google,omg,Web,Mac Desktop,Safari,,,,NDF
213447,mhewnxesx9,2014,6,30,2014,6,30,,,,...,direct,direct,linked,Web,Windows Desktop,Chrome,,,,NDF
213448,6o3arsjbb4,2014,6,30,2014,6,30,,,,...,direct,direct,untracked,Web,Mac Desktop,Firefox,,,,NDF
213449,jh95kwisub,2014,6,30,2014,6,30,,,,...,other,other,tracked-other,iOS,iPhone,Mobile Safari,,,,NDF
