In [1]:
import numpy as np
import pandas as pd
import os
import pickle

from imblearn.over_sampling import SMOTENC
from imblearn.combine import SMOTEENN
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split



In [2]:
# Mount Google Drive inside the Colab runtime so the paths below resolve.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Project folder on the mounted drive and the dataset folder inside it.
basepath = '/content/drive/My Drive/AML - AIRBNB/'
datapath = os.path.join(basepath, 'airbnb-recruiting-new-user-bookings')


def _nested_csv(stem):
  """Return the path of `<stem>.csv` located in a folder that carries the same name."""
  filename = f'{stem}.csv'
  return os.path.join(datapath, f'{filename}/{filename}')


train_users_path = _nested_csv('train_users_2')
sessions_path = _nested_csv('sessions')

# SESSION DATA MANIPULATION

In [0]:
#sessions
# Read the raw session log; the '-unknown-' placeholder is parsed as a missing value.
df_sessions = pd.read_csv(sessions_path, na_values='-unknown-')

# Rows without a user id are dropped, and the key column is exposed as `id`
# (appended as the last column, `user_id` removed).
df_sessions = (
    df_sessions[df_sessions['user_id'].notna()]
    .assign(id=lambda frame: frame['user_id'])
    .drop(columns=['user_id'])
)

#session-secs
# Negative durations are invalid. They are blanked *before* imputation so that
# they neither bias the means nor survive as NaN into the later binning step.
df_sessions.loc[df_sessions['secs_elapsed'] < 0, 'secs_elapsed'] = np.nan

dgr_sess = df_sessions.groupby(['id'], sort=False)

# First choice: the user's own mean duration (vectorised group mean instead of
# a Python-level lambda per group; the filled values are the same).
user_mean_secs = dgr_sess['secs_elapsed'].transform('mean')
df_sessions['secs_elapsed'] = df_sessions['secs_elapsed'].fillna(user_mean_secs)

# Fallback for users with no valid duration at all: the overall mean, computed
# after the per-user fill (as in the original two-step imputation).
df_sessions['secs_elapsed'] = df_sessions['secs_elapsed'].fillna(df_sessions['secs_elapsed'].mean())

In [0]:
# Missing categorical values become an explicit 'NA' category so they are
# counted like any other value when the columns are pivoted.
for column in ('action', 'action_type', 'action_detail', 'device_type'):
    df_sessions[column] = df_sessions[column].fillna('NA')

#Action values with low frequency are changed to 'OTHER'
act_freq = 100  #Threshold for frequency
action_frequency = df_sessions.groupby('action')['action'].transform('size')
# Assign the result back instead of calling `.where(..., inplace=True)` on the
# attribute-accessed column: the chained in-place form mutates a temporary
# object under pandas Copy-on-Write and would leave `df_sessions` unchanged.
df_sessions['action'] = df_sessions['action'].where(action_frequency > act_freq, 'OTHER')

# Negative durations are invalid and are treated as missing.
df_sessions.loc[df_sessions['secs_elapsed'] < 0, 'secs_elapsed'] = np.nan

def marginal_stats(feature, sessions=None):
  """Build per-user occurrence counts for every value of a session column.

  Parameters
  ----------
  feature : str
      Name of the categorical column to tabulate.
  sessions : pandas.DataFrame, optional
      Frame holding an ``id`` column and the ``feature`` column. Defaults to
      the module-level ``df_sessions``.

  Returns
  -------
  pandas.DataFrame
      Indexed by ``id`` with one ``<feature>_<value>_count`` column per
      observed value, plus ``<feature>_std`` (row-wise standard deviation of
      the counts) and ``<feature>_total_count`` (number of distinct values the
      user has at least one occurrence of).
  """
  if sessions is None:
    sessions = df_sessions

  counts = (
      sessions.groupby(["id", feature]).size().unstack(fill_value=0)
      .add_prefix(f"{feature}_")
      .add_suffix("_count")
  )

  # Both summaries are derived from the count columns only. Computing the
  # second one on the frame that already contains the std column would count
  # that column as an extra "value" whenever the std is positive.
  piv = counts.copy()
  piv[f"{feature}_std"] = counts.std(axis=1)
  piv[f"{feature}_total_count"] = counts.gt(0).sum(axis=1)
  return piv

# One per-user count table for each categorical session column, built in this order.
(action_val_piv,
 action_detail_piv,
 action_type_piv,
 device_type_piv) = map(marginal_stats,
                        ("action", "action_detail", "action_type", "device_type"))

In [0]:
# Per-user summaries of the elapsed seconds. Sum/mean/std/median are stored on
# a log1p scale; the *_breaks columns count events above 86400 s (one day),
# above 300000 s, and below 3600 s (one hour).
dgr_sess = df_sessions.groupby(['id'], sort=False)
secs_df = dgr_sess['secs_elapsed'].agg(secs_sum=lambda x: np.log1p(np.sum(x)),
                                      secs_mean=lambda x: np.log1p(np.mean(x)),
                                      secs_std=lambda x: np.log1p(np.std(x)),
                                      secs_median=lambda x: np.log1p(np.median(x)),
                                      secs_day_breaks=lambda x: np.sum(x > 86400),
                                      secs_long_breaks=lambda x: np.sum(x > 300000),
                                      secs_short_breaks=lambda x: np.sum(x < 3600)
                                      )

# Split log1p(duration) into 10 equal-width bins. Rows whose duration is
# missing get no bin; they are left out of the histogram instead of making the
# integer cast fail.
log_secs = np.log1p(df_sessions["secs_elapsed"])
df_sessions["secs_bin"] = pd.cut(log_secs, 10, labels=False)
secs_bins = (df_sessions[["id", "secs_bin"]]
             .dropna(subset=["secs_bin"])
             .astype({"secs_bin": int}))

# Number of events per user and bin. 'size' counts rows; the previous 'sum'
# had no value column to aggregate, so it did not produce the counts that the
# "_count" suffix promises.
secs_piv = secs_bins.pivot_table(index="id", columns='secs_bin',
                              aggfunc='size', fill_value=0)
# Users whose durations are all missing still get a row (all zeros).
secs_piv = secs_piv.reindex(secs_df.index, fill_value=0)
secs_piv = secs_piv.add_prefix("secs_bin_").add_suffix("_count")

In [0]:
# Column-wise union of every per-user session table; rows are aligned on the user id index.
session_feature_tables = [
    action_val_piv,
    action_detail_piv,
    action_type_piv,
    device_type_piv,
    secs_df,
    secs_piv,
]
session_final = pd.concat(session_feature_tables, axis=1, sort=False)

# USERS DATA MANIPULATION

In [0]:
# Both user files share the same two date columns and are indexed by user id.
user_date_cols = ['date_account_created', 'timestamp_first_active']

df_train = pd.read_csv(train_users_path, parse_dates=user_date_cols).set_index("id")
# The label is removed from the feature frame and kept aside.
target = df_train.pop('country_destination')

test_users_path = os.path.join(datapath, 'test_users.csv/test_users.csv')
test_data = pd.read_csv(test_users_path, parse_dates=user_date_cols).set_index('id')

In [0]:
# Remember where the training rows end so the stacked frame can be split again later.
train_idx = len(df_train)

# Stack train and test so that every encoding below sees the same categories.
# `DataFrame.append` no longer exists in pandas >= 2.0; `pd.concat` produces the
# same row-wise stack and is what the rest of this notebook already uses.
train_test = pd.concat([df_train, test_data], sort=False)

train_test = train_test.drop(columns=['date_first_booking'])

# Number of missing fields per user, counted before any imputation below.
train_test['nulls'] = train_test.isna().sum(axis=1)

# Account-creation date: milliseconds since 1970-01-01 plus weekday / month dummies.
account_created = train_test.date_account_created
train_test["dac_epochs"] = (account_created - pd.Timestamp("1970-01-01")) // pd.Timedelta('1ms')

train_test_wd = pd.get_dummies(account_created.dt.dayofweek, prefix='dac_wd_', drop_first=True)
train_test_m = pd.get_dummies(account_created.dt.month, prefix='dac_m_', drop_first=True)

# The raw date column is replaced by its encoded counterparts.
train_test = pd.concat(
    (train_test.drop(columns=['date_account_created']), train_test_wd, train_test_m),
    axis=1,
    sort=False,
)

# First-activity timestamp: milliseconds since 1970-01-01 plus weekday / month / hour dummies.
first_active = train_test.timestamp_first_active
train_test["tfa_epochs"] = (first_active - pd.Timestamp("1970-01-01")) // pd.Timedelta('1ms')

train_test_tfa_wd = pd.get_dummies(first_active.dt.dayofweek, prefix='tfa_wd_', drop_first=True)
train_test_tfa_m = pd.get_dummies(first_active.dt.month, prefix='tfa_m_', drop_first=True)
train_test_tfa_h = pd.get_dummies(first_active.dt.hour, prefix='tfa_h_', drop_first=True)

# The raw timestamp column is replaced by its encoded counterparts.
train_test = pd.concat(
    [train_test.drop(columns=['timestamp_first_active']),
     train_test_tfa_wd, train_test_tfa_m, train_test_tfa_h],
    axis=1,
    sort=False,
)

# Gap between account creation and first activity (in ms) and the sign of that gap.
train_test['dac_tfa_msecs'] = train_test.dac_epochs.sub(train_test.tfa_epochs)
train_test['sig_dac_tfa'] = np.sign(train_test.dac_tfa_msecs)



# Values strictly between 1900 and 2010 are read as birth years and converted
# to an age relative to 2014.
birth_year_rows = train_test.age.gt(1900) & train_test.age.lt(2010)
train_test.loc[birth_year_rows, 'age'] = 2014 - train_test.loc[birth_year_rows, 'age']

# Non-positive ages become missing; remaining ages below 14 are raised to 14.
train_test.loc[train_test.age.le(0), 'age'] = np.nan
train_test.loc[train_test.age.lt(14), 'age'] = 14

# Values strictly between 2010 and 2019 become missing; anything still above 99 is capped at 99.
recent_year_rows = train_test.age.gt(2010) & train_test.age.lt(2019)
train_test.loc[recent_year_rows, 'age'] = np.nan
train_test.loc[train_test.age.gt(99), 'age'] = 99

# Whatever is missing at this point receives the mean of the cleaned ages.
train_test["age"] = train_test.age.fillna(train_test.age.mean())


# Age is additionally exposed as 20 equal-width intervals, one-hot encoded
# (first interval dropped); the numeric `age` column itself is kept.
age_interval = pd.cut(train_test.age, bins=20, labels=False).astype(int)
train_test_ai = pd.get_dummies(age_interval, prefix='age_interv_', drop_first=True)
train_test = pd.concat((train_test, train_test_ai), axis=1, sort=False)

In [0]:
# Categorical user attributes that are replaced by one-hot columns.
ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language',
             'affiliate_channel', 'affiliate_provider',
             'first_affiliate_tracked', 'signup_app',
             'first_device_type', 'first_browser']

# Encode every attribute first, then swap the raw columns for the dummies in
# one concat; the dummy blocks end up appended in the order of `ohe_feats`.
dummy_blocks = [
    pd.get_dummies(train_test[column], prefix=f"{column}_", drop_first=True)
    for column in ohe_feats
]
train_test = pd.concat(
    [train_test.drop(columns=ohe_feats), *dummy_blocks],
    axis=1,
    sort=False,
)




# MERGING AND TRAIN-TEST-VALIDATION SPLIT

In [0]:
# Left join on the user id index: every user is kept, session columns stay
# empty for users that do not appear in `session_final`.
df_all = train_test.merge(
    session_final,
    how='left',
    left_index=True,
    right_index=True,
)

In [0]:
# A row is flagged 1 when it has no missing value in any column.
row_is_complete = df_all.notna().all(axis=1)
df_all['has_session'] = row_is_complete.astype(int)

# Keep only the columns that hold at least one value different from the first row.
differs_from_first_row = (df_all != df_all.iloc[0]).any()
df_all = df_all.loc[:, differs_from_first_row]

# Missing features for samples without session data.
df_all = df_all.fillna(-1)

In [0]:
# Undo the earlier stacking: the first `train_idx` rows are the training users.
df_train, df_test = df_all.iloc[:train_idx], df_all.iloc[train_idx:]

In [0]:
#####Computing X, y and X_test ################
# Destination labels are mapped to consecutive integer codes.
le = LabelEncoder()

y = le.fit_transform(target.values)

# Hold out 25% of the training users for validation. The split is seeded with
# the same value as the resampler below so that the pickled train/validation
# sets are identical from one run to the next.
X_train, X_val, y_train, y_val = train_test_split(df_train, y, test_size = 0.25,
                                                  random_state=42)


# OVERSAMPLING + UNDERSAMPLING

In [0]:
# Substrings that identify categorical (one-hot / flag) columns.
cat_strings = ohe_feats + ['age_interv', 'tfa_', 'dac_', 'booked', 'has_session']
# Columns containing any of these substrings are numeric and never treated as categorical.
non_categorical_markers = ('action', 'epochs', 'msecs')

# Positional indices of the categorical columns. `any(...)` yields each index
# at most once, even when a column name matches several entries of
# `cat_strings` (the nested-loop form repeated the index once per match).
cat_features = [
    idx for idx, column in enumerate(X_train.columns)
    if any(token in column for token in cat_strings)
    and not any(marker in column for marker in non_categorical_markers)
]

In [0]:
def get_sample_size(y):
  """Return the target number of samples per class for resampling.

  Each class keeps its current count unless that count is below one third
  (integer division) of the largest class, in which case it is raised to that
  value. The result maps class label -> target count; an empty `y` gives `{}`.
  """
  labels, freq = np.unique(y, return_counts=True)
  floor = freq.max() // 3 if freq.size else 0
  return {label: max(n, floor) for label, n in zip(labels, freq)}

In [0]:
# Oversampler for mixed categorical / continuous features; the per-class
# targets come from `get_sample_size` and the seed is fixed.
resampler = SMOTENC(
    sampling_strategy=get_sample_size,
    categorical_features=cat_features,
    random_state=42,
)

In [0]:
# Combined resampler that uses the SMOTENC instance above as its oversampling step.
refiner = SMOTEENN(smote=resampler)

In [0]:
# Resample the training split only; the validation split is left untouched.
X_train_resampled_new, y_train_resampled_new = refiner.fit_resample(X_train,
                                                                    y_train)



In [0]:
# Wrap the resampled features in a DataFrame that carries the original column names.
X_train_resampled = pd.DataFrame(X_train_resampled_new, columns=X_train.columns)

# SAVING DATA

In [0]:
# Every artefact is pickled into the project folder; label arrays are wrapped
# in a Series so that they can be written with `to_pickle` as well.
artifacts = (
    ('X_train_v2.pkl', X_train_resampled),
    ('y_train_v2.pkl', pd.Series(y_train_resampled_new)),
    ('X_test.pkl', df_test),
    ('X_val.pkl', X_val),
    ('y_val.pkl', pd.Series(y_val)),
)
for filename, payload in artifacts:
    payload.to_pickle(os.path.join(basepath, filename))