# Final Project Notebook
Group: 9
Group Members: Shawn Ericksen (ericksen@uark.edu), Garret Fulghum (gmfulghu@uark.edu), Wesley Parker (wgparker@uark.edu)

This practice project focuses on the Airbnb New User Bookings dataset. This can be accessed from: https://www.kaggle.com/competitions/airbnb-recruiting-new-user-bookings/data

## Kaggle Performance Info
Kaggle's scoring for this competition uses Normalized Discounted Cumulative Gain (NDCG), where up to 5 guesses of the destination country (ordered by confidence) are submitted per entry in the test data. A score of 1.0 reflects the first guess being correct, with fewer points awarded for the other scenarios (0.63 for the second guess being correct, and so on).

Q1/Q2/Q3 Kaggle scores: 0.85226 / 0.86555 / 0.86661

Notes:

The baseline, which is to always guess NDF-US-OTHER-FR-IT, receives an NDCG score of 0.8219 (78th percentile) on the Public Leaderboard. The NDCG score of the dummy model on the training data is 0.80676.

As for scikit's accuracy score, always guessing NDF results in a score of 0.58347 against the training data.

### Usage
Running the second code cell will prompt the user to choose between reading the data from CSV or from HDF5 (setting \_\_no_prompt__ to True skips the prompt and uses HDF5).

### Imports and reading dataset into memory

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import csv

from statistics import mean

from sklearnex import patch_sklearn 
patch_sklearn()

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
# from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline # , Pipeline
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer

from scipy.stats import uniform, loguniform

# from sklearn.experimental import enable_halving_search_cv
# from sklearn.model_selection import HalvingGridSearchCV

from tune_sklearn import TuneGridSearchCV, TuneSearchCV

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neural_network import MLPClassifier

from sklearn import set_config
set_config(display='diagram')

import xgboost as xgb
# from xgboost import XGBClassifier

import joblib

from joblib import parallel_backend
# from ray.util.joblib import register_ray

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [102]:
%%time
__no_prompt__ = False # If True, skips user input (defaults to HDF).
__no_hdf__ = False # If True, skips prompt and only reads from CSV.

data_folder = Path("airbnb-recruiting-new-user-bookings/") # Replace if using Kaggle Notebook
hdf_path = data_folder / "data.h5"

# The cache is only usable when it holds BOTH frames this cell defines: `df` (raw users)
# and `df_merge` (users joined with their per-user session flags). A cache that only
# contains `df` is rebuilt from CSV, because every later cell reads `df_merge`.
hdf_ready = False
if hdf_path.exists() and not __no_hdf__:
    with pd.HDFStore(str(hdf_path), mode='r') as store:
        hdf_ready = 'df' in store and 'df_merge' in store
use_hdf = hdf_ready and (__no_prompt__ or input("Do you want to use an HDF file [Y/n]:") != "n")

if use_hdf: # Read from hdf if available (much faster)
    df = pd.read_hdf(hdf_path, key='df')
    df_merge = pd.read_hdf(hdf_path, key='df_merge')
else:
    # --- Users table ---
    filepath = data_folder / "train_users_2.csv"
    dtypes = {
        'id': 'object',
        'date_account_created': 'string',
        'timestamp_first_active': 'string',
        'date_first_booking': 'string',
        'gender': 'category',
        'age': 'float64',
        'signup_method': 'category',
        'signup_flow': 'category',
        'language': 'category',
        'affiliate_channel': 'category',
        'affiliate_provider': 'category',
        'first_affiliate_tracked': 'category',
        'signup_app': 'category',
        'first_device_type': 'category',
        'first_browser': 'category',
        'country_destination': 'category',
    }
    parse_dates = ['date_account_created', 'timestamp_first_active', 'date_first_booking']
    df = pd.read_csv(filepath, dtype=dtypes, na_values=['-unknown-', '<NA>'], parse_dates=parse_dates
                     , infer_datetime_format=True)
    # Unparseable / missing booking dates become NaT instead of raising.
    df['date_first_booking'] = pd.to_datetime(df['date_first_booking'], format='%Y-%m-%d', errors='coerce')

    # Values 2000-2014 in `age` are treated as invalid and blanked out.
    df['age'] = df['age'].replace(range(2000, 2015), np.nan)
    # Bucket age into 5-year bands plus a final '100+' band; ages outside (0, 120] become NaN.
    # NOTE(review): pd.cut closes intervals on the right by default, so the band labelled
    # '0-4' actually covers (0, 5]. Left unchanged so train and test stay binned identically.
    df['age'] = pd.cut(df['age'], bins = [i*5 for i in range(0, 21)] + [120]
                       , labels=(['%d-%d' % (i*5, i*5+4) for i in range(0, 20)] + ['100+']))

    # --- Sessions table -> one row per user with a '0'/'1' flag per action_type ---
    filepath = data_folder / "sessions.csv"
    dtypes = {
        'user_id': 'string',
        'action': 'category',
        'action_type': 'category',
        'action_detail': 'category',
        'device_type': 'category',
        'secs_elapsed': 'float64',
    }
    df_session = pd.read_csv(filepath, dtype=dtypes, na_values=['-unknown-', 'NDF', '<NA>'])

    action_categories = list(df_session['action_type'].cat.categories)
    has_user = df_session['user_id'].notna()  # rows without a user id cannot be attributed
    # One indicator column per action_type category; rows with a missing action_type are all zero,
    # so users who only have missing action types still get an (all-'0') row.
    action_dummies = pd.get_dummies(df_session.loc[has_user, 'action_type'])
    action_dummies.columns = action_categories  # plain labels; also fails loudly if a category is missing
    user_flags = action_dummies.groupby(df_session.loc[has_user, 'user_id']).max()
    # Stored as category columns of '0'/'1' strings so the one-hot encoder treats them as categorical.
    df_action_types = (
        user_flags.astype(int).astype(str).astype('category')
        .rename_axis('user_id')
        .reset_index()
        .astype({'user_id': 'string'})
    )

    # Inner join: only users that have at least one session row are kept.
    df_merge = df.merge(df_action_types, left_on ='id', right_on='user_id').drop(columns=['user_id'])
    df_merge = df_merge.set_index('id')

    # Save both frames to hdf if reading from csv
    df.to_hdf(hdf_path, key='df', mode='w', format="table")
    df_merge.to_hdf(hdf_path, key='df_merge', mode='a', format="table")



Wall time: 53.6 s


In [104]:
# df_session.memory_usage(deep=True, index=False).sort_values(ascending=False)

In [None]:
# df.loc(lambda x: df['age'] == 1995)

In [None]:
# df_session['action'].cat.categories

In [None]:
# df_session['action_detail'].cat.categories

In [None]:
# for cat in df_session['action_type'].cat.categories:
#     new_column = pd.Series(name=cat, dtype='int8')
#     df.insert(new_column)

### Diagnostics

In [None]:
# Size of the merged training frame (users from train_users_2.csv joined with session flags).
print("- - train_users_2.csv - -")
print("Number of lines present: ", len(df_merge))
print("Number of Columns: ", len(df_merge.columns))

In [None]:
# Preview the first rows; ending the cell with the frame gives the rich table rendering.
topCount = 5
print(f"Top {topCount} rows:")
df_merge.head(topCount)

In [39]:
# Per-column memory footprint in bytes (index excluded), largest first.
df_merge.memory_usage(deep=True, index=False).sort_values(ascending=False)

user_id                    4945605
date_account_created        590520
timestamp_first_active      590520
date_first_booking          590520
first_browser                79329
language                     75846
age                          75668
affiliate_provider           75521
signup_flow                  75366
country_destination          74827
first_device_type            74737
affiliate_channel            74628
first_affiliate_tracked      74570
signup_app                   74233
signup_method                74113
gender                       74109
booking_request              74039
click                        74039
data                         74039
message_post                 74039
partner_callback             74039
submit                       74039
view                         74039
booking_response             74039
modify                       74039
dtype: int64

In [None]:
df_merge.memory_usage(deep=True, index=False).sum()

### Preprocessing

In [105]:
# Features: every column of the merged frame except the target.
X = df_merge.drop(columns=['country_destination'])
X = X.drop(columns=['date_account_created', 'timestamp_first_active', 'date_first_booking']) # TODO: engineer features from the datetime columns instead of dropping them.
# Target: the destination-country label.
y = df_merge['country_destination']

In [106]:
# Hold out 37% of the rows for local evaluation; random_state=0 keeps the shuffle repeatable.
# This block can be commented out when doing prediction on the Kaggle test.csv
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.37, random_state=0, shuffle=True)

In [107]:
# Shared preprocessing step used as the first stage of every pipeline below.
# - Every category-dtype column is one-hot encoded into a dense array; categories unseen at
#   fit time are ignored at transform time (handle_unknown='ignore') instead of raising.
# - All remaining columns are passed through unchanged.
# NOTE(review): `sparse=` was renamed `sparse_output=` in scikit-learn 1.2 - update when upgrading.
encoder = make_column_transformer(
    # (no numeric transformer: `age` is bucketed into a category column at load time)
    (OneHotEncoder(sparse=False, handle_unknown='ignore'), make_column_selector(dtype_include='category')),
    remainder='passthrough'
)
encoder

In [108]:
# Fit the label encoder on the full target so every destination gets a code, even a rare one
# that happens to land only in the hold-out split (fitting on y_train alone makes
# transform(y_test) raise on such a label). Label encoding carries no information about X,
# so this does not leak anything into training.
y_enc = LabelEncoder().fit(y.values)
y_train = y_enc.transform(y_train)
y_test = y_enc.transform(y_test.values)

In [109]:
# Persist the fitted label encoder: the Kaggle prediction section reloads
# models/LabelEncoder.pkl to turn predicted class indices back into country codes.
Path("models").mkdir(parents=True, exist_ok=True)
joblib.dump(y_enc, Path("models/LabelEncoder.pkl"))

['models\\LabelEncoder.pkl']

In [55]:
encoded_data = encoder.fit_transform(X)

In [56]:
encoded_data.shape

(73815, 157)

### NDCG Score Implementation

(1, 0, 0, 0, 0) will serve as the true ranking.

The prediction (NDF-US-OTHER-FR-IT) will be used as a dummy model. It will be transformed into an ndarray such that incorrect guesses are transformed to 0 and correct guesses to 1.

A first-rank correct guess generally appears as (1, 0, 0, 0, 0), a second-rank correct guess as (0, 1, 0, 0, 0), and no correct guess as (0, 0, 0, 0, 0).

In [57]:
def ndcg_score(y_true, y_pred, k):
    """Mean NDCG@k for single-label ranking.

    Each sample has exactly one relevant class, so the ideal DCG is 1 and NDCG
    equals DCG: 1 / log2(rank + 1) when the true class sits at 1-based position
    ``rank`` among the k highest-scored classes, and 0 otherwise.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Integer class index of the true label for each sample.
    y_pred : array-like of shape (n_samples, n_classes)
        Per-class scores/probabilities; column j is the score of class index j.
    k : int
        Number of top-ranked classes considered per sample.

    Returns
    -------
    float
        Average score over all samples (NaN for empty input).

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` have different numbers of samples
        (a plain ``zip`` would silently score only the common prefix).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if len(y_true) != len(y_pred):
        raise ValueError(
            "y_true and y_pred must have the same number of samples, got %d and %d"
            % (len(y_true), len(y_pred))
        )
    if len(y_true) == 0:
        return float('nan')

    # Indices of the k best-scored classes per row, best first.
    top_k = np.argsort(y_pred, axis=1)[:, :-(k + 1):-1]
    # 1 where the ranked class is the true class, else 0 (at most one hit per row).
    relevance = (top_k == y_true.reshape(-1, 1)).astype(int)
    # Positional discount log2(position + 1) for positions 1..k.
    discounts = np.log2(np.arange(2, top_k.shape[1] + 2))
    return np.mean(np.sum(relevance / discounts, axis=1))

# scikit-learn scorer wrapper around ndcg_score for cross-validation / hyper-parameter search.
# needs_proba=True requests predict_proba output as y_pred; k=5 is forwarded to ndcg_score.
# NOTE(review): newer scikit-learn releases replace `needs_proba` with `response_method` - confirm on upgrade.
ndcg_scorer = make_scorer(ndcg_score, needs_proba=True, k=5)

In [None]:
%%time
# NDCG for the dummy model NDF-US-other-FR-IT as a series of probabilities.
# The score vector is built from the label encoder rather than hard-coded by position,
# so it stays correct if the set or order of encoded classes ever changes.
dummy_ranking = ['NDF', 'US', 'other', 'FR', 'IT']
dummy_probs = np.zeros(len(y_enc.classes_))
# Halving scores (0.5, 0.25, ...) give a strict ordering that matches dummy_ranking.
dummy_probs[y_enc.transform(dummy_ranking)] = 0.5 ** np.arange(1, len(dummy_ranking) + 1)
y_pred = np.tile(dummy_probs, (df.shape[0], 1))

ndcg_score(y_enc.transform(df['country_destination'].values), y_pred, k=5)

### Logistic Regression

In [58]:
# Logistic-regression model: C=0.08 sets the inverse regularization strength and max_iter=500 caps solver iterations.
clf0 = LogisticRegression(C=0.08, max_iter=500)

# Preprocessing (`encoder`) followed by the classifier.
pipeline0 = make_pipeline(encoder, clf0)
pipeline0

In [59]:
%%time
# Fit the pipeline on the training data
pipeline0.fit(X_train, y_train)

# Predict probabilities
y_pred = pipeline0.predict_proba(X_test)

Wall time: 1.99 s


In [60]:
ndcg_score(y_test, y_pred, k=5)

0.8492684290374157

In [None]:
%%time
np.mean(cross_val_score(pipeline0, X_train, y_train, scoring=ndcg_scorer))

#### LR with GridSearchCV

In [None]:
# register_ray()

In [None]:
params = {
    'logisticregression__C': [0.1, 1.0, 10.0, 100.0],
    'logisticregression__penalty': ['l2'], # ,l1
    'logisticregression__solver': ['lbfgs', 'liblinear', 'saga']
#     'columntransformer__pipeline__simpleimputer__strategy': ["mean", "median"]
}

grid_search = TuneGridSearchCV(pipeline0, params, n_jobs=-1, verbose=1, cv=5, scoring=ndcg_scorer)
# pipeline0.get_params().keys()

In [None]:
%%time
# with parallel_backend("ray"):
grid_search.fit(X_train, y_train)

In [None]:
grid_search.best_params_

In [None]:
# Compact leaderboard of the search: best mean NDCG first, bookkeeping columns removed.
to_drop = ['mean_score_time', 'std_score_time', 'std_fit_time', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score']
result = (
    pd.DataFrame(grid_search.cv_results_)
    .sort_values('mean_test_score', axis=0, ascending=False)
    # errors='ignore': not every search implementation reports every timing/split column.
    .drop(columns=to_drop, errors='ignore')
    # 'param_logisticregression__C' -> 'C'; names without '__' are left untouched.
    .rename(columns=lambda name: name.split('__')[-1])
)
result

In [None]:
y_pred = grid_search.predict_proba(X_test)
ndcg_score(y_test, y_pred, k=5)

In [64]:
# Pickle model and write to hard drive
# NOTE(review): this saves `pipeline0` itself, not the grid-search winner - presumably
# intentional; use grid_search.best_estimator_ if the tuned model should be shipped.
# Assumes the models/ directory already exists.
joblib.dump(pipeline0, "models/LogisticRegression.pkl")

['models/LogisticRegression.pkl']

### Decision Tree Classifier

In [None]:
# Decision-tree model; random_state=0 keeps the fitted tree repeatable between runs.
clf1 = DecisionTreeClassifier(random_state=0)

# Preprocessing (`encoder`) followed by the classifier.
pipeline1 = make_pipeline(encoder, clf1)
pipeline1

In [None]:
# Fit the pipeline on the training data
pipeline1.fit(X_train, y_train)

# Score the pipeline on the testing data
pipeline1.score(X_test, y_test)

In [None]:
# Pickle model and write to hard drive
# joblib.dump(pipeline1, "models/DecisionTreeClassifier.pkl")

### SVC

In [None]:
# Support-vector classifier; probability=True is required so predict_proba is available for NDCG scoring.
clf2 = SVC(probability=True)

# Preprocessing (`encoder`) followed by the classifier.
pipeline2 = make_pipeline(encoder, clf2)
pipeline2

In [None]:
%%time
# Fit the pipeline on the training data
pipeline2.fit(X_train, y_train)

In [None]:
%%time
# Score the pipeline on the testing data
y_pred = pipeline2.predict_proba(X_test)
ndcg_score(y_test, y_pred, k=5)

In [None]:
# Pickle model and write to hard drive
# joblib.dump(pipeline2, "models/SVC_1.pkl")

### Neural Network

In [None]:
# Multi-layer perceptron with default architecture; seeded for repeatability, at most 300 training iterations.
clf3 = MLPClassifier(random_state=1, max_iter=300)

# Preprocessing (`encoder`) followed by the classifier.
pipeline3 = make_pipeline(encoder, clf3)
pipeline3

In [None]:
%%time
# Fit the pipeline on the training data
pipeline3.fit(X_train, y_train)

In [None]:
# Score the pipeline on the testing data
y_pred = pipeline3.predict_proba(X_test)
ndcg_score(y_test, y_pred, k=5)

In [None]:
# Pickle model and write to hard drive
# joblib.dump(pipeline3, "models/MLPClassifier.pkl")

### Random Forest

In [None]:
# Random forest with default hyper-parameters; random_state=0 keeps the ensemble repeatable.
clf4 = RandomForestClassifier(random_state=0)

# Preprocessing (`encoder`) followed by the classifier.
pipeline4 = make_pipeline(encoder, clf4)
pipeline4

In [None]:
%%time
# Fit the pipeline on the training data
pipeline4.fit(X_train, y_train)

In [None]:
# Score the pipeline on the testing data
y_pred = pipeline4.predict_proba(X_test)
ndcg_score(y_test, y_pred, k=5)

In [None]:
# Pickle model and write to hard drive
# joblib.dump(pipeline4, "models/RandomForestClassifier.pkl")

### SGD Classifier

In [None]:
# Linear model trained with SGD; the 'modified_huber' loss is used because predict_proba
# (called below for NDCG scoring) is needed.
clf5 = SGDClassifier(loss='modified_huber', random_state=0)

# Preprocessing (`encoder`) followed by the classifier.
pipeline5 = make_pipeline(encoder, clf5)
pipeline5

In [None]:
%%time
# Fit the pipeline on the training data
pipeline5.fit(X_train, y_train)

In [None]:
y_pred = pipeline5.predict_proba(X_test)
ndcg_score(y_test, y_pred, k=5)

In [None]:
# pipeline5.get_params()

In [None]:
# Hyper-parameter grid for the SGD pipeline (keys are prefixed with the pipeline step name).
# `epsilon` is intentionally not searched: it only affects the 'huber' and
# epsilon-insensitive losses, so with loss='modified_huber' it would double the
# number of fits while producing identical models.
params = {
    # 'sgdclassifier__loss': ['hinge', 'squared_hinge', 'modified_huber', 'perceptron'],
    'sgdclassifier__penalty': ['l2', 'l1', 'elasticnet'],
    'sgdclassifier__alpha': [1e-4, 1e-3, 1e-1],
}
# 5-fold CV on all cores, ranked by the NDCG@5 scorer.
grid_search = GridSearchCV(pipeline5, params, n_jobs=-1, verbose=1, cv=5, scoring=ndcg_scorer)

In [None]:
%%time
# with parallel_backend("ray"):
grid_search.fit(X_train, y_train)

In [None]:
grid_search.best_params_

In [None]:
grid_search.cv_results_

In [None]:
# Compact leaderboard of the search: best mean NDCG first, bookkeeping columns removed.
to_drop = ['mean_score_time', 'std_score_time', 'std_fit_time', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score']
result = (
    pd.DataFrame(grid_search.cv_results_)
    .sort_values('mean_test_score', axis=0, ascending=False)
    # errors='ignore': tolerate result tables that lack some timing/split columns.
    .drop(columns=to_drop, errors='ignore')
    # 'param_sgdclassifier__alpha' -> 'alpha'; names without '__' are left untouched.
    .rename(columns=lambda name: name.split('__')[-1])
)
result

In [None]:
# cross_val_score(grid_search, X_train, y_train)

y_pred = grid_search.predict_proba(X_test)
ndcg_score(y_test, y_pred, k=5)

In [None]:
# Pickle model and write to hard drive
# joblib.dump(pipeline5, "models/SGDClassifier.pkl")

### XGBoostClassifier

In [61]:
# Gradient-boosted trees; 'multi:softprob' yields one probability per class (needed for NDCG scoring).
clf6 = xgb.XGBClassifier(objective='multi:softprob')

# Preprocessing (`encoder`) followed by the classifier.
pipeline6 = make_pipeline(encoder, clf6)
pipeline6

In [62]:
%%time
# Fit the pipeline on the training data
pipeline6.fit(X_train, y_train)

Wall time: 23.3 s


In [63]:
# Score the pipeline on the testing data
y_pred = pipeline6.predict_proba(X_test)
ndcg_score(y_test, y_pred, k=5)

0.8485256399265694

In [None]:
# Cross-validated NDCG@5 of the XGBoost pipeline on the training split.
# (Previously referenced the undefined name `ncdg_scorer` and re-scored the logistic-regression pipeline.)
cross_val_score(pipeline6, X_train, y_train, scoring=ndcg_scorer)

In [None]:
# Search space for the XGBoost step. The estimator being tuned is a Pipeline, so every
# parameter must carry the step prefix `xgbclassifier__` (the name make_pipeline derives
# from the class name); bare names are rejected by Pipeline.set_params.
params = {
    "xgbclassifier__min_child_weight": [1, 5, 10],
    "xgbclassifier__gamma": [0.5, 1, 1.5, 2, 5],
    "xgbclassifier__subsample": [0.6, 0.8, 1.0],
    "xgbclassifier__colsample_bytree": [0.6, 0.8, 1.0],
    "xgbclassifier__max_depth": [3, 4, 5],
}

# Bayesian search over the space above, ranked by the NDCG@5 scorer.
tune_search = TuneSearchCV(
    pipeline6, 
    param_distributions=params, 
    n_trials=3, 
    early_stopping=True, 
    scoring=ndcg_scorer, 
    search_optimization="bayesian"
)

In [None]:
tune_search.fit(X_train, y_train)

In [None]:
print(tune_search.best_params_)
y_pred = tune_search.predict_proba(X_test)
print(ndcg_score(y_test, y_pred, k=5))

In [None]:
# Pickle model and write to hard drive
# joblib.dump(pipeline6, "models/XGBClassifier.pkl")

## Model 7

In [None]:
# Model 7: a fresh XGBoost classifier. Accessed through the `xgb` module alias because
# `XGBClassifier` itself is never imported in this notebook.
clf7 = xgb.XGBClassifier(objective='multi:softprob')

# NOTE: rebinds `pipeline6` on purpose - the fit/score cells of this section still use
# that name - but the pipeline now wraps clf7 instead of silently reusing clf6.
pipeline6 = make_pipeline(encoder, clf7)
pipeline6

In [None]:
%%time
# Fit the pipeline on the training data
pipeline6.fit(X_train, y_train)

In [None]:
# Score the pipeline on the testing data
y_pred = pipeline6.predict_proba(X_test)
ndcg_score(y_test, y_pred, k=5)

### K-Means

In [None]:
# clf5 = KMeans(n_clusters=3, max_iter=100, random_state=0)
# clf5.fit(X_train, y_train)

## Predictions for Kaggle's test_users.csv

In [110]:
%%time
__no_prompt__ = False # If True, skips user input (defaults to HDF).
__no_hdf__ = False # If True, skips prompt and only reads from CSV.

data_folder = Path("airbnb-recruiting-new-user-bookings/") # Replace if using Kaggle Notebook
hdf_path = data_folder / "test_data.h5"

# The cache is only usable when it holds BOTH frames this cell defines: `df` (raw test users)
# and `df_merge` (users joined with their per-user session flags). A cache that only
# contains `df` is rebuilt from CSV, because the prediction cells read `df_merge`.
hdf_ready = False
if hdf_path.exists() and not __no_hdf__:
    with pd.HDFStore(str(hdf_path), mode='r') as store:
        hdf_ready = 'df' in store and 'df_merge' in store
use_hdf = hdf_ready and (__no_prompt__ or input("Do you want to use an HDF file [Y/n]:") != "n")

if use_hdf: # Read from hdf if available (much faster)
    df = pd.read_hdf(hdf_path, key='df')
    df_merge = pd.read_hdf(hdf_path, key='df_merge')
else:
    # --- Users table (same schema handling as the training loader) ---
    filepath = data_folder / "test_users.csv"
    dtypes = {
        'id': 'object',
        'date_account_created': 'string',
        'timestamp_first_active': 'string',
        'date_first_booking': 'string',
        'gender': 'category',
        'age': 'float64',
        'signup_method': 'category',
        'signup_flow': 'category',
        'language': 'category',
        'affiliate_channel': 'category',
        'affiliate_provider': 'category',
        'first_affiliate_tracked': 'category',
        'signup_app': 'category',
        'first_device_type': 'category',
        'first_browser': 'category',
        'country_destination': 'category',
    }
    parse_dates = ['date_account_created', 'timestamp_first_active', 'date_first_booking']
    df = pd.read_csv(filepath, dtype=dtypes, na_values=['-unknown-', '<NA>'], parse_dates=parse_dates
                     , infer_datetime_format=True)
    # Unparseable / missing booking dates become NaT instead of raising.
    df['date_first_booking'] = pd.to_datetime(df['date_first_booking'], format='%Y-%m-%d', errors='coerce')

    # Values 2000-2014 in `age` are treated as invalid and blanked out.
    df['age'] = df['age'].replace(range(2000, 2015), np.nan)
    # Bucket age into 5-year bands plus a final '100+' band; ages outside (0, 120] become NaN.
    # NOTE(review): pd.cut closes intervals on the right by default, so the band labelled
    # '0-4' actually covers (0, 5]. Left unchanged so train and test stay binned identically.
    df['age'] = pd.cut(df['age'], bins = [i*5 for i in range(0, 21)] + [120]
                       , labels=(['%d-%d' % (i*5, i*5+4) for i in range(0, 20)] + ['100+']))

    # --- Sessions table -> one row per user with a '0'/'1' flag per action_type ---
    filepath = data_folder / "sessions.csv"
    dtypes = {
        'user_id': 'string',
        'action': 'category',
        'action_type': 'category',
        'action_detail': 'category',
        'device_type': 'category',
        'secs_elapsed': 'float64',
    }
    df_session = pd.read_csv(filepath, dtype=dtypes, na_values=['-unknown-', 'NDF', '<NA>'])

    action_categories = list(df_session['action_type'].cat.categories)
    has_user = df_session['user_id'].notna()  # rows without a user id cannot be attributed
    # One indicator column per action_type category; rows with a missing action_type are all zero,
    # so users who only have missing action types still get an (all-'0') row.
    action_dummies = pd.get_dummies(df_session.loc[has_user, 'action_type'])
    action_dummies.columns = action_categories  # plain labels; also fails loudly if a category is missing
    user_flags = action_dummies.groupby(df_session.loc[has_user, 'user_id']).max()
    # Stored as category columns of '0'/'1' strings so the one-hot encoder treats them as categorical.
    df_action_types = (
        user_flags.astype(int).astype(str).astype('category')
        .rename_axis('user_id')
        .reset_index()
        .astype({'user_id': 'string'})
    )

    # Inner join: only users that have at least one session row are kept.
    # NOTE(review): test users without any session rows are dropped here and therefore get
    # no prediction in the submission - confirm that this is acceptable for scoring.
    df_merge = df.merge(df_action_types, left_on ='id', right_on='user_id').drop(columns=['user_id'])
    df_merge = df_merge.set_index('id')

    # Save both frames to hdf if reading from csv
    df.to_hdf(hdf_path, key='df', mode='w', format="table")
    df_merge.to_hdf(hdf_path, key='df_merge', mode='a', format="table")

Do you want to use an HDF file [Y/n]:n




Wall time: 46.7 s


In [112]:
# Feature frame for the Kaggle test users (same datetime columns removed as for training).
X_kaggle = df_merge.drop(columns=['date_account_created', 'timestamp_first_active', 'date_first_booking']) # TODO: engineer features from the datetime columns instead of dropping them.
# User ids, in row order, for writing the submission file.
id_list = np.asarray(X_kaggle.index)

In [113]:
# Reload the pickled LogisticRegression pipeline and the label encoder from the models/ folder.
# NOTE(review): joblib.load unpickles arbitrary objects - only load files produced by this notebook.
filename = Path("models/LogisticRegression.pkl")
loaded_model = joblib.load(filename)
le_filename = Path("models/LabelEncoder.pkl")
y_enc = joblib.load(le_filename)

In [114]:
%%time
y_kaggle = loaded_model.predict_proba(X_kaggle)
# y_kaggle = np.asarray([[0, 0, 0, 0, .0625, 0, 0.03125, .5, 0, 0, .25, .125]]*df.shape[0])

Wall time: 398 ms


In [115]:
# Class indices of the 5 most probable destinations per user, most probable first.
# Reversing the full argsort and then slicing works for any number of classes; the
# previous `[:, :6:-1]` slice only produced five columns when there were exactly 12.
ranks = np.argsort(y_kaggle, axis=1)[:, ::-1][:, :5]

In [117]:
# Decode each user's ranked class indices back into country codes.
orders = [y_enc.inverse_transform(rank) for rank in ranks]

In [118]:
# Write the submission file: one (id, country) row per guess, guesses listed
# per user in ranked order.
with open("submission.csv", "w", newline="") as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=',', quotechar='"')
    csv_writer.writerow(['id', 'country'])
    csv_writer.writerows(
        [user_id, country]
        for user_id, order in zip(id_list, orders)
        for country in order
    )