# Final Project Notebook
Group: 9
Group Members: Shawn Ericksen (ericksen@uark.edu), Garret Fulghum (gmfulghu@uark.edu), Wesley Parker (wgparker@uark.edu)

This practice project focuses on the Airbnb New User Bookings dataset. This can be accessed from: https://www.kaggle.com/competitions/airbnb-recruiting-new-user-bookings/data

## Kaggle Performance Info
Kaggle scores this competition with Normalized Discounted Cumulative Gain (NDCG): up to 5 guesses of the destination country (ordered by confidence) are submitted per entry in the test data. An entry scores 1.0 when the first guess is correct and fewer points otherwise (0.63 when the second guess is correct, and so on).

Q1/Q2/Q3 Kaggle scores: 0.85226 / 0.86555 / 0.86661

Notes:

The baseline, which is to always guess NDF-US-OTHER-FR-IT, receives an NDCG score of 0.8219 (78th percentile) on the Public Leaderboard. The NDCG score of the dummy model on the training data is 0.80676.

As for scikit's accuracy score, always guessing NDF results in a score of 0.58347 against the training data.

### Usage
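Running the second code cell will prompt the user to choose between reading the data from CSV or HDF5. The prompt only appears when `data.h5` already exists. Setting `__no_prompt__ = True` skips the prompt and uses HDF5; setting `__no_hdf__ = True` always reads the CSV.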

### Imports and reading dataset into memory

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import csv

from statistics import mean

from sklearnex import patch_sklearn 
patch_sklearn()

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
# from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline # , Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
# from sklearn.metrics import ndcg_score # , classification_report

# from sklearn.experimental import enable_halving_search_cv
# from sklearn.model_selection import HalvingGridSearchCV

# from tune_sklearn import TuneGridSearchCV

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neural_network import MLPClassifier

from sklearn import set_config
set_config(display='diagram')

import xgboost as xgb
# from xgboost import XGBClassifier

import joblib

from joblib import parallel_backend
# from ray.util.joblib import register_ray

In [None]:
%%time
__no_prompt__ = False  # If True, skips user input (defaults to HDF)
__no_hdf__ = False  # If True, only reads from CSV

data_folder = Path("airbnb-recruiting-new-user-bookings/")  # Replace if using Kaggle Notebook
hdf_path = data_folder / "data.h5"

# Decide where to load from. The prompt is only shown when the answer can
# matter: a cached HDF file exists and HDF has not been disabled.
if __no_hdf__ or not hdf_path.exists():
    use_hdf = False
elif __no_prompt__:
    use_hdf = True
else:
    answer = input("Do you want to use an HDF file [Y/n]:")
    # Anything other than an explicit "n"/"no" keeps the default (yes).
    use_hdf = answer.strip().lower() not in ("n", "no")

# Read from HDF if available (much faster than re-parsing the CSV)
if use_hdf:
    df = pd.read_hdf(hdf_path)
else:
    filepath = data_folder / "train_users_2.csv"
    dtypes={'id': 'string', 'date_account_created': 'string', 'timestamp_first_active': 'string', 'date_first_booking': 'string', 'gender': 'category', 'age': 'float64', 'signup_method': 'category', 'signup_flow': 'category', 'language': 'category', 'affiliate_channel': 'category', 'affiliate_provider': 'category', 'first_affiliate_tracked': 'category', 'signup_app': 'category', 'first_device_type': 'category', 'first_browser': 'category', 'country_destination': 'category'}

    # The date columns are read as strings (see `dtypes`) and converted below
    # with explicit formats, instead of relying on read_csv's deprecated
    # `infer_datetime_format` guessing.
    # NOTE(review): formats follow the Kaggle data files (e.g. 2010-06-28 and
    # 20090319043255) - confirm against the CSV if the source ever changes.
    date_formats = {
        'date_account_created': '%Y-%m-%d',
        'timestamp_first_active': '%Y%m%d%H%M%S',
        'date_first_booking': '%Y-%m-%d',
    }
    parse_dates = list(date_formats)
    df = pd.read_csv(filepath, dtype=dtypes, na_values=['-unknown-', '<NA>'])
    for column in ('date_account_created', 'timestamp_first_active'):
        # No errors='coerce' here: a format mismatch should fail loudly.
        df[column] = pd.to_datetime(df[column], format=date_formats[column])
    # Missing or malformed booking dates become NaT.
    df['date_first_booking'] = pd.to_datetime(df['date_first_booking']
                                              , format=date_formats['date_first_booking'], errors='coerce')

#     df.select_dtypes('datetime64[ns]').fillna(pd.NaT)
    df = df.set_index('id')

    # Values 2000-2014 are treated as missing rather than as ages.
    df['age'] = df['age'].replace(range(2000, 2015), np.nan)

    # Bucket ages into 5-year bands. right=False makes the bins left-closed
    # ([0, 5), [5, 10), ...) so they agree with the '0-4', '5-9', ... labels;
    # with the default right-closed bins an age of 5 was labelled '0-4'.
    # The last band is [100, 121), i.e. ages 100-120; anything outside the
    # edges becomes NaN.
    age_edges = list(range(0, 101, 5)) + [121]
    age_labels = ['%d-%d' % (low, low + 4) for low in range(0, 100, 5)] + ['100+']
    df['age'] = pd.cut(df['age'], bins=age_edges, labels=age_labels, right=False)
    
#     filepath = data_folder / "age_gender_bkts.csv"
#     dtypes = {}
    
#     filepath = data_folder / "sessions.csv"
#     dtypes={'id': 'string', 'action': 'category', 'action_type': 'category', 'action_detail': 'category', 'device_type': 'category', 'sec_elapsed': 'float64'}
#     df_session = pd.read_csv(filepath, dtype=dtypes, na_values=['-unknown-', 'NDF', '<NA>'])
    
    # Save to hdf if reading from csv.
    # NOTE: a data.h5 written by an earlier version of this cell still holds
    # the old age bands; delete it (or set __no_hdf__) to regenerate.
    df.to_hdf(hdf_path, key='df', mode='w', format="table")

In [None]:
# df.loc(lambda x: df['age'] == 1995)

In [None]:
# df_session['action'].cat.categories

In [None]:
# df_session['action_type'].cat.categories

In [None]:
# df_session['action_detail'].cat.categories

In [None]:
# for cat in df_session['action_type'].cat.categories:
#     new_column = pd.Series(name=cat, dtype='int8')
#     df.insert(new_column)

In [None]:
# df.loc['gxn3p5htnn']

### Diagnostics

In [None]:
# Basic size diagnostics for the loaded frame. The banner names the training
# file, because `df` is read from train_users_2.csv (or its HDF cache).
print("- - train_users_2.csv - -")
print("Number of lines present: ", len(df))
print("Number of Columns: ", len(df.columns))

In [None]:
# Preview the first few rows of the frame (the old label said "dataFrames",
# but what is printed are rows of a single DataFrame).
topCount = 5
print(f"Top {topCount} rows:")
print(df.head(topCount))

In [None]:
# Per-column memory footprint in bytes, largest first. deep=True also counts
# the Python objects behind object-like columns; the index is excluded.
df.memory_usage(deep=True, index=False).sort_values(ascending=False)

In [None]:
# Total memory used by all columns, in bytes (index excluded).
df.memory_usage(deep=True, index=False).sum()

### NDCG Implementation

(1, 0, 0, 0, 0) will serve as the true ranking.

The prediction (NDF-US-OTHER-FR-IT) will be used as a dummy model. It will be transformed into an ndarray such that incorrect guesses are transformed to 0 and correct guesses to 1.

A first-rank correct guess generally appears as (1, 0, 0, 0, 0), a second-rank correct guess as (0, 1, 0, 0, 0), and no correct guess as (0, 0, 0, 0, 0).

In [None]:
def dcg(r, k):
    """Return the discounted cumulative gain of the first ``k`` relevances.

    Args:
        r: Sequence of relevance scores in ranked order (best guess first).
        k: Cut-off rank; entries past position ``k`` are ignored.

    Returns:
        The sum of ``r[i] / log2(i + 2)`` over the first ``k`` entries, which
        is 0.0 when there are none.
    """
    # np.asfarray was removed in NumPy 2.0; asarray(..., dtype=float) is the
    # equivalent call and works on older releases too.
    r = np.asarray(r, dtype=float)[:k]
    return np.sum(r / np.log2(np.arange(2, r.size + 2)))

def ndcg(r, k):
    """Return the DCG of ``r`` at rank ``k`` normalised by the ideal DCG.

    The ideal DCG is the DCG of the same relevances sorted in descending
    order. When that ideal is zero (no relevant entry at all) the result is 0.
    """
    ideal = dcg(sorted(r, reverse=True), k)
    return dcg(r, k) / ideal if ideal else 0

def ndcg_score(y_pred, y_true):
    """Return the mean NDCG of ranked guesses against the true labels.

    Args:
        y_pred: Array-like of shape (n_samples, k); each row holds the guesses
            for one sample, best guess first. A 1-D input is treated as one
            guess per sample.
        y_true: Array-like of shape (n_samples,) with the true label of each
            sample. A pandas Series is read positionally, whatever its index.

    Returns:
        The mean over samples of NDCG@k as a float, where a guess has
        relevance 1 if it equals the true label and 0 otherwise.

    Raises:
        ValueError: If the inputs differ in length or are empty.
    """
    pred = np.asarray(y_pred)
    true = np.asarray(y_true)
    if pred.ndim == 1:
        pred = pred[:, None]
    if pred.shape[0] != true.shape[0]:
        raise ValueError(f"Array lengths do not match: {pred.shape}, {true.shape}")
    if pred.shape[0] == 0:
        raise ValueError("Cannot compute NDCG for empty input")

    # Relevance matrix: 1.0 where a guess equals the true label. Comparing
    # directly (instead of where(...).replace(label, 1)) stays correct when
    # the label itself is 0, which previously turned every entry into a hit.
    hits = (pred == true[:, None]).astype(float)

    k = hits.shape[1]
    discounts = 1.0 / np.log2(np.arange(2, k + 2))
    dcg_values = hits @ discounts

    # Ideal DCG for a row with h hits is the sum of the first h discounts.
    ideal_by_hits = np.concatenate(([0.0], np.cumsum(discounts)))
    ideal = ideal_by_hits[hits.sum(axis=1).astype(int)]

    # Rows without any hit have an ideal DCG of 0 and score 0.
    scores = np.divide(dcg_values, ideal, out=np.zeros_like(dcg_values), where=ideal > 0)
    return float(scores.mean())

In [None]:
# NDCG for the dummy model: every row gets the same five guesses.
y_pred = [['NDF', 'US', 'other', 'FR', 'IT']]*df.shape[0]

# Pass a plain array: `df` is indexed by user id, so integer lookups on the
# Series inside ndcg_score would rely on pandas' deprecated positional
# fallback for non-integer indexes.
ndcg_score(y_pred, df['country_destination'].to_numpy())

### Preprocessing

In [None]:
# Features: every column except the target and the three date columns.
# TODO: Fix datetimes breaking everything.
X = df.drop(columns=[
    'country_destination',
    'date_account_created',
    'timestamp_first_active',
    'date_first_booking',
])
# Target: the destination label.
y = df['country_destination']

In [None]:
# This block can be commented out when doing prediction on the Kaggle test.csv
# Holds out 37% of the rows for evaluation; random_state=0 makes the shuffled
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.37, random_state=0, shuffle=True)

In [None]:
# numeric_transformer = make_pipeline(
# #     SimpleImputer(strategy='mean'),
# #     StandardScaler()
# )

# One-hot encode every category-dtype column into a dense array; categories
# not seen during fit are ignored (handle_unknown='ignore'). All other columns
# are passed through unchanged.
# NOTE(review): `sparse=` was renamed to `sparse_output=` in scikit-learn 1.2 -
# confirm the installed version before upgrading.
encoder = make_column_transformer(
#     (numeric_transformer, ['age']),
    (OneHotEncoder(sparse=False, handle_unknown='ignore'), make_column_selector(dtype_include='category')),
    remainder='passthrough'
)
encoder

In [None]:
# Map destination labels to integer codes. The encoder learns its classes from
# the training labels only and is then applied to the hold-out labels.
# Note: y_train / y_test are rebound to NumPy arrays here, so this cell cannot
# be re-run without re-running the train/test split first.
y_enc = LabelEncoder()
y_train = y_enc.fit_transform(y_train.values)
y_test = y_enc.transform(y_test.values)

### Logistic Regression

In [None]:
# Logistic-regression baseline with the iteration cap set to 300.
clf0 = LogisticRegression(max_iter=300)

# Chain the shared one-hot `encoder` with the classifier. make_pipeline uses
# the encoder object itself, so every pipeline built from it refits the same
# instance.
pipeline0 = make_pipeline(encoder, clf0)
pipeline0

In [None]:
%%time
# Fit the pipeline on the training data
pipeline0.fit(X_train, y_train)

# Rank the classes by predicted probability, best first, and keep the top five.
# The previous slice `[:, :5:-1]` walked backwards and stopped before column 5,
# which keeps n_classes - 6 columns (six guesses for 12 classes), not five.
# NOTE(review): the column indices are used as encoded labels; this assumes the
# classifier saw every class during fit - confirm.
y_rank = pipeline0.predict_proba(X_test).argsort(axis=1)[:, ::-1][:, :5]

In [None]:
%%time
# Mean NDCG of the ranked guesses against the encoded hold-out labels.
ndcg_score(y_rank, y_test)

#### LR with GridSearchCV

In [None]:
# register_ray()

In [None]:
# Search space for the logistic-regression step. Keys are prefixed with the
# step name that make_pipeline derives from the class ('logisticregression').
params = {
    'logisticregression__C': [0.1, 1.0, 10.0, 100.0],
    'logisticregression__penalty': ['l2'], # ,l1
    'logisticregression__solver': ['lbfgs', 'liblinear', 'saga']
#     'columntransformer__pipeline__simpleimputer__strategy': ["mean", "median"]
}

# 5-fold cross-validated grid search using all cores. No `scoring` is passed,
# so candidates are compared with the estimator's default score, not with NDCG.
grid_search = GridSearchCV(pipeline0, params, n_jobs=-1, verbose=1, cv=5)
# pipeline0.get_params().keys()

In [None]:
%%time
# with parallel_backend("ray"):
# Evaluate every parameter combination with cross-validation on the training split.
grid_search.fit(X_train, y_train)

In [None]:
# Parameter combination that achieved the best cross-validated score.
grid_search.best_params_

In [None]:
# Top-five classes per row, most probable first. The previous slice
# `[:, :5:-1]` kept n_classes - 6 columns instead of five.
y_rank = grid_search.predict_proba(X_test).argsort(axis=1)[:, ::-1][:, :5]
ndcg_score(y_rank, y_test)

In [None]:
# Pickle model and write to hard drive
# joblib.dump(pipeline0, "models/LogisticRegression.pkl")

### Decision Tree Classifier

In [None]:
# Decision tree with a fixed seed so repeated fits give the same tree.
clf1 = DecisionTreeClassifier(random_state=0)

# Shared one-hot encoder followed by the tree.
pipeline1 = make_pipeline(encoder, clf1)
pipeline1

In [None]:
# Fit the pipeline on the training data
pipeline1.fit(X_train, y_train)

# Score the pipeline on the testing data
# (the classifier's default score - mean accuracy - not the NDCG metric above)
pipeline1.score(X_test, y_test)

In [None]:
# Pickle model and write to hard drive
# joblib.dump(pipeline1, "models/DecisionTreeClassifier.pkl")

### SVC

In [None]:
# Support-vector classifier with default settings. Built without
# probability=True, so this pipeline offers no predict_proba for NDCG ranking.
clf2 = SVC()

# Shared one-hot encoder followed by the SVC.
pipeline2 = make_pipeline(encoder, clf2)
pipeline2

In [None]:
%%time
# Fit the pipeline on the training data
# (encoder and SVC are fitted together on the training split)
pipeline2.fit(X_train, y_train)

In [None]:
%%time
# Score the pipeline on the testing data
# (default classifier score, i.e. mean accuracy - not NDCG)
pipeline2.score(X_test, y_test)

In [None]:
# Pickle model and write to hard drive
# joblib.dump(pipeline2, "models/SVC.pkl")

### Neural Network

In [None]:
# Multi-layer perceptron with a fixed seed and at most 300 training iterations.
clf3 = MLPClassifier(random_state=1, max_iter=300)

# Shared one-hot encoder followed by the network.
pipeline3 = make_pipeline(encoder, clf3)
pipeline3

In [None]:
%%time
# Fit the pipeline on the training data
# (encoder and network are fitted together on the training split)
pipeline3.fit(X_train, y_train)

In [None]:
# Score the pipeline on the testing data
# (default classifier score, i.e. mean accuracy - not NDCG)
pipeline3.score(X_test, y_test)

In [None]:
# Pickle model and write to hard drive
# joblib.dump(pipeline3, "models/MLPClassifier.pkl")

### Random Forest

In [None]:
# Random forest with a fixed seed for reproducible trees.
clf4 = RandomForestClassifier(random_state=0)

# Shared one-hot encoder followed by the forest.
pipeline4 = make_pipeline(encoder, clf4)
pipeline4

In [None]:
%%time
# Fit the pipeline on the training data
# (encoder and forest are fitted together on the training split)
pipeline4.fit(X_train, y_train)

In [None]:
# Score the pipeline on the testing data
# (default classifier score, i.e. mean accuracy - not NDCG)
pipeline4.score(X_test, y_test)

In [None]:
# Per-class probability estimates for the hold-out rows (one column per class).
pipeline4.predict_proba(X_test)

In [None]:
# Pickle model and write to hard drive
# joblib.dump(pipeline4, "models/RandomForestClassifier.pkl")

### SGD Classifier

In [None]:
# Linear classifier trained with stochastic gradient descent (default loss),
# seeded for reproducibility.
clf5 =SGDClassifier(random_state=0)

# Shared one-hot encoder followed by the SGD classifier.
pipeline5 = make_pipeline(encoder, clf5)
pipeline5

In [None]:
%%time
# Fit the pipeline on the training data
# (encoder and SGD classifier are fitted together on the training split)
pipeline5.fit(X_train, y_train)

In [None]:
# pipeline5.get_params()

In [None]:
# Search space for the SGD step (keys are prefixed with the step name that
# make_pipeline derives from the class, 'sgdclassifier').
# - The stray '' literal that was silently concatenated onto the first key has
#   been removed; the key itself is unchanged.
# - `epsilon` is no longer searched: SGDClassifier only uses it for the
#   'huber', 'epsilon_insensitive' and 'squared_epsilon_insensitive' losses,
#   none of which are in this grid, so it only doubled the number of fits.
params = {
    'sgdclassifier__loss': ['hinge', 'squared_hinge', 'modified_huber', 'perceptron'],
    'sgdclassifier__penalty': ['l2', 'l1', 'elasticnet'],
    'sgdclassifier__alpha': [1e-4, 1e-3, 1e-1],
}

# 5-fold cross-validated grid search using all cores; candidates are compared
# with the estimator's default score because no `scoring` is given.
grid_search = GridSearchCV(pipeline5, params, n_jobs=-1, verbose=1, cv=5)
# pipeline0.get_params().keys()

In [None]:
%%time
# with parallel_backend("ray"):
# Evaluate every parameter combination with cross-validation on the training split.
grid_search.fit(X_train, y_train)

In [None]:
# Parameter combination that achieved the best cross-validated score.
grid_search.best_params_

In [None]:
# Rank with decision_function rather than predict_proba: SGDClassifier only
# offers predict_proba for the 'log_loss' and 'modified_huber' losses, so the
# old call failed whenever the search picked 'hinge', 'squared_hinge' or
# 'perceptron'. Only the ordering of the scores matters for NDCG.
# Keep the five highest-scoring classes per row, best first (the previous
# slice `[:, :5:-1]` kept n_classes - 6 columns instead of five).
y_rank = grid_search.decision_function(X_test).argsort(axis=1)[:, ::-1][:, :5]
ndcg_score(y_rank, y_test)

In [None]:
# Pickle model and write to hard drive
# joblib.dump(pipeline5, "models/SGDClassifier.pkl")

### XGBoostClassifier

In [None]:
# Gradient-boosted trees producing per-class probabilities.
# The objective name was misspelled as 'mulit:softprob'.
clf6 = xgb.XGBClassifier(objective='multi:softprob')

# Shared one-hot encoder followed by the XGBoost classifier.
pipeline6 = make_pipeline(encoder, clf6)
pipeline6

In [None]:
%%time
# Fit the pipeline on the training data
# (encoder and XGBoost classifier are fitted together on the training split)
pipeline6.fit(X_train, y_train)

In [None]:
# Score the pipeline on the testing data
# Top-five classes per row, most probable first. The previous slice
# `[:, :5:-1]` kept n_classes - 6 columns instead of five.
y_rank = pipeline6.predict_proba(X_test).argsort(axis=1)[:, ::-1][:, :5]
ndcg_score(y_rank, y_test)

In [None]:
# Pickle model and write to hard drive
# joblib.dump(pipeline6, "models/XGBClassifier.pkl")

### K-Means

In [None]:
# clf5 = KMeans(n_clusters=3, max_iter=100, random_state=0)
# clf5.fit(X_train, y_train)

## Predictions for Kaggle's test_users.csv

In [None]:
# filepath = "airbnb-recruiting-new-user-bookings/test_users.csv"
# df = pd.read_csv(filepath, dtype=dtypes[:-1], na_values=['-unknown-', 'NDF', '<NA>'], parse_dates=parse_dates, infer_datetime_format=True)

In [None]:
# id_list = list(df['MachineIdentifier'])
# X_kaggle = df.drop(columns=['MachineIdentifier']))

In [None]:
# y_pred = pipeline.transform(X_kaggle.values)

In [None]:
# with open("submission.csv", "w", newline="") as csvfile:
#     csv_writer = csv.writer(csvfile, delimiter=',', quotechar='"')
#     csv_writer.writerows(zip(id_list, y_pred))