# Final Project Notebook
Group: 9
Group Members: Shawn Ericksen (ericksen@uark.edu), Garret Fulghum (gmfulghu@uark.edu), Wesley Parker (wgparker@uark.edu)

This practice project focuses on the Airbnb New User Bookings dataset. This can be accessed from: https://www.kaggle.com/competitions/airbnb-recruiting-new-user-bookings/data

## Kaggle Performance Info
Kaggle's scoring for this competition utilizes a Normalized Discounted Cumulative Gain (NDCG) scoring, where up to 5 guesses of destination county (ordered by confidence) are submitted per entry in the test data. A score of 1.0 reflects the first guess being correct, and less points for other scenarios (0.63 for the second guess being correct and so on).

Q1/Q2/Q3 Kaggle scores: 0.85226 / 0.86555 / 0.86661

Notes:

The baseline, which is to always guess NDF-US-OTHER-FR-IT, recieves a NDCG score of 0.8219 (78th Percentile) on the Public Leaderboard. The NDCG score of the dummy model on the training data is 0.80676.

As for scikit's accuracy score, always guessing NDF results in a score of 0.58347 against the training data.

### Usage
Running the second code cell will prompt the user read data from CSV or HDF5 (setting \_\_fast_hdf__ will skip the prompt and use HDF5).

### Imports and reading dataset into memory

In [1]:
from os.path import exists

import numpy as np
import pandas as pd
import csv

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, ndcg_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn import set_config
set_config(display='diagram')

import joblib

In [9]:
__fast_hdf__ = False # Skips user input

hdf_path = "airbnb-recruiting-new-user-bookings/data.h5"
use_hdf = __fast_hdf__ or input("Do you want to use an HDF file [Y/n]:") != "n" if exists(hdf_path) else False

# Read from hdf if availible (much faster)
if use_hdf:
    df = pd.read_hdf(hdf_path)
else:
    filepath = "airbnb-recruiting-new-user-bookings/train_users_2.csv"
    dtypes={'id': 'string', 'date_account_created': 'string', 'timestamp_first_active': 'string', 'date_first_booking': 'string', 'gender': 'category', 'age': 'float64', 'signup_method': 'category', 'signup_flow': 'category', 'language': 'category', 'affiliate_channel': 'category', 'affiliate_provider': 'category', 'first_affiliate_tracked': 'category', 'signup_app': 'category', 'first_device_type': 'category', 'first_browser': 'category', 'country_destination': 'category'}
    parse_dates = ['date_account_created', 'timestamp_first_active', 'date_first_booking']
    cols = list(pd.read_csv(filepath, nrows=1))[1:]
    df = pd.read_csv(filepath, dtype=dtypes, na_values=['-unknown-', '<NA>'], parse_dates=parse_dates
                     , infer_datetime_format=True) # usecols =[i for i in cols if i != 'id']
    df['date_first_booking'] = pd.to_datetime(pd.Series(df['date_first_booking'])
                                              , format='%Y-%m-%d', errors='coerce')

#     df.select_dtypes('datetime64[ns]').fillna(pd.NaT)
    df = df.set_index('id')
    df['age'] = df['age'].replace(range(2000, 2015), np.nan)
    df['age'] = pd.cut(df['age'], bins = [i*5 for i in range(0, 21)] + [120]
                       , labels=(['%d-%d' % (i*5, i*5+4) for i in range(0, 20)] + ['100+']))
    
#     filepath = "airbnb-recruiting-new-user-bookings/age_gender_bkts.csv"
#     dtypes = {}
    
#     filepath = "airbnb-recruiting-new-user-bookings/sessions.csv"
#     dtypes={'id': 'string', 'action': 'category', 'action_type': 'category', 'action_detail': 'category', 'device_type': 'category', 'sec_elapsed': 'float64'}
#     df_session = pd.read_csv(filepath, dtype=dtypes, na_values=['-unknown-', 'NDF', '<NA>'])
    
    # Save to hdf if reading from csv
    df.to_hdf("airbnb-recruiting-new-user-bookings/data.h5", key='df', mode='w', format="table")

Do you want to use an HDF file [Y/n]:


In [None]:
# df.loc(lambda x: df['age'] == 1995)

In [None]:
# df_session['action'].cat.categories

In [None]:
# df_session['action_type'].cat.categories

In [None]:
# df_session['action_detail'].cat.categories

In [None]:
# for cat in df_session['action_type'].cat.categories:
#     new_column = pd.Series(name=cat, dtype='int8')
#     df.insert(new_column)

In [None]:
# df.loc['gxn3p5htnn']

In [None]:
# Save to hdf if reading from csv
# df.to_hdf("airbnb-recruiting-new-user-bookings/data.h5")

### Diagnostics

In [None]:
print("- - test_users_2.csv - -")
print("Number of lines present: ", len(df))
print("Number of Columns: ", len(df.columns))

In [None]:
topCount = 5
print("Top ", topCount, " dataFrames:")
print(df.head(topCount))

In [None]:
df.memory_usage(deep=True, index=False).sort_values(ascending=False)

In [None]:
df.memory_usage(deep=True, index=False).sum()

### NDCG Implementation

(1, 0, 0, 0, 0) will serve as the true ranking.

The prediction (NDF-US-OTHER-FR-IT) will be used as a dummy model. It will be transformed into an ndarray such that incorrect guesses are transformed to 0 and correct guesses to 1.

A first-rank correct guess generally appears as (1, 0, 0, 0, 0), a second-rank correct guess as (0, 1, 0, 0, 0), and no correct guess as (0, 0, 0, 0, 0).

In [None]:
# NDCG fot the dummy model
true_relevance = np.asarray([[1, 0, 0, 0, 0]]*df.shape[0])

dp = pd.Series(['NDF', 'US', 'other', 'FR', 'IT'])

scores = []
for item in df['country_destination']:
    scores.append(np.asarray([dp.where(dp == item, other=0).replace(item, 1)]))
        
scores = np.vstack(scores)
print(ndcg_score(true_relevance, scores, k=5))

### Preprocessing

In [3]:
X = df.drop(columns=['country_destination'])
X = X.drop(columns=parse_dates) # TODO: Fix datetimes fucking everthing up.
y = df['country_destination']

In [4]:
# This block can be commented out when doing prediction on the Kaggle test.csv
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.37, random_state=0, shuffle=True)

In [5]:
# numeric_transformer = make_pipeline(
# #     SimpleImputer(strategy='mean'),
# #     StandardScaler()
# )

encoder = make_column_transformer(
#     (numeric_transformer, ['age']),
    (OneHotEncoder(sparse=False, handle_unknown='ignore'), make_column_selector(dtype_include='category')),
    remainder='passthrough'
)
encoder

In [6]:
y_enc = LabelEncoder().fit(y_train.values)
y_train = y_enc.transform(y_train)
y_test = y_enc.transform(y_test.values)

### Logistic Regression

In [7]:
clf0 = LogisticRegression(max_iter=300)

pipeline0 = make_pipeline(encoder, clf0)
pipeline0

In [8]:
# Fit the pipeline on the training data
pipeline0.fit(X_train, y_train)

# Score the pipeline on the testing data
pipeline0.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6309938336477708

#### LR with GridSearch

In [None]:
params = {
    'logisticregression__C': [0.1, 1.0, 10.0, 100.0],
    'logisticregression__penalty': ['l1', 'l2'] # ,
#     'columntransformer__pipeline__simpleimputer__strategy': ["mean", "median"]
}

grid_search = GridSearchCV(pipeline0, params, n_jobs=-1, verbose=1, cv=10)
# pipeline0.get_params().keys()

In [None]:
grid_search.fit(X_train, y_train)

grid_search.score(X_test, y_test)

### Decision Tree Classifier

In [None]:
clf1 = DecisionTreeClassifier(random_state=0)

pipeline1 = make_pipeline(encoder, clf1)
pipeline1

In [None]:
# Fit the pipeline on the training data
pipeline1.fit(X_train, y_train)

# Score the pipeline on the testing data
pipeline1.score(X_test, y_test)

In [None]:
# Pickle model and write to hard drive
joblib.dump(pipeline1, "models/CategoricalNB.pkl")

### SVC

In [None]:
clf2 = SVC()

pipeline2 = make_pipeline(encoder, clf2)
pipeline2

In [None]:
# Fit the pipeline on the training data
pipeline2.fit(X_train, y_train)

# Score the pipeline on the testing data
pipeline2.score(X_test, y_test)

In [None]:
# Pickle model and write to hard drive
joblib.dump(pipeline2, "models/SVC.pkl")

### Neural Network

In [None]:
clf3 = MLPClassifier(random_state=1, max_iter=300)

pipeline3 = make_pipeline(encoder, clf3)
pipeline3

In [None]:
# Fit the pipeline on the training data
pipeline3.fit(X_train, y_train)

In [None]:
# Score the pipeline on the testing data
pipeline3.score(X_test, y_test)

In [None]:
# Pickle model and write to hard drive
joblib.dump(pipeline3, "models/MLPClassifier.pkl")

In [None]:
# clf3 = KMeans(n_clusters=3, max_iter=100, random_state=0)
# clf3.fit(X_train, y_train)

## Predictions for Kaggle's test_users.csv

In [None]:
# filepath = "airbnb-recruiting-new-user-bookings/test_users_.csv"
# df = pd.read_csv(filepath, dtype=dtypes[:-1], na_values=['-unknown-', 'NDF', '<NA>'], parse_dates=parse_dates, infer_datetime_format=True)

In [None]:
# id_list = list(df['MachineIdentifier'])
# X_kaggle = df.drop(columns=['MachineIdentifier']))

In [None]:
# y_pred = pipeline.transform(X_kaggle.values)

In [None]:
# with open("submission.csv", "w", newline="") as csvfile:
#     csv_writer = csv.writer(csvfile, delimiter=',', quotechar='"')
#     csv_writer.writerows(zip(id_list, y_pred))