# Final Project Notebook
Group: 9
Group Members: Shawn Ericksen (ericksen@uark.edu), Garret Fulghum (gmfulghu@uark.edu), Wesley Parker (wgparker@uark.edu)

This practice project focuses on the Airbnb New User Bookings dataset. This can be accessed from: https://www.kaggle.com/competitions/airbnb-recruiting-new-user-bookings/data

## Kaggle Performance Info
Q1 Score: 0.85226

Median Score: 0.86555

Q3 Score: 0.86661

90th Percentile: 0.88024

### Imports and reading dataset into memory

In [1]:
import numpy as np
import pandas as pd
import csv

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn import set_config
set_config(display='diagram')

import joblib

In [2]:
dtypes={'id': 'string', 'date_account_created': 'string', 'timestamp_first_active': 'string', 'date_first_booking': 'string', 'gender': 'category', 'age': 'float64', 'signup_method': 'category', 'signup_flow': 'category', 'language': 'category', 'affiliate_channel': 'category', 'affiliate_provider': 'category', 'first_affiliate_tracked': 'category', 'signup_app': 'category', 'first_device_type': 'category', 'first_browser': 'category', 'country_destination': 'category'}
parse_dates = ['date_account_created', 'timestamp_first_active', 'date_first_booking']

In [3]:
filepath = "airbnb-recruiting-new-user-bookings/train_users_2.csv"
cols = list(pd.read_csv(filepath, nrows=1))[1:]
df = pd.read_csv(filepath, dtype=dtypes, na_values=['-unknown-', 'NDF', '<NA>'], usecols =[i for i in cols if i != 'id'], parse_dates=parse_dates, infer_datetime_format=True)
df['date_first_booking'] = pd.to_datetime(pd.Series(df['date_first_booking']), format='%Y-%m-%d', errors='coerce')

# df.select_dtypes('datetime64[ns]').fillna(pd.NaT)

# for i in df.select_dtypes('datetime64[ns]').columns:
#     df[i].fillna(pd.NaT)

### Diagnostics

In [None]:
print("- - Adult.Data - -")
print("Number of lines present: ", len(df))
print("Number of Columns: ", len(df.columns))

In [None]:
topCount = 5
print("Top ", topCount, " dataFrames:")
print(df.head(topCount))

In [None]:
df.memory_usage(deep=True, index=False).sort_values(ascending=False)

In [None]:
df.memory_usage(deep=True, index=False).sum()

### Preprocessing

In [4]:
X = df.drop(columns=['country_destination'])
X = X.drop(columns=parse_dates) # TODO: Fix datetimes fucking everthing up.
y = df['country_destination']

In [5]:
# This block can be commented out when doing prediction on the Kaggle test.csv
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.37, random_state=0, shuffle=True)

In [6]:
numeric_transformer = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler()
)

encoder = make_column_transformer(
    (numeric_transformer, ['age']),
    (OneHotEncoder(sparse=False, handle_unknown='ignore'), make_column_selector(dtype_include='category')),
    remainder='passthrough'
)
encoder

In [7]:
y_enc = LabelEncoder().fit(y_train.values)
y_train = y_enc.transform(y_train)
y_test = y_enc.transform(y_test.values)

### Logistic Regression

In [8]:
clf0 = LogisticRegression(max_iter=300)

pipeline0 = make_pipeline(encoder, clf0)
pipeline0

In [13]:
# Fit the pipeline on the training data
pipeline0.fit(X_train, y_train)

# Score the pipeline on the testing data
pipeline0.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6045051090823911

#### LR with GridSearch

In [11]:
params = {
    'logisticregression__C': [0.1, 1.0, 10.0, 100.0],
    'logisticregression__penalty': ['l1', 'l2'],
    'columntransformer__pipeline__simpleimputer__strategy': ["mean", "median"]
}

grid_search = GridSearchCV(pipeline0, params, n_jobs=-1, verbose=1, cv=10)
# pipeline0.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'columntransformer', 'logisticregression', 'columntransformer__n_jobs', 'columntransformer__remainder', 'columntransformer__sparse_threshold', 'columntransformer__transformer_weights', 'columntransformer__transformers', 'columntransformer__verbose', 'columntransformer__verbose_feature_names_out', 'columntransformer__pipeline', 'columntransformer__onehotencoder', 'columntransformer__pipeline__memory', 'columntransformer__pipeline__steps', 'columntransformer__pipeline__verbose', 'columntransformer__pipeline__simpleimputer', 'columntransformer__pipeline__standardscaler', 'columntransformer__pipeline__simpleimputer__add_indicator', 'columntransformer__pipeline__simpleimputer__copy', 'columntransformer__pipeline__simpleimputer__fill_value', 'columntransformer__pipeline__simpleimputer__missing_values', 'columntransformer__pipeline__simpleimputer__strategy', 'columntransformer__pipeline__simpleimputer__verbose', 'columntransformer__pipeline__standardsc

In [12]:
grid_search.fit(X_train, y_train)

grid_search.score(X_test, y_test)

Fitting 10 folds for each of 16 candidates, totalling 160 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6045051090823911

### Decision Tree Classifier

In [None]:
clf1 = DecisionTreeClassifier(random_state=0)

pipeline1 = make_pipeline(encoder, clf1)
pipeline1

In [None]:
# Fit the pipeline on the training data
pipeline1.fit(X_train, y_train)

# Score the pipeline on the testing data
pipeline1.score(X_test, y_test)

In [None]:
# Pickle model and write to hard drive
joblib.dump(pipeline1, "models/CategoricalNB.pkl")

### SVC

In [None]:
clf2 = SVC()

pipeline2 = make_pipeline(encoder, clf2)
pipeline2

In [None]:
# Fit the pipeline on the training data
pipeline2.fit(X_train, y_train)

# Score the pipeline on the testing data
pipeline2.score(X_test, y_test)

In [None]:
# Pickle model and write to hard drive
joblib.dump(pipeline2, "models/SVC.pkl")

### Neural Network

In [None]:
clf3 = MLPClassifier(random_state=1, max_iter=300)

pipeline3 = make_pipeline(encoder, clf3)
pipeline3

In [None]:
# Fit the pipeline on the training data
pipeline3.fit(X_train, y_train)

# Score the pipeline on the testing data
pipeline3.score(X_test, y_test)

In [None]:
# Pickle model and write to hard drive
joblib.dump(pipeline3, "models/MLPClassifier.pkl")

In [None]:
# clf3 = KMeans(n_clusters=3, max_iter=100, random_state=0)
# clf3.fit(X_train, y_train)

## Predictions for Kaggle's test_users.csv

In [None]:
# filepath = "airbnb-recruiting-new-user-bookings/test_users_.csv"
# df = pd.read_csv(filepath, dtype=dtypes[:-1], na_values=['-unknown-', 'NDF', '<NA>'], parse_dates=parse_dates, infer_datetime_format=True)

In [None]:
# id_list = list(df['MachineIdentifier'])
# X_kaggle = df.drop(columns=['MachineIdentifier']))

In [None]:
# y_pred = pipeline.transform(X_kaggle.values)

In [None]:
# with open("submission.csv", "w", newline="") as csvfile:
#     csv_writer = csv.writer(csvfile, delimiter=',', quotechar='"')
#     csv_writer.writerows(zip(id_list, y_pred))