In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px

In [None]:
# Read the zipped user tables for the labelled (train) and unlabelled (test) sets.
train, test = (
    pd.read_csv(path)
    for path in (
        '../input/airbnb-recruiting-new-user-bookings/train_users_2.csv.zip',
        '../input/airbnb-recruiting-new-user-bookings/test_users.csv.zip',
    )
)
# Show (rows, columns) of both frames as the cell output.
train.shape, test.shape

# Summary statistics
- countries.csv : summary statistics of destination countries in this dataset and their locations
- age_gender_bkts.csv : summary statistics of users' age group, gender, country of destination

Before exploring the training data, let's first take a look at the summary data.

The most popular destinations are the US, Germany and France, in that order, and users aged 45-54 make up the largest share. There is no significant difference in the number of users between males and females.

In [None]:
# Read the two summary-statistics tables that ship with the competition data.
country, age_gender = (
    pd.read_csv(path)
    for path in (
        '../input/airbnb-recruiting-new-user-bookings/countries.csv.zip',
        '../input/airbnb-recruiting-new-user-bookings/age_gender_bkts.csv.zip',
    )
)

In [None]:
# Display the destination-country summary table read from countries.csv.
country

In [None]:
# Display the age/gender/destination summary table read from age_gender_bkts.csv.
age_gender

In [None]:
# Nested treemap: destination country -> gender -> age bucket, sized by population.
treemap_levels = ['country_destination', 'gender', 'age_bucket']
fig = px.treemap(
    age_gender,
    path=treemap_levels,
    values='population_in_thousands',
    title='Distribution by country, gender & age',
)
fig.show()

In [None]:
# Rank of every age bucket, used to sort rows numerically before plotting
# (a plain string sort would misplace labels such as '100+' and '5-9').
order = {'100+':21, '95-99':20, '90-94':19, '85-89':18, '80-84':17, '75-79':16, '70-74':15, '65-69':14, 
         '60-64':13, '55-59':12, '50-54':11, '45-49':10, '40-44':9, '35-39':8, '30-34':7, '25-29':6, 
         '20-24':5, '15-19':4, '10-14':3, '5-9':2, '0-4':1}

# Total population per (gender, age bucket), summed over all destination countries.
# This aggregation used to be computed twice; once is enough.
age_gender_ratio = age_gender.groupby(['gender','age_bucket']).population_in_thousands.sum().reset_index()
age_gender_ratio['order'] = age_gender_ratio['age_bucket'].map(order)
fig = px.histogram(age_gender_ratio.sort_values('order'), x="age_bucket", y="population_in_thousands", color="gender")
fig.show()

# Data cleansing
- Age: restrict age range between 18 and 100
- Gender: convert all cells marked by "unknown" into nan
- First_browser: convert all cells marked by "unknown" into nan
- date_account_created: convert to datetime format
- timestamp_first_active: convert to datetime format

In [None]:
# Missing-value count for every column of the training set.
train.isna().sum()

In [None]:
# Missing-value count for every column of the test set.
test.isna().sum()

Whether the data are valid is also important.
It makes no sense for people aged 1, or even 2xxx, to be able to register; these values are probably typos.
To keep it simple, just set the age of users outside the 18-100 range to NaN.

In [None]:
# Summary statistics of the raw age column (training set).
train['age'].describe()

In [None]:
# Summary statistics of the raw age column (test set).
test['age'].describe()

In [None]:
# Blank out ages outside the inclusive 18-100 range in both data sets.
for users in (train, test):
    plausible_age = users['age'].between(18, 100)
    users.loc[~plausible_age, 'age'] = np.nan

Although there are no null cells in the gender column, missing values are actually recorded as "-unknown-".
To avoid confusing the model, we convert them into NaN.

In [None]:
# Distinct gender labels present in each data set.
train['gender'].unique(), test['gender'].unique()

In [None]:
# Turn the '-unknown-' placeholder into a real missing value.
# The result is assigned back to the frame: calling replace(inplace=True) on a
# column obtained through attribute access is a chained in-place update, which
# warns in pandas 2.x and leaves the frame untouched under Copy-on-Write.
train['gender'] = train['gender'].replace('-unknown-', np.nan)
test['gender'] = test['gender'].replace('-unknown-', np.nan)

A similar case occurs in "first_browser".

In [None]:
# Distinct first_browser labels present in each data set.
train['first_browser'].unique(), test['first_browser'].unique()

In [None]:
# Turn the '-unknown-' placeholder into a real missing value.
# Assigning the result back (instead of replace(inplace=True) on an
# attribute-accessed column) keeps this working under pandas Copy-on-Write.
train['first_browser'] = train['first_browser'].replace('-unknown-', np.nan)
test['first_browser'] = test['first_browser'].replace('-unknown-', np.nan)

In [None]:
# Convert the date columns of both data sets to datetime64.
# Only the first 8 characters of timestamp_first_active (year, month, day) are
# kept; the explicit '%Y%m%d' format stops pandas from having to guess how an
# all-digit string should be read.
for users in (train, test):
    users['date_account_created'] = pd.to_datetime(users['date_account_created'], format='%Y-%m-%d')
    users['timestamp_first_active'] = pd.to_datetime(
        users['timestamp_first_active'].astype(str).str[:8], format='%Y%m%d')
    users['date_first_booking'] = pd.to_datetime(users['date_first_booking'], format='%Y-%m-%d')

# EDA

The age distribution differs little between genders.
The test dataset contains more younger users.
This is unlike the 2015 summary statistics, where the middle-aged group (45-54) has the largest share. It is interesting that more and more older people are willing to use Airbnb.

In [None]:
# Age statistics after cleaning (training set).
print("train's age statistics")
train['age'].describe()

In [None]:
# Age statistics after cleaning (test set).
print("test's age statistics")
test['age'].describe()

In [None]:
# Side-by-side box plots of age per gender: left panel train, right panel test.
fig, ax = plt.subplots(1, 2, figsize=(12,5))
fig.suptitle('Difference of age_gender distribution between train/test dataset')
for panel, (panel_title, users) in zip(ax, (('train', train), ('test', test))):
    sns.boxplot(x='gender', y='age', data=users, ax=panel)
    panel.set_title(panel_title)

# Day diff (1st activity, sign-up, 1st booking) 

- Definition

timestamp_first_active: timestamp of the first activity, note that it can be earlier than date_account_created or date_first_booking because a user can search before signing up

- Interpretation

It's quite surprising that people usually sign up only after their first activity on Airbnb, perhaps because they need to sign in to unlock some features or view the accommodation.

Usually, users have their first booking 44 days after sign-up

In [None]:
# Gap between first recorded activity and account creation (first activity minus sign-up).
sign_diff_train = train['timestamp_first_active'].sub(train['date_account_created'])
sign_diff_test = test['timestamp_first_active'].sub(test['date_account_created'])

In [None]:
# Gap between account creation and first booking (first booking minus sign-up).
book_diff_train = train['date_first_booking'].sub(train['date_account_created'])
book_diff_test = test['date_first_booking'].sub(test['date_account_created'])

In [None]:
# Mean gap between first activity and sign-up, rounded to the nearest hour.
# Lowercase 'h' is used because the uppercase 'H' frequency alias is deprecated
# in recent pandas releases.
sign_diff_train.mean().round('h')

In [None]:
# Summary statistics of the first-activity-minus-sign-up gap in the test set.
sign_diff_test.describe()

In [None]:
# Two-row comparison table of the mean gaps (rounded to the nearest hour) for
# train and test. Lowercase 'h' replaces the deprecated 'H' frequency alias.
pd.DataFrame(
    (
        ['Day diff between 1st activity & sign-up',
         sign_diff_train.mean().round('h'), sign_diff_test.mean().round('h')],
        ['Day diff between sign-up & 1st booking',
         book_diff_train.mean().round('h'), book_diff_test.mean().round('h')],
    ),
    columns=[' ', 'Train', 'Test'],
)

# Language vs. tendency on destination
- 'NDF' --> no booking

In [None]:
# Number of users for every (language, destination) pair.
lang = (
    train
    .groupby(['language', 'country_destination'])['id']
    .count()
    .reset_index()
)

In [None]:
# Language x destination matrix of user counts, drawn as a heatmap.
lang_by_destination = lang.pivot_table(
    values='id', index='language', columns='country_destination', aggfunc='sum')
plt.figure(figsize=(10,10))
fig = sns.heatmap(lang_by_destination, cmap='Reds')
# Free-text caption placed above the plot area.
plt.text(0, -2,"Where do they go for their 1st trip of different countries?",
         family='sans-serif', fontsize=15, weight='bold')

* px.sunburst: Note that id and parent should not be provided if path is given

As I have used the column "ID" to count the number of users, it is necessary to rename the column "ID"; otherwise, the values cannot be shown.

In [None]:
# Rename the count column so it no longer carries the name 'id'.
lang = lang.rename({'id': 'no'}, axis='columns')

In [None]:
# Share of users per language, largest first.
users_per_language = (
    train.groupby('language')['id']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)
px.pie(users_per_language, values='id', names='language')

Generally speaking, English speakers make up the vast majority (over 95%), and most of them have not made a booking yet. The next most popular destination for English speakers is the US. This may be because Airbnb originated in the US and first became popular there. Also, it may be common for people in the US (the 3rd largest population in the world) to use Airbnb for domestic travel.

Notably, around 58% of users still haven't made a booking, so it is to be expected that roughly half of the predictions will be "NDF" (no booking).

In [None]:
# Two-level sunburst: language in the inner ring, destination in the outer ring.
fig = px.sunburst(
    lang,
    path=['language', 'country_destination'],
    values='no',
)
fig.show()

In [None]:
# Percentage of all counted users whose destination is 'NDF'.
ndf_users = lang.loc[lang['country_destination']=='NDF', 'no'].sum()
ndf_share = round((ndf_users/lang.no.sum())*100, 2)
print("% of users without booking: {}%".format(ndf_share))

In [None]:
# User count per language, most common first.
train.groupby('language')['id'].count().sort_values(ascending=False)

The most popular destinations for English speakers are the US, other countries not listed, France, Italy and Germany.
As English speakers dominate the whole dataset, it is difficult to see the preferences of other language speakers, so I draw another graph excluding English speakers.

For non-English speakers, the US is still the most popular destination. This does not mean that people around the world always hope to visit the US, as the dataset only covers bookings on Airbnb (which was founded in the US). There are no doubt more Airbnb hosts in the US and, in turn, more supply for visitors who choose Airbnb as accommodation.

Besides, it is interesting that their home country is usually the second most popular destination (e.g. French speakers choose France, German speakers choose Germany), except for Spanish speakers, probably because many people outside Spain also speak Spanish, such as in South America.

In [None]:
# Keep only the (language, destination) pairs where a booking was made.
made_booking = lang['country_destination'].ne('NDF')
lang_with_booking = lang[made_booking]

In [None]:
# Stacked bars: one bar per language, coloured by destination.
fig = px.bar(
    lang_with_booking,
    x="language",
    y="no",
    color="country_destination",
    title="Where do differnt language speaker usually go?",
)
fig.show()

In [None]:
# Drop the English-language rows so the remaining languages become visible.
not_english = lang_with_booking['language'].ne('en')
lang_with_booking_exclude_us = lang_with_booking[not_english]

In [None]:
# Same stacked bar chart, restricted to non-English languages.
fig = px.bar(
    lang_with_booking_exclude_us,
    x="language",
    y="no",
    color="country_destination",
    title="Where do non-english language speaker usually go?",
)
fig.show()

# Feature engineering

Some features have already been massaged in data cleansing step for the ease of EDA.
Before modelling, some features are processed.

- id: drop
- date_first_booking: drop
- date_account_created: replace with 3 columns(year, month, day), drop the original timestamp column
- timestamp_first_active: replace with 3 columns(year, month, day), drop the original timestamp column
- missing value: fill with 0

In [None]:
# Names of the object-typed columns in the test set.
category = test.columns[test.dtypes == 'object'].tolist()

In [None]:
# For every object column, report whether train and test contain the same set
# of distinct values. Value sets are compared because `==` on two unique()
# arrays is elementwise: it depends on value order and fails or misbehaves when
# the arrays differ in length. Missing values are dropped first, since NaN
# never compares equal to itself.
print("Do object columns have the same values in both train & test dataset?")
for i in category:
    train_values = set(train[i].dropna().unique())
    test_values = set(test[i].dropna().unique())
    print("{}: {}".format(i, train_values == test_values))

In [None]:
# Stack train on top of test so both receive exactly the same feature columns,
# then discard the identifier and the first-booking date.
df = pd.concat([train, test], axis=0, ignore_index=True)
df = df.drop(columns=['id', 'date_first_booking'])

# Number of leading rows that come from the training set (used to split later).
train_size = len(train)

# Expand each date column into separate year / month / day features.
for prefix, source in (('sign_up', 'date_account_created'),
                       ('active', 'timestamp_first_active')):
    date_parts = df[source].dt
    df[prefix + '_year'] = date_parts.year
    df[prefix + '_month'] = date_parts.month
    df[prefix + '_day'] = date_parts.day

# The original datetime columns are no longer needed.
df = df.drop(columns=['date_account_created', 'timestamp_first_active'])

# Fill every remaining missing value with 0.
df = df.fillna(0)

# Remove the target so only features remain.
df = df.drop(columns='country_destination')

In [None]:
# One-hot encoding: collect the names of the object-typed feature columns.
category = df.columns[df.dtypes == 'object'].tolist()
def convertion(df, columns=None):
    """Return a copy of ``df`` with the given columns one-hot encoded.

    Each listed column is removed and its dummy columns (named
    ``<column>_<value>``) are appended on the right, in the order the
    columns are listed. The frame passed in is left unchanged; the previous
    implementation dropped the first encoded column from the caller's frame
    in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode.
    columns : iterable of str, optional
        Columns to encode. Defaults to the notebook-level ``category`` list.

    Returns
    -------
    pandas.DataFrame
        New frame with the encoded columns replaced by their dummies.
    """
    cols = category if columns is None else list(columns)
    encoded = df
    for col in cols:
        dummy = pd.get_dummies(encoded[col], prefix=col)
        # drop() without inplace returns a new frame, so the input is never mutated.
        encoded = pd.concat((encoded.drop(columns=[col]), dummy), axis=1)
    return encoded

In [None]:
# Replace every object-typed column of the combined frame with its one-hot columns.
df = convertion(df)

In [None]:
from sklearn.preprocessing import LabelEncoder
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error

In [None]:
# Feature rows that belong to the training set (the first train_size rows).
X = df.iloc[:train_size]
# Encode the destination labels as integer class ids.
le = LabelEncoder()
y = le.fit_transform(train["country_destination"])

In [None]:
# Split into train and validation sets: hold out 20% of the labelled rows for
# validation and fit on the remaining 80%. test_size is the held-out fraction,
# so the previous value of 0.8 trained on only 20% of the data.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=46)

# XGboost hyperparameter

- n_estimators: #of gradient boosted trees, Equivalent to no of boosting rounds
- max_depth: max. tree depth
- learning_rate: aka. “eta”
- objective: specify learning task & objective
- subsample: subsample ratio of the training instance.
- colsample_bytree: subsample ratio of columns when constructing each tree
- seed: random number seed (makes the row/column subsampling reproducible)

sources: https://xgboost.readthedocs.io/en/latest/python/python_api.html

In [None]:
# Gradient-boosted tree classifier configured with the hyperparameters described above.
xgb = XGBClassifier(
    n_estimators=43,
    max_depth=6,
    learning_rate=0.2,
    objective='multi:softprob',
    subsample=0.6,
    colsample_bytree=0.5,
    seed=1,
)
xgb.fit(X_train, y_train)
# Predicted class ids for the validation rows.
y_pred = xgb.predict(X_val)

In [None]:
# Share of validation users whose destination was predicted exactly.
# y_val / y_pred are integer codes produced by LabelEncoder, so the numeric
# distance between two codes (what mean absolute error measures) says nothing
# about classification quality.
accuracy_score(y_val, y_pred)

# Test

In [None]:
# Rows after the first train_size rows of the combined frame, i.e. the test-set
# feature rows (despite the name, these are features, not labels).
y_test = df.iloc[train_size:]

In [None]:
# Predict a class id for every row of y_test (the test-set slice of df).
prediction = xgb.predict(y_test)

In [None]:
# Pair each test user id with its predicted destination (decoded back to the
# original country label) and write the result as the submission file.
submission = {
    'id': np.array(test['id']),
    'country': le.inverse_transform(prediction),
}
submission_frame = pd.DataFrame(submission)
submission_frame.to_csv('Airbnb submission.csv', index=False)

The final score is 0.71150