In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root, file_name)
        print(full_path)

# Any results you write to the current directory are saved as output.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

Airbnb is an online marketplace for arranging or offering lodging, primarily homestays, or tourism experiences. New users on Airbnb can book a place to stay in 34,000+ cities across 190+ countries. By analyzing past data provided by Airbnb, we aim to accurately predict where a new user will book their first trip. With such predictions, Airbnb can share more personalized content with its community, decrease the average time to first booking, and better forecast demand. In this notebook, we predict the first travel destination of a new user based on his/her registration information. 

Data:
1.	Train/Test_users.csv: there are 16 features, including User_ID, gender, age, signup_method, language, affiliate_channel, etc.
2.	Age_gender_bkts.csv: summary statistics of users' age group, gender, country of destination
3.	Countries.csv: summary statistics of destination countries and their locations
4.	Sessions.csv: web sessions log for users, includes user’s actions, device type, etc.

In [None]:
# Load the competition tables from the zipped CSV files under /kaggle/input.
# `train` / `test` are the user tables; `sessions` is the web-session log.
# NOTE(review): `age_gender` is loaded here but is not referenced again in the
# visible cells of this notebook.
train = pd.read_csv('/kaggle/input/airbnb-recruiting-new-user-bookings/train_users_2.csv.zip')
test = pd.read_csv('/kaggle/input/airbnb-recruiting-new-user-bookings/test_users.csv.zip')
age_gender = pd.read_csv('/kaggle/input/airbnb-recruiting-new-user-bookings/age_gender_bkts.csv.zip')
sessions = pd.read_csv('/kaggle/input/airbnb-recruiting-new-user-bookings/sessions.csv.zip')

In [None]:
# Summarise the training frame (columns, dtypes, non-null counts) to spot missing data.
train.info()

In [None]:
# Peek at the first rows of the training users.
train.head()

* Except for the destination column, there are 16 features including user id, date account created, timestamp first active, date first booking, gender, age, signup method, signup flow, language, affiliate channel, affiliate provider, first affiliate tracked, signup app, first device type, first browser. 
* Among those, date first booking is useless information because we want to predict before their first booking. Thus, as the first step, I drop the 'date_first_booking' and 'country_destination' columns and concatenate the train and test datasets.
* Second, fill each missing value with the last preceding value in the same column (forward fill). 

In [None]:
# Remove the booking date (unknown at prediction time) from both sets,
# and the target column from the training set.
X_train = train.drop(columns=['date_first_booking', 'country_destination'])
X_test = test.drop(columns=['date_first_booking'])

In [None]:
# Keep the raw destination labels, then stack train and test rows (train first)
# so that feature engineering is applied to both consistently.
y_des = train['country_destination'].to_numpy()
X = pd.concat([X_train, X_test], axis=0, ignore_index=True)
X.shape

Fill missing data with its last value

In [None]:
# Preview of a forward fill (each NaN replaced by the previous non-null value in
# its column). `fillna(method='pad')` is deprecated in recent pandas; `ffill()`
# is the equivalent supported call.
# NOTE(review): the result is only displayed, not assigned back to `X`, so `X`
# itself still contains its missing values after this cell.
X.ffill().head()

In the age column, there are some meaningless values such as 1 and 2014. We don't want such values in our analysis. Therefore, I set ages outside the range 13 to 90 to -1. 

In [None]:
# Ages above 90 or below 13 are treated as invalid and replaced by the sentinel -1.
# NaN ages are untouched here (comparisons with NaN are False).
invalid_age = (X['age'] > 90) | (X['age'] < 13)
X.loc[invalid_age, 'age'] = -1
X['age'].describe()

In [None]:
# Impute missing ages with the mean of the *valid* ages only. Rows carrying the
# -1 "invalid age" sentinel are excluded so they do not drag the mean down.
valid_age_mean = X.loc[X['age'] >= 0, 'age'].mean()
X['age'] = X['age'].fillna(valid_age_mean)

Split 'date_account_created' as dac_year, dac_month, dac_day

In [None]:
# Break 'date_account_created' (text of the form YYYY-MM-DD) into integer
# year / month / day columns, then discard the original column.
dac = (
    X['date_account_created']
    .astype(str)
    .str.split('-', expand=True)
    .astype(int)
    .to_numpy()
)
X['dac_year'], X['dac_month'], X['dac_day'] = dac[:, 0], dac[:, 1], dac[:, 2]
X = X.drop(columns=['date_account_created'])
X.head()

Add feature "session_count" to dataset.

In [None]:
# Number of session-log rows per user id (rows without a user id are not counted).
df = sessions['user_id'].value_counts()
print(df.shape)
print(df.head())

In [None]:
# Turn the per-user counts into a two-column frame: 'session_count' and 'id'.
# The column is named explicitly via `to_frame(name=...)` instead of renaming
# 'user_id': since pandas 2.0 `value_counts()` names its result 'count', so the
# old rename silently did nothing and 'session_count' was never created.
df = df.to_frame(name='session_count')
df['id'] = df.index  # user id, used as the merge key against X
df.head()

In [None]:
# Attach the per-user session count; users with no session rows get -1.
X = pd.merge(X, df, how='left', on=['id'])
# Assign the filled column back explicitly: `X.session_count.fillna(..., inplace=True)`
# operates on a temporary Series and is not guaranteed to update X (it does not
# under pandas Copy-on-Write), which would make the int cast fail on NaN.
X['session_count'] = X['session_count'].fillna(-1).astype(int)

Split 'timestamp_first_active' as tfa_year, tfa_month, tfa_day

In [None]:
# 'timestamp_first_active' is read as a digit string and cut into six integer
# fields at fixed positions; only the first three (year, month, day) are kept
# as features before the original column is dropped.
tfa_slices = [(0, 4), (4, 6), (6, 8), (8, 10), (10, 12), (12, 14)]
tfa = np.vstack([
    [int(stamp[start:stop]) for start, stop in tfa_slices]
    for stamp in X['timestamp_first_active'].astype(str)
])
for position, column in enumerate(['tfa_year', 'tfa_month', 'tfa_day']):
    X[column] = tfa[:, position]
X = X.drop(columns=['timestamp_first_active'])

some visualizations

In [None]:
# Age distribution of the training users.
# Ages above 90 or below 10 are replaced by 36 for plotting purposes only;
# missing ages stay missing and are dropped before plotting.
# NOTE(review): the lower bound here (10) differs from the one used when
# cleaning X (13) -- confirm which is intended.
raw_age = train['age']
train['corrected_age'] = raw_age.mask((raw_age > 90) | (raw_age < 10), 36)
# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
# scale is its documented replacement.
sns.histplot(train['corrected_age'].dropna(), kde=True, stat='density')

In [None]:
# Percentage of users (train + test) per signup method, NaN counted as its own bar.
signup_method = X.signup_method.value_counts(dropna=False) / len(X) * 100
# `kind` must be passed by keyword: Series.plot rejects positional arguments
# with a TypeError on pandas >= 1.0.
signup_method.plot(kind='bar', rot=0)
plt.xlabel('Sign up method')
plt.ylabel('Percentage of signup_method')

Percentage of sign-up methods used. This is the method that users chose to sign up for their new accounts, including basic, facebook, google and weibo. 

In [None]:
# Percentage of users (train + test) per gender value, NaN counted as its own bar.
gender = X.gender.value_counts(dropna=False) / len(X) * 100
# `kind` must be passed by keyword: Series.plot rejects positional arguments
# with a TypeError on pandas >= 1.0.
gender.plot(kind='bar', rot=0)
plt.xlabel('gender')
plt.ylabel('Percentage of gender')

Gender percentages. The gender classes are unknown, female, male, and other. Unknown means that the user did not provide a gender when signing up. The number of females is slightly greater than the number of males. Few users are in the other class. 

In [None]:
# Percentage of training users per destination country.
des_countries = train.country_destination.value_counts(dropna=False) / len(train) * 100
# `kind` must be passed by keyword: Series.plot rejects positional arguments
# with a TypeError on pandas >= 1.0.
des_countries.plot(kind='bar', rot=0)
plt.xlabel('Destination country')
plt.ylabel('Percentage of booking')

Percentage of bookings by destination country. NDF means that the user did not book a trip to any country. Less than 30% went to the US. All other countries account for only a small percentage of bookings.

In [None]:
# Relationship between age and destination country (training users, raw ages).
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(10, 7))
sns.boxplot(data=train, x='country_destination', y='age', ax=ax)
ax.set_xlabel('Destination Country box plot', fontsize=15)
ax.set_ylabel('Age of Users', fontsize=15)
ax.tick_params(labelsize=12)

The relationship between age and destination country. Users booking trips to Spain, Portugal and the Netherlands tend to be younger, whereas users booking trips to Great Britain tend to be older. 

In [None]:
# Relationship between age and signup method (training users, raw ages).
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=train, x='signup_method', y='age', ax=ax)
ax.set_xlabel('Signup method', fontsize=15)
ax.set_ylabel('age', fontsize=15)
ax.tick_params(labelsize=12)

In [None]:
# Relationship between age and signup app (training users, raw ages).
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=train, x='signup_app', y='age', ax=ax)
ax.set_xlabel('Signup app', fontsize=15)
ax.set_ylabel('Age of Users', fontsize=15)
ax.tick_params(labelsize=12)

People who use the web to sign up for an account tend to be older than those using other sign-up apps.

In [None]:
# Relationship between age and language (training users, raw ages).
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=train, x='language', y='age', ax=ax)
ax.set_xlabel('Language', fontsize=15)
ax.set_ylabel('Age of Users', fontsize=15)
ax.tick_params(labelsize=12)

In [None]:
# Relationship between age and gender (training users, raw ages).
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=train, x='gender', y='age', ax=ax)
ax.set_xlabel('Gender', fontsize=15)
ax.set_ylabel('Age of Users', fontsize=15)
ax.tick_params(labelsize=10)

There is no significant difference in age among the different genders, but the average age of females is slightly lower than that of the other genders. 

In [None]:
# Line chart of how many accounts were created on each date.
# Adds a parsed-datetime copy of 'date_account_created' to `train`.
train['date_account_created_new'] = pd.to_datetime(train['date_account_created'])
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(10, 8))
accounts_per_day = train['date_account_created_new'].value_counts()
accounts_per_day.plot(kind='line', linewidth=1, color='#1F618D', ax=ax)
ax.set_xlabel('Date ', fontsize=20)
ax.set_ylabel('Number of account created ', fontsize=15)
ax.tick_params(labelsize=12)

One-hot encoding with `pd.get_dummies`

In [None]:
# Columns of X that the next cell replaces with one-hot (dummy) columns.
oh_features = ['gender', 'signup_method', 'signup_flow', 'language',
                'affiliate_channel', 'affiliate_provider',
                'first_affiliate_tracked', 'signup_app',
                'first_device_type', 'first_browser']

In [None]:
# Build one dummy frame per categorical column (prefixed with the column name),
# drop the original columns, and append all dummy frames in a single concat.
dummy_frames = [pd.get_dummies(X[col], prefix=col) for col in oh_features]
X = pd.concat([X.drop(columns=oh_features), *dummy_frames], axis=1)
X.head()

In [None]:
# Split the processed frame back into its train and test parts (train rows come
# first, in their original order) and drop the 'id' column from both.
n_train_rows = len(train)
X_train = X.iloc[:n_train_rows].drop(columns=['id'])
X_test = X.iloc[n_train_rows:].drop(columns=['id'])

Label-encode the target column y

In [None]:
# Map destination labels to integer codes; `le` is kept so predictions can be
# mapped back to country labels later.
le = LabelEncoder()
le.fit(y_des)
y_trans = le.transform(y_des)
y_trans.shape

 train test split

In [None]:
# Hold out 30% of the labelled rows for evaluation; the fixed seed makes the
# split repeatable.
dtrain, dtest, train_label, test_label = train_test_split(
    X_train,
    y_trans,
    test_size=0.3,
    random_state=817,
)

logistic regression

In [None]:
# Logistic regression with default settings, scored by hold-out accuracy.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

logreg = LogisticRegression().fit(dtrain, train_label)
pred_log = logreg.predict(dtest)
print(accuracy_score(test_label, pred_log))

Random Forest

In [None]:
# Random forest scored by hold-out accuracy.
from sklearn.ensemble import RandomForestClassifier
# A fixed random_state (the same seed as the train/test split) makes the forest,
# and therefore the reported accuracy and importances, reproducible across runs.
rfc = RandomForestClassifier(max_depth=20, n_estimators=100, random_state=817)
rfc.fit(dtrain, train_label)
pred = rfc.predict(dtest)
print(accuracy_score(test_label, pred))

In [None]:
# Horizontal bar chart of the 20 largest random-forest feature importances,
# with the most important feature at the top.
fi = pd.Series(rfc.feature_importances_, index=dtrain.columns)
fn = fi.sort_values()
ax = fn.tail(20).plot(kind='barh', color='r', figsize=(25, 12))
ax.set_xlabel('importance', fontsize=15)
ax.set_title('Random Forest Importance', fontsize=20)
ax.tick_params(labelsize=15)

Decision Tree

In [None]:
# Depth-limited decision tree scored by hold-out accuracy.
from sklearn.tree import DecisionTreeClassifier
# A fixed random_state (the same seed as the train/test split) keeps tie-breaking
# between equally good splits, and so the reported accuracy, reproducible.
dtc = DecisionTreeClassifier(max_depth=10, random_state=817)
dtc.fit(dtrain, train_label)
pred = dtc.predict(dtest)
print(accuracy_score(test_label, pred))

XGBClassifier

In [None]:
# XGBoost classifier scored by hold-out accuracy.
from xgboost.sklearn import XGBClassifier
# The estimator is deliberately not called `xgb`: a later cell runs
# `import xgboost as xgb`, and sharing that name makes the notebook depend on
# cell execution order.
xgb_clf = XGBClassifier(max_depth=4, learning_rate=0.03, n_estimators=100,
                        objective='multi:softprob', subsample=0.6,
                        colsample_bytree=0.6, seed=40)
xgb_clf.fit(dtrain, train_label)
pred = xgb_clf.predict(dtest)
print(accuracy_score(test_label, pred))

Predict using XGBClassifier

In [None]:
# Refit the XGBoost classifier on all labelled rows and predict the test users.
# The estimator is deliberately not called `xgb`: a later cell runs
# `import xgboost as xgb`, and sharing that name makes the notebook depend on
# cell execution order.
xgb_full = XGBClassifier(max_depth=4, learning_rate=0.03, n_estimators=100,
                         objective='multi:softprob', subsample=0.6,
                         colsample_bytree=0.6, seed=40)
xgb_full.fit(X_train, y_trans)
XGBC_pred_test = xgb_full.predict(X_test)             # most likely class code per user
XGBC_pred_test_prob = xgb_full.predict_proba(X_test)  # per-class probabilities, used for the top-5 ranking

In [None]:
# For every test user, emit the five most probable destinations, most likely
# first; each user id is repeated five times to line up with its countries.
ids_test = test['id']

# Sort each row ascending, reverse to descending, keep the first five class codes.
top5_codes = np.argsort(XGBC_pred_test_prob, axis=1)[:, ::-1][:, :5]

# Positional values (not label lookups) so this does not depend on `test`'s index,
# and one vectorised inverse_transform instead of one call per test row.
ids = np.repeat(ids_test.to_numpy(), 5).tolist()
countries = le.inverse_transform(top5_codes.ravel()).tolist()

In [None]:
# Assemble the (id, country) rows and write them out as the submission file.
submission = pd.DataFrame(list(zip(ids, countries)), columns=["id", "country"])
submission.to_csv('submission_XGBC.csv', index=False)

Predict using XGBoost with cross-validation; score 0.86491 as evaluated by Kaggle

In [None]:
# Number of distinct destination labels in the training target.
n_labels = np.unique(y_des).size
n_labels

In [None]:
# Booster parameters for the native XGBoost API (xgb.cv / xgb.train).
params = {
    'objective': 'multi:softprob',   # per-class probabilities
    'eval_metric': 'merror',         # multiclass classification error rate
    'num_class': n_labels,
    'eta': 0.5,
    'max_depth': 6,
    'subsample': 0.5,
    'colsample_bytree': 0.3,
    # 'silent' was removed from XGBoost; 'verbosity': 0 is the supported way to
    # suppress library messages.
    'verbosity': 0,
    'seed': 123
}

In [None]:
import xgboost as xgb
num_boost_round = 50

# 5-fold cross-validation over all labelled rows; `res` holds one row of
# train/test metric means and standard deviations per boosting round.
Dtrain = xgb.DMatrix(X_train, y_trans)
# Progress printing and early stopping are requested through xgb.cv's own
# keyword arguments: the legacy `xgb.callback.print_evaluation` / `early_stop`
# helpers were removed from newer XGBoost releases.
# NOTE(review): a patience of 50 with only 50 rounds means early stopping can
# never trigger; the value is kept unchanged from the original.
res = xgb.cv(params, Dtrain, num_boost_round=num_boost_round, nfold=5,
             early_stopping_rounds=50, verbose_eval=True, show_stdv=True)

In [None]:
# Train the final booster with the number of rounds that minimised the mean
# cross-validated test error.
# `idxmin()` returns the 0-based index of the best round, so the number of
# rounds to train is that index plus one (an index of 0 means "one tree",
# not "no trees").
best_round_index = int(res['test-merror-mean'].idxmin())
num_boost_round = best_round_index + 1
print(num_boost_round)
clf = xgb.train(params, Dtrain, num_boost_round=num_boost_round)
clf

In [None]:
import operator

# Feature -> score mapping from the trained booster, ordered by ascending score
# and laid out as a two-column frame.
importance = clf.get_fscore()
by_score = operator.itemgetter(1)
ranked_features = sorted(importance.items(), key=by_score)
importance_df = pd.DataFrame(ranked_features, columns=['feature', 'fscore'])

In [None]:
# Keep only the 20 highest-scoring features (the frame is sorted ascending).
importance_df = importance_df.tail(20)

In [None]:
# Horizontal bar chart of the top XGBoost feature scores.
# DataFrame.plot(figsize=...) opens its own figure, so no separate plt.figure()
# call is made (it would only leave an empty extra figure in the output).
ax = importance_df.plot(kind='barh', x='feature', y='fscore',
                        legend=False, figsize=(20, 10))
ax.set_title('XGBoost Feature Importance', fontsize=25)
ax.set_xlabel('Relative importance', fontsize=20)
ax.set_ylabel('Features', fontsize=20)
ax.tick_params(labelsize=15)

Findings:
I tried several classification algorithms. After tuning their parameters, they all performed similarly. I finally chose XGBoost to predict on the test dataset. It reaches 63% accuracy and a score of 86.5% on Kaggle. Developing high-resolution cross-validation did not help with feature selection or model ensembling. I think this is because the plain XGBoost classifier does not overfit; since cross-validation mainly helps to avoid overfitting, it does not help in this case. Age is the most important feature according to both the XGBoost and Random Forest algorithms, which means that it is a strong predictor of the outcome. This makes sense because, other than age, little information is provided when users create their accounts. Meanwhile, my relevance analysis showed that people in different age intervals tend to travel to different countries. Session count is another good predictor; it is the total number of actions a new user performs while browsing the app. The account creation date and the timestamp of the first active day are also important predictors that greatly affect the outcome. Gender carries less weight as a predictor, which suggests that males and females do not differ much in their choice of destination. 