In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

In [None]:
# Load every competition table; pandas infers zip compression from the extension.
train = pd.read_csv('../input/airbnb-recruiting-new-user-bookings/train_users_2.csv.zip')
test = pd.read_csv('../input/airbnb-recruiting-new-user-bookings/test_users.csv.zip')
age_gender = pd.read_csv('../input/airbnb-recruiting-new-user-bookings/age_gender_bkts.csv.zip')
countries_df = pd.read_csv('../input/airbnb-recruiting-new-user-bookings/countries.csv.zip')
session_df = pd.read_csv('../input/airbnb-recruiting-new-user-bookings/sessions.csv.zip')

# Report the (rows, columns) shape of each table.
for label, frame in (
    ('train shape: ', train),
    ('test shape: ', test),
    ('age_gender_bkts: ', age_gender),
    ('countries: ', countries_df),
    ('session: ', session_df),
):
    print(label, frame.shape)

# EDA

In [None]:
train.head()

In [None]:
# Earliest and latest account-creation dates in the training set, one per line.
created = train['date_account_created']
print(created.min(), created.max(), sep='\n')

In [None]:
train.info()

Implement an equivalent of R's `glimpse` function to inspect each column's dtype, null count, and null percentage.

In [None]:
# https://gist.github.com/sainathadapa/08c1028c92684fe1ec89ecb5d5629a57
def glimpse(df, maxvals=10, maxlen=110):
    """Print a compact, one-line-per-column overview of a DataFrame.

    The first line printed is the frame's shape. Each following line shows
    the column name, its dtype, the number (and rounded percentage) of
    missing values, and the first few values of the column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. Column labels may be of any type; they are
        converted with ``str`` for display.
    maxvals : int, default 10
        Maximum number of leading values shown per column.
    maxlen : int, default 110
        Maximum number of characters printed per column line.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    print('Shape: ', df.shape)

    n_rows, n_cols = df.shape
    if n_cols == 0:
        # Nothing to describe; also avoids max() on an empty sequence below.
        return

    def pad(items):
        # Left-justify every string to the width of the longest one.
        width = max(len(item) for item in items)
        return [item.ljust(width) for item in items]

    # Column Name (str() so that non-string labels such as ints are accepted)
    toprnt = pad([str(col) for col in df.columns])

    # Column Type
    toprnt = pad([toprnt[i] + ' ' + str(df.iloc[:, i].dtype) for i in range(n_cols)])

    # Num NAs; a frame with zero rows reports 0% instead of dividing by zero.
    num_nas = [df.iloc[:, i].isnull().sum() for i in range(n_cols)]
    num_nas_ratio = [int(round(x * 100 / n_rows)) if n_rows else 0 for x in num_nas]
    num_nas_str = [str(x) + ' (' + str(y) + '%)' for x, y in zip(num_nas, num_nas_ratio)]
    width = max(len(x) for x in num_nas_str)
    num_nas_str = [x.rjust(width) for x in num_nas_str]
    toprnt = [x + ' ' + y + ' NAs' for x, y in zip(toprnt, num_nas_str)]

    # Separator
    toprnt = [x + ' : ' for x in toprnt]

    # Values: the first `maxvals` entries of each column
    toprnt = [
        toprnt[i] + ', '.join(str(v) for v in df.iloc[:maxvals, i])
        for i in range(n_cols)
    ]

    # Trim to maxlen
    for line in toprnt:
        print(line[:maxlen])

In [None]:
glimpse(train)

In [None]:
glimpse(test)

#### Explore values in some variables

First, let's check whether there are any duplicates in the data.

In [None]:
train.first_affiliate_tracked.value_counts()

In [None]:
# Bar chart of `first_affiliate_tracked` frequencies, most common first,
# with each bar annotated by its share of all training rows.
plt.figure(figsize=(14,8))
cd_count_val = train['first_affiliate_tracked'].value_counts()  # computed once, reused below
cd_count_idx = cd_count_val.index
sns.countplot(data = train, x = 'first_affiliate_tracked', order = cd_count_idx, color = sns.color_palette()[3])
plt.xlabel('First Affiliate Tracked')
plt.ylabel('Count')
plt.title('First Affiliate Distribution')

# Iterate positionally: the Series is indexed by category label, so an
# integer key such as cd_count_val[i] relies on a deprecated fallback.
for i, count in enumerate(cd_count_val):
    percentage ='{:0.1f}%'.format(100 * count / len(train))
    # Place the label slightly above the top of bar i.
    plt.text(i, count+1000, percentage, ha='center')

In [None]:
train.affiliate_provider.value_counts()

In [None]:
# Bar chart of the six most common `affiliate_provider` values, with each
# bar annotated by its share of all training rows.
plt.figure(figsize=(14,8))
cd_count_val = train['affiliate_provider'].value_counts().iloc[:6]  # top six, computed once
cd_count_idx = cd_count_val.index
sns.countplot(data = train, x = 'affiliate_provider', order = cd_count_idx, color = sns.color_palette()[3])
plt.xlabel('Affiliate Provider')
plt.ylabel('Count')
plt.title('Affiliate Provider Distribution')

# Iterate positionally: the Series is indexed by category label, so an
# integer key such as cd_count_val[i] relies on a deprecated fallback.
for i, count in enumerate(cd_count_val):
    percentage ='{:0.1f}%'.format(100 * count / len(train))
    # Place the label slightly above the top of bar i.
    plt.text(i, count+1000, percentage, ha='center')

In [None]:
train[train.duplicated()]

No full row duplicates

In [None]:
train['id'].describe()

We can see that every user has only one unique record.

In [None]:
train['gender'].value_counts()

In [None]:
train['age'].describe()

Age seems to have some outliers, so we have to handle them.
* The minimum age to create an account on Airbnb is 18 years.
* The maximum age recorded for a human is 122.  
We will clamp ages to these minimum and maximum values.

In [None]:
sns.histplot(train['age'], kde = True)

In [None]:
# Clamp ages into [18, 122]; missing ages stay missing.
train['age'] = train['age'].clip(lower=18, upper=122)

In [None]:
test['age'].describe()

In [None]:
# Clamp ages into [18, 122]; missing ages stay missing.
test['age'] = test['age'].clip(lower=18, upper=122)

In [None]:
sns.histplot(train['age'], kde = True)

In [None]:
train['signup_flow'].value_counts()

In [None]:
train['signup_method'].value_counts()

In [None]:
train[train.duplicated()]

### Target Variable

In [None]:
# Bar chart of the target variable `country_destination`, most common first,
# with each bar annotated by its share of all training rows.
plt.figure(figsize=(14,8))
cd_count_val = train['country_destination'].value_counts()  # computed once, reused below
cd_count_idx = cd_count_val.index
sns.countplot(data = train, x = 'country_destination', order = cd_count_idx, color = sns.color_palette()[3])
plt.xlabel('Destination Country')
plt.ylabel('Count')
plt.title('Destination Country Distribution')

# Iterate positionally: the Series is indexed by country label, so an
# integer key such as cd_count_val[i] relies on a deprecated fallback.
for i, count in enumerate(cd_count_val):
    percentage ='{:0.1f}%'.format(100 * count / len(train))
    # Place the label slightly above the top of bar i.
    plt.text(i, count+1000, percentage, ha='center')

### Sessions data

In [None]:
session_df.head(10)

In [None]:
session_df.sample(20)

In [None]:
session_df['device_type'].value_counts()

In [None]:
# Bar chart of the five most frequent session device types.
cd_count_val = session_df['device_type'].value_counts()[:5]
cd_count_idx = cd_count_val.index

plt.figure(figsize=(14, 8))
sns.countplot(
    data=session_df,
    x='device_type',
    order=cd_count_idx,
    color=sns.color_palette()[3],
)
plt.xlabel('Device')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.title('Devices Type Distribution');

In [None]:
session_df['action_detail'].unique()

In [None]:
session_df['action_detail'].value_counts()[:10]

In [None]:
# Keep only the session rows whose action detail is a search-results view.
is_search_view = session_df['action_detail'].eq('view_search_results')
view_search_time = session_df[is_search_view]
view_search_time

In [None]:
# Distribution of time spent on search-result views.
# `sns.distplot` is deprecated; `histplot` is what the rest of this notebook
# uses. stat='density' keeps the normalised y-axis that distplot produced.
sns.histplot(view_search_time['secs_elapsed'], kde = True, stat = 'density')

### Apply One-Hot Encoding to Categorical variables

In [None]:
# Detach the target column from the training frame and keep it as `labels`.
labels = train.pop('country_destination')

In [None]:
# Stack train on top of test so both are encoded with the same columns,
# then discard the identifier and the first-booking date.
data = (
    pd.concat([train, test], axis=0, ignore_index=True)
    .drop(columns=['id', 'date_first_booking'])
)

In [None]:
# One-hot encode the categorical features.
cat_features = [
    'gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel',
    'affiliate_provider', 'first_affiliate_tracked', 'signup_app',
    'first_device_type', 'first_browser',
]
# A single call replaces each listed column with `<column>_<value>` indicator
# columns appended after the remaining (non-encoded) columns.
data = pd.get_dummies(data, columns=cat_features)

In [None]:
data.head()

# Feature Engineering

We don't need the `date_first_booking` variable, as more than 50% of its values are null in train and it does not exist in test.

In [None]:
from datetime import datetime
from sklearn.preprocessing import LabelEncoder

In [None]:
# Parse the account-creation date strings into datetime values.
data = data.assign(date_account_created=pd.to_datetime(data['date_account_created']))

In [None]:
# Derive year / month / day features from the account-creation date.
created = data['date_account_created'].dt
data = data.assign(
    ac_year=created.year,
    ac_month=created.month,
    ac_day=created.day,
)

In [None]:
# TODO:: Plot holidays effect on the booking or creating accounts.
##       plot affiliation vs age

In [None]:
# The raw date is no longer needed once its parts have been extracted.
data = data.drop(columns='date_account_created')

In [None]:
# Parse the first-activity timestamp.
# NOTE: this assumes the column holds YYYYMMDDHHMMSS integers (e.g.
# 20090319043255), as in the Airbnb competition data - confirm against
# train.head(). Passing such integers straight to pd.to_datetime interprets
# them as nanoseconds since the Unix epoch, which places every row on
# 1970-01-01 and makes the derived year/month/day features constant.
data['timestamp_first_active'] = pd.to_datetime(
    data['timestamp_first_active'].astype(str), format='%Y%m%d%H%M%S'
)

In [None]:
# Derive year / month / day features from the first-activity timestamp.
first_active = data['timestamp_first_active'].dt
data = data.assign(
    ts_fa_year=first_active.year,
    ts_fa_month=first_active.month,
    ts_fa_day=first_active.day,
)

In [None]:
# The raw timestamp is no longer needed once its parts have been extracted.
data = data.drop(columns='timestamp_first_active')

In [None]:
# Map destination-country strings to integer class ids; `le` is kept so that
# predictions can be mapped back to country codes later.
le = LabelEncoder().fit(labels)
y = le.transform(labels)

# Modeling

In [None]:
# Undo the earlier concatenation: the first len(train) rows are the training
# rows, everything after them belongs to the test set.
n_train = train.shape[0]
X, X_test = data.iloc[:n_train], data.iloc[n_train:]

In [None]:
from xgboost import XGBClassifier, plot_importance
# Multi-class gradient-boosted classifier with default hyperparameters.
# `y` is already integer-encoded by the LabelEncoder, hence use_label_encoder=False.
xgb = XGBClassifier(use_label_encoder=False)                  
# Fit on the full training matrix; the fitted estimator is the cell's output.
xgb.fit(X, y)
##TODO:: Use GridSearch

In [None]:
y_pred = xgb.predict_proba(X_test)

### Feature Importance

In [None]:
plot_importance(xgb, max_num_features=15)

In [None]:
# Accumulators for the submission (five rows per test user) and the test ids.
ids, countries = [], []
test_id = test['id']

In [None]:
# For every test user keep the five classes with the highest predicted
# probability, most likely first. Column j of y_pred corresponds to encoded
# class j, so the sorted column indices are exactly the class ids to decode.
top5 = np.argsort(y_pred, axis=1)[:, ::-1][:, :5]

# Plain assignment (rather than += on lists created in another cell) makes the
# cell safe to re-run, and one vectorised inverse_transform replaces one call
# per user.
ids = np.repeat(test_id.to_numpy(), top5.shape[1]).tolist()  # each id repeated once per prediction
countries = le.inverse_transform(top5.ravel()).tolist()

# Submission

In [None]:
# Build the submission table (one row per user/country pair) and write it out.
submission = pd.DataFrame({'id': ids, 'country': countries})
submission.to_csv('submission.csv', index=False)

In [None]:
submission