In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing important libraries

In [None]:
# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()

# Data types manipulation
from datetime import datetime

# Machine learning modeling
from sklearn.preprocessing import LabelEncoder
from xgboost.sklearn import XGBClassifier

import warnings
warnings.filterwarnings("ignore")

## Importing Data

In [None]:
# Load the competition tables from the zipped CSV files in the Kaggle input directory.
# train_data and test_data are the user tables used throughout the notebook;
# sample_submission is loaded here but not referenced again in the cells shown.
train_data= pd.read_csv('/kaggle/input/airbnb-recruiting-new-user-bookings/train_users_2.csv.zip')
test_data= pd.read_csv('/kaggle/input/airbnb-recruiting-new-user-bookings/test_users.csv.zip')
sample_submission= pd.read_csv('/kaggle/input/airbnb-recruiting-new-user-bookings/sample_submission_NDF.csv.zip')

In [None]:
# Load the session log and rename its user key from 'user_id' to 'id',
# the key column name used by the user tables.
sessions = pd.read_csv('/kaggle/input/airbnb-recruiting-new-user-bookings/sessions.csv.zip')
sessions.rename(columns = {'user_id': 'id'}, inplace=True)
# Preview the first rows
sessions.head()

## Data Exploration

In [None]:
train_data.head()

# numerical and categorical cols

In [None]:
# Split the column names by dtype: numeric columns vs. everything else.
# The 'id' key is excluded from both lists.
numeric_names = set(train_data.select_dtypes(include=[np.number]).columns)
num_cols = list(numeric_names - {'id'})
cat_cols = list(set(train_data.columns) - set(num_cols) - {'id'})

In [None]:
# Report (rows, columns) for both user tables.
for split_name, frame in (("training", train_data), ("testing", test_data)):
    print("The shape of {} data is: ".format(split_name), frame.shape)

In [None]:
train_data.info()

# Some notes on the column datatypes
## Columns like:
## 1. date_account_created,
## 2. timestamp_first_active,

## =========> hold dates but are not stored in a date format
# The solution
## We will convert them to the datetime type

In [None]:
# Converting the timestamps
# The same conversion is applied to the training and the test table:
# the two date columns are parsed with pandas' default inference, while
# timestamp_first_active is parsed with an explicit YYYYMMDDHHMMSS format.
for users in (train_data, test_data):
    for date_col in ('date_account_created', 'date_first_booking'):
        users[date_col] = pd.to_datetime(users[date_col])
    users['timestamp_first_active'] = pd.to_datetime(users['timestamp_first_active'], format='%Y%m%d%H%M%S')

## Investigating the Null Values

In [None]:
train_data.isnull().sum()/len(train_data)

In [None]:
# Null value analysis: a visual overview of where values are missing, followed by the
# percentage of missing values for the three columns reported below.
plt.figure(figsize = (20,8))
sns.heatmap(train_data.isnull(),yticklabels=False,cbar=False,cmap='viridis')
plt.title('Missing values per column (highlighted cells are null)')
plt.show()

# Mean of the boolean null mask = fraction of nulls; computed once for every column.
null_pct = train_data.isnull().mean() * 100

print('Date first booking null value percentage : ', null_pct['date_first_booking'],'%\n')
print('Age null value percentage : ', null_pct['age'],'%\n')
print('first_affiliate_tracked null value percentage : ', null_pct['first_affiliate_tracked'],'%\n')

# These columns have a high percentage of nulls:
## date_first_booking         0.583473  =====> this column will be dropped at the modelling stage
## age                        0.412226

# Exploring numerical Cols
## Start with the age column

In [None]:
# Range of the raw age column, before any cleaning.
age_column = train_data['age']
print("the minimum age is", age_column.min())
print("the maximum age is", age_column.max())

## These values cannot be true
## They were entered incorrectly by the users


In [None]:
(train_data.age == 2014).value_counts()

In [None]:
(train_data.age > 100).value_counts()

## There are more than 2,000 customers recorded as older than 100 years

In [None]:
(train_data.age > 120).value_counts()

In [None]:
# Summary statistics of the age column, then its distribution (nulls removed for the plot).
age_values = train_data['age']
print(age_values.describe())

fig, ax = plt.subplots(figsize=(12,6))
sns.distplot(age_values.dropna(), ax=ax)
ax.set_title('Age Distrubtion')
plt.show()

In [None]:
# Treat implausible ages as missing, then plot the cleaned age data.
# NOTE: data_a is the same object as train_data (no copy is made), so the masking below
# also updates train_data; the later median imputation of age works on the masked values.
AGE_MIN, AGE_MAX = 15, 105
data_a = train_data
implausible_age = (data_a.age > AGE_MAX) | (data_a.age < AGE_MIN)
data_a.loc[implausible_age, 'age'] = np.nan

# Only rows with a known age can be plotted. A bare dropna() would also discard every row
# that has a null in any other column (e.g. a missing date_first_booking), hiding those
# users from the per-destination plots.
age_known = data_a.dropna(subset=['age'])

plt.figure(figsize=(12,6))
sns.distplot(age_known.age)
plt.title('Age Distrubtion excluding nan and outliers (keeping only ages between {} to {})'.format(AGE_MIN, AGE_MAX))
plt.show()

plt.figure(figsize=(12,6))
sns.boxplot(x='country_destination',y='age', data=age_known)
plt.xlabel('destination countries')
plt.title('Age vs. destination countries')
plt.show()

plt.figure(figsize=(12,6))
sns.violinplot(x='country_destination',y='age', data=age_known)
plt.xlabel('destination countries')
plt.title('Age vs. destination countries')
plt.show();


In [None]:
data_a.age.describe()

In [None]:
# Horizontal box plot of the cleaned ages.
# The column is passed as `x=` explicitly: a positional first argument is interpreted
# differently across seaborn versions, which can flip the plot orientation.
plt.figure(figsize = (20,6))
sns.boxplot(x=data_a['age']);

## Plotting Age with each destination

In [None]:
# One age histogram per destination (ages above 100 are left out), drawn on a 6x2 grid.
fig, axes = plt.subplots(nrows=6, ncols=2, figsize=(30, 50))
age_in_range = train_data['age'] <= 100

# axes.flat walks the grid row by row: left to right, then top to bottom.
for ax, country in zip(axes.flat, train_data['country_destination'].unique()):
    ages = train_data[age_in_range & (train_data['country_destination'] == country)]['age']
    ax.hist(ages, bins=80)
    ax.set_title('age_distribution_in_{}'.format(country))
plt.xticks()
plt.show()

## This was an attempt to gain insight into how age relates to the destination
   ### Example: the average age of visitors to Italy is about 37 years

## NL has some visitors in their 90s, which might be for a specific purpose such as a medical trip

## We can see that all the distributions are right-skewed, with a mean around 40


# Finding Nulls and Handle them

In [None]:
data_a.age.isnull().sum()/len(data_a)

## Filling null values with different techniques

In [None]:
# train_data['age']= train_data['age'].fillna(train_data['age'].mean())
# test_data['age']= test_data['age'].fillna(train_data['age'].mean())

In [None]:
# Impute missing ages in both tables with the median age of the training table.
train_age_median = train_data['age'].median()
train_data['age'] = train_data['age'].fillna(train_age_median)
test_data['age'] = test_data['age'].fillna(train_age_median)

## I chose to fill with the median because it is robust to outliers

# Exploring the categorical values

## From a high Level

In [None]:
#add colores for plot bars
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
          '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'] 

In [None]:
#Unique value analysis
# Prints the number of distinct non-null values of each categorical column.
# A comparison such as `series != np.nan` is True for every row (NaN never compares equal),
# so it cannot filter out nulls; nunique() skips missing values by default.
columns_to_count = [
    ('gender', 'gender'),                                    # Contain unknown value
    ('signup_method', 'signup_method'),
    ('language', 'language'),
    ('affiliate_channel', 'affiliate_channel'),
    ('affiliate_provider', 'affiliate_provider'),
    ('first_affiliate_tracked', 'first_affiliate_tracked'),  # Contain NAN (not counted)
    ('signup_app', 'signup_app'),
    ('first_device_type', 'first_device_type'),
    ('first_browser', 'first_browser'),                      # -unknown-
    ('signup_flow', 'signup_flow'),
    ('country_destination (Target Variable)', 'country_destination'),
]

for label, column in columns_to_count:
    print('{} : '.format(label), train_data[column].nunique(), '\n')


# Sign-up methods used by users

In [None]:
train_data['signup_method'].value_counts()

In [None]:
# Bar chart of the number of users per sign-up method (nulls, if any, get their own bar).
plt.figure(figsize = (15,8))
signup_method_counts = train_data['signup_method'].value_counts(dropna=False)
signup_method_counts.plot.bar(color=colors);


1) Majority of users either signup from basic or facebook.

2) Basic's share is more than double of facebook.

3) Google's share is negligible.

# Users Devices

In [None]:
train_data['first_device_type'].value_counts()

In [None]:
# Bar chart of the number of users per first device type (nulls, if any, get their own bar).
plt.figure(figsize = (15,8))
device_type_counts = train_data['first_device_type'].value_counts(dropna=False)
device_type_counts.plot.bar(color=colors)

## Checking the output

In [None]:
# destination country share
# Number of training users per destination class.
plt.figure(figsize=(20,10))
sns.set(style="darkgrid")

ax = sns.countplot(data=train_data, x="country_destination")
ax.set_title("country_destination")

plt.show()

1) The Dataset is highly Imbalanced.

2) The majority of users either didn't make any booking or travelled only to the US.

## We can see that the output is imbalanced 
### So we will try to : Oversample using SMOTE or choosing to perform class weights

## Imbalanced datasets need special treatment in handling, and care when choosing the performance metrics

# Exploring the Signup flow column

In [None]:
# signup_flow share
# Number of training users per signup_flow value.
plt.figure(figsize=(20,10))
sns.set(style="darkgrid")

ax = sns.countplot(data=train_data, x="signup_flow")
ax.set_title("signup_flow")

plt.show()

1) Majority of users' signup flow is 0.

2) Other than 1,2,3,12,23,24 and 25, all others have negligible share.

# Exploring Categorical

## From low level with respect to the outliers

In [None]:
cat_list= ['gender', 'signup_method', 'signup_flow',
       'language', 'affiliate_channel', 'affiliate_provider',
       'first_affiliate_tracked', 'signup_app', 'first_device_type',
       'first_browser']

In [None]:
# creating a list of dataframes with aggregations between the country destinations and another categorical columns
# Layout of the result: ls_of_column_counts_df_lists[k][d] is the count table of the k-th
# categorical column restricted to the d-th destination (destinations in order of first appearance).
destinations = train_data['country_destination'].unique()
ls_of_column_counts_df_lists = []

for column in cat_list[0:10]:
    # Users per (destination, category value); the counted 'id' column is renamed '<column>_count'.
    counts = (
        train_data.groupby(['country_destination', column])['id']
        .count()
        .rename('{}_count'.format(column))
        .reset_index()
    )
    ls_of_column_counts_df_lists.append(
        [counts[counts['country_destination'] == country] for country in destinations]
    )

In [None]:
ls_of_column_counts_df_lists[1][0]

In [None]:
ls_of_column_counts_df_lists[2][0]

# Visualizing the Bivariate 

In [None]:
# Bar charts of the first categorical column's counts (index 0 of the count-table list),
# one panel per destination, on a 3x4 grid.
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(20, 12))

# axes.flat walks the grid row by row, so each destination table gets the next panel.
for ax, counts_df in zip(axes.flat, ls_of_column_counts_df_lists[0]):
    # A destination with no rows for this column keeps an empty panel.
    if len(counts_df) == 0:
        continue
    feature_name = counts_df.columns[1]
    destination = counts_df.iloc[0, 0]
    ax.bar(counts_df.iloc[:, 1], counts_df.iloc[:, 2], color=colors)
    ax.set_title('{}_count_in_'.format(feature_name) + destination)
    # The x-label previously came from an empty format string and was always blank.
    ax.set_xlabel('{}'.format(feature_name))
    ax.set_ylabel('count_in_{}'.format(destination))
print(counts_df.columns[1])
plt.show()

## I tried to extract as much information as I could, but that was hard from these plots, so I decided to combine all aspects in one graph

In [None]:
# gender share
# Users per gender value, split by destination.
plt.figure(figsize=(20,10))
sns.set(style="darkgrid")

ax = sns.countplot(data=train_data, x="gender", hue="country_destination")
ax.set_title("Gender vs Destination")

plt.show()

In [None]:
# signup_method share
# Users per sign-up method, split by destination.
plt.figure(figsize=(20,10))
sns.set(style="darkgrid")

ax = sns.countplot(data=train_data, x="signup_method", hue="country_destination")
ax.set_title("signup_method vs Destination")

plt.show()

In [None]:
# signup_app share
# Users per sign-up app, split by destination.
plt.figure(figsize=(20,10))
sns.set(style="darkgrid")

ax = sns.countplot(data=train_data, x="signup_app", hue="country_destination")
ax.set_title("signup_app vs Destination")

plt.show()

# Generating new Features

## 1. Create a new feature: the time between the account creation date and the first booking date

In [None]:
train_data.head()

In [None]:
train_data.info()

In [None]:
# Time elapsed between account creation and the first booking, as a timedelta column.
booking_delay = train_data['date_first_booking'] - train_data['date_account_created']
train_data['time_btwn_created_booking'] = booking_delay
train_data.head(10)

In [None]:
train_data['time_btwn_created_booking'].describe()

In [None]:
train_data['time_btwn_created_booking'][10].days

## Generating features from the datetime 
### The essential time-series features, such as quarter, year and month ===> to gain insight into when the platform receives the most new accounts and bookings
### This might be driven by a marketing campaign that preceded the increase, or by consumer behaviour: customers may prefer specific times of the year, quarter or month to travel to new countries and use the app.

In [None]:
# Calendar features (month, quarter, year, weekday) derived from the first-booking date and
# the account-creation date, added to both tables under identical column names.
# NOTE(review): the *_first_book features come from date_first_booking, which the null
# analysis earlier in the notebook reports as ~58% missing in training — confirm this column
# is known at prediction time, otherwise these features may leak the target.
calendar_parts = (
    ('month', 'month'),
    ('quarter', 'quarter'),
    ('year', 'year'),
    ('DayOfWeek', 'weekday'),
)
date_sources = (
    ('first_book', 'date_first_booking'),
    ('account_created', 'date_account_created'),
)

for users in (train_data, test_data):
    for suffix, source_col in date_sources:
        source_index = pd.DatetimeIndex(users[source_col])
        for prefix, attribute in calendar_parts:
            users['{}_{}'.format(prefix, suffix)] = getattr(source_index, attribute)



In [None]:
train_data.shape

In [None]:
train_data.isnull().sum()

## 1. Visualizing Years

In [None]:
# Histogram of the year of the first booking.
fig, ax = plt.subplots(figsize=(15, 8))

ax.hist(train_data['year_first_book'], bins=5, color='skyblue')

ax.set_title("The frequancy of Years ");

## The year with the most bookings is 2014 ===> that might be because our data only runs until 2014

## 1. Visualizing Monthes

In [None]:
# Histogram of the month of the first booking.
fig, ax = plt.subplots(figsize=(15, 8))

ax.hist(train_data['month_first_book'], bins=12, color='skyblue')

ax.set_title("The frequancy of Months ");

## The best months are the mid-year months, like 4, 5 and 6

In [None]:
# Histogram of the quarter of the first booking.
fig, ax = plt.subplots(figsize=(10, 8))

ax.hist(train_data['quarter_first_book'], bins=4, color='skyblue')

ax.set_title("The frequancy of Quarters ");

## The monthly plot already showed that customers prefer to book trips in the second quarter (months 4, 5 and 6)

In [None]:
# Histogram of the weekday of the first booking.
fig, ax = plt.subplots(figsize=(10, 8))

ax.hist(train_data['DayOfWeek_first_book'], bins=7, color='skyblue')

ax.set_title("The frequancy of Days ");

## Filling the missing values in the generated columns
### I considered the median and the mode
### The median did not work well because it would change the datatype of the columns, so I chose the mode

In [None]:
# filling the nan values by the mode:
# Each first-booking feature is imputed, in both tables, with its most frequent value
# in the training table.
for feature in ('month_first_book', 'quarter_first_book', 'DayOfWeek_first_book', 'year_first_book'):
    train_mode = train_data[feature].mode()[0]
    train_data[feature] = train_data[feature].fillna(train_mode)
    test_data[feature] = test_data[feature].fillna(train_mode)


## Dropping datetime columns 

In [None]:
# Remove the raw date columns now that the calendar features have been derived from them.
# time_btwn_created_booking was only added to the training table, so it is dropped there only.
raw_date_columns = ['date_account_created', 'timestamp_first_active', 'date_first_booking']

train_data.drop(columns=raw_date_columns + ['time_btwn_created_booking'], inplace=True)

test_data.drop(columns=raw_date_columns, inplace=True)

# Here is the end of the EDA 

  # Let's start the Pre-processing

# Merging the train and test data

In [None]:
df = pd.concat((train_data, test_data), axis=0, ignore_index=True)

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.drop(columns=['year_first_book', 'year_account_created'], inplace= True)


In [None]:
# Recompute the numeric / non-numeric column lists on the merged frame ('id' excluded from both).
numeric_names = set(df.select_dtypes(include=[np.number]).columns)
num_cols = list(numeric_names - {'id'})
cat_cols = list(set(df.columns) - set(num_cols) - {'id'})


print("numerical cols are : ", num_cols)
print("categorical cols are : ",cat_cols)

In [None]:
# Keep only rows without any missing value.
# NOTE(review): rows that came from test_data presumably have no country_destination after
# the concat, in which case this also removes every test row — confirm that is intended.
df = df.dropna(axis=0, how='any')

In [None]:
df.isnull().sum()

In [None]:
# # Dividing data into X and y
# X = df.drop(['country_destination'], axis = 1)
# y = df['country_destination']

In [None]:
# print(X.shape)
# print(y.shape)

# Encoding categorical

In [None]:
# One-hot encode the categorical features; each listed column is replaced by one
# indicator column per distinct value.
categorical_features = [
    'gender',
    'signup_method',
    'signup_flow',
    'language',
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'signup_app',
    'first_device_type',
    'first_browser',
]
df = pd.get_dummies(data=df, columns=categorical_features)


In [None]:
df.shape

In [None]:
df.set_index('id', inplace=True)

In [None]:
# Dividing data into X and y
y = df['country_destination']
X = df.drop(columns=['country_destination'])

# Map each destination label to an integer class id; label_encoder keeps the mapping
# so predictions can be translated back to labels.
label_encoder = LabelEncoder()
encoded_y = label_encoder.fit(y).transform(y)
encoded_y

# The numericals
## Age col needs to be standardized

# Standardizing Age

In [None]:
#Standardizing the age column:
from sklearn.preprocessing import StandardScaler

# Rescale age to zero mean and unit variance; the result is kept as a one-column frame.
# NOTE(review): the scaler is fitted on all rows before the train/test split further down —
# confirm this is acceptable for the evaluation.
age_scaler = StandardScaler()
scaled_age = age_scaler.fit_transform(X[['age']])
st_cols = pd.DataFrame(scaled_age, columns=['age'])


In [None]:
X['age'] = st_cols.values

In [None]:
X['age']

# Exploring sessions dataset

## Sessions dataset is a table saving the action , device and the time spent on the platform 
### This data is important to join with the full dataset, to feed the model with much more related information

In [None]:
print(sessions.shape)
sessions.head()

# Oversampling using smote 

## SMOTE is not recommended for high-dimensional data, so the oversampling cell below is left commented out

In [None]:
# from imblearn.over_sampling import SMOTE 
# sm = SMOTE(random_state = 2)

# X1, y1 = sm.fit_resample(X, y)

# Features selection with statistical tests 
## Chi square

In [None]:
# from sklearn.feature_selection import chi2
# from sklearn.feature_selection import SelectKBest

# test = SelectKBest(score_func=chi2, k=20)
# fit = test.fit(X, y)
# fit.scores_


# Training models

# Our problem is multiclass classification
## I was thinking about using {RandomForest classifier ==> for its interpretability, XGBoost ==> for its multi-threading, and tree-based models}

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation.
# The split is stratified on the target because the classes are heavily imbalanced: without
# it a rare destination can be missing from one side of the split, and predict_proba would
# then return fewer columns than the one-hot encoded test labels used for scoring.
X_train, X_test, y_train, y_test = train_test_split(
    X, encoded_y, test_size=0.25, random_state=42, stratify=encoded_y
)

In [None]:
# Modeling
# Baseline model: an XGBoost classifier with library-default hyper-parameters,
# fitted on the training split. The target passed in is already integer-encoded.
from xgboost import XGBClassifier, plot_importance
xgb = XGBClassifier(use_label_encoder=False)                  
# Last expression of the cell: fits the model and displays its repr.
xgb.fit(X_train, y_train)

In [None]:
# Second model: XGBoost with an explicit hyper-parameter set.
# The settings must be unpacked as keyword arguments (**param); passing the dict as a single
# positional argument does not apply them as hyper-parameters.
# The target has more than two classes, so the objective is multi:softprob (one probability
# per class) with multiclass log-loss as the evaluation metric.
param = {
    'booster': 'gbtree',
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'learning_rate': 0.3,        # called "eta" in the native XGBoost API
    'gamma': 0,
    'max_depth': 6,
    'min_child_weight': 1,
    'max_delta_step': 0,
    'subsample': 1,
    'colsample_bytree': 1,
    'verbosity': 0,              # replaces the legacy "silent" flag
    'random_state': 0,           # called "seed" in the native XGBoost API
    'base_score': 0.5,
    'use_label_encoder': False,  # labels are already integer-encoded, as for the baseline model
}

clf = XGBClassifier(**param)
clf.fit(X_train, y_train)

In [None]:
y_pred2 = clf.predict_proba(X_test)

In [None]:
y_pred = xgb.predict_proba(X_test)

In [None]:
y_test[:10]

In [None]:
y_pred2[:10]

In [None]:
# Plot the 10 most important features of clf (the classifier built from `param`).
plot_importance(clf, max_num_features=10)
# Keep the importance array; the xgb importance cell assigns to this same variable name.
f_importances= clf.feature_importances_

In [None]:
# Plot the 10 most important features of xgb (the default-parameter classifier).
plot_importance(xgb, max_num_features=10)
# Overwrites f_importances with the importance array of xgb.
f_importances= xgb.feature_importances_

# Testing model

In [None]:
from sklearn.preprocessing import LabelBinarizer

# One-hot encode the held-out labels so they can be compared with the per-class
# probabilities returned by predict_proba.
# The class ids are taken from the fitted label encoder (0 .. n_classes-1) instead of a
# hard-coded count, so the encoding follows the classes actually present in the target.
lb = LabelBinarizer()
lb.fit(np.arange(len(label_encoder.classes_)))

y_test_enc = lb.transform(y_test)


In [None]:
from sklearn import metrics

# NDCG@5 of clf's predicted class probabilities against the one-hot encoded test labels
# (sample weights and tie handling are left at the scikit-learn defaults).
model_1_score = metrics.ndcg_score(y_test_enc, y_pred2, k=5)
model_1_score