In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
%matplotlib inline
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for name in names_in_folder:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the three competition tables: labelled users, unlabelled users, and the
# sample submission file.
train_data = pd.read_csv(
    '/kaggle/input/airbnb-recruiting-new-user-bookings/train_users_2.csv.zip'
)
test_x = pd.read_csv(
    '/kaggle/input/airbnb-recruiting-new-user-bookings/test_users.csv.zip'
)
sample_submission = pd.read_csv(
    '/kaggle/input/airbnb-recruiting-new-user-bookings/sample_submission_NDF.csv.zip'
)

Inspecting the train_data:

In [None]:
# Display the column labels of the training table.
train_data.columns

1- change the timestamp to date

2- inspect the NAN values in date_first_booking and see whether they always correspond to NDF country destination.

3- inspect the NAN values in all columns and see their percentage

4- inspect the unique values of the categorical features in the data

In [None]:
# Parse the account-creation date and the first-activity timestamp into
# datetimes, identically for the train and test tables. The first-activity
# value is parsed with the explicit pattern %Y%m%d%H%M%S.
for frame in (train_data, test_x):
    frame['date_account_created'] = pd.to_datetime(frame['date_account_created'])
    frame['timestamp_first_active'] = pd.to_datetime(
        frame['timestamp_first_active'], format='%Y%m%d%H%M%S'
    )

In [None]:
# Report every train_data column that contains at least one null value.
train_null_counts = train_data.isnull().sum()
for column_name, missing in train_null_counts[train_null_counts != 0].items():
    # A blank line follows each message, as a separator.
    print(column_name + " has {} null values.".format(missing), end="\n\n")

In [None]:
# Report every test_x column that contains at least one null value.
test_null_counts = test_x.isnull().sum()
for column_name, missing in test_null_counts[test_null_counts != 0].items():
    # A blank line follows each message, as a separator.
    print(column_name + " has {} null values.".format(missing), end="\n\n")

### Null values make up 41% of `age` and 58% of `date_first_booking`. We can either drop these two columns, or keep them as they are and use a tree-based algorithm that can treat the null values as a group of their own and make use of them

## Feature Engineering

#### Get the month, quarter, and day of week from the date columns date_account_created and date_first_booking (timestamp_first_active is parsed above, but no calendar features are derived from it here)

In [None]:
# Derive month, quarter and day-of-week columns from the first-booking date and
# the account-creation date, for both tables. Columns are created in the order
# month_*, quarter_*, DayOfWeek_*, first for *_first_book and then for
# *_account_created.
date_sources = (
    ('date_first_booking', 'first_book'),
    ('date_account_created', 'account_created'),
)
for frame in (train_data, test_x):
    for source_column, suffix in date_sources:
        stamps = pd.DatetimeIndex(frame[source_column])
        frame['month_' + suffix] = stamps.month
        frame['quarter_' + suffix] = stamps.quarter
        frame['DayOfWeek_' + suffix] = stamps.weekday

In [None]:
def weekend(weekday):
    """Return True when `weekday` equals 5 or 6, and False for anything else.

    Values that compare equal to neither number (including NaN) give False.
    """
    return weekday in (5, 6)

In [None]:
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
cal = calendar()
# Dates of the US federal holidays, used for the membership tests below.
holidays = cal.holidays()

# Flag, for both tables, whether the first booking / the account creation fell
# on a US federal holiday or on a weekend.
for frame in (train_data, test_x):
    # date_first_booking is not converted to datetime anywhere earlier in this
    # notebook (only date_account_created and timestamp_first_active are).
    # Comparing the raw values against the Timestamp holidays never matches and
    # yields an all-False column, so the dates are parsed here first.
    first_booking_dates = pd.to_datetime(frame['date_first_booking'])
    frame['Holiday_first_book'] = first_booking_dates.isin(holidays)
    frame['Holiday_account_created'] = frame['date_account_created'].isin(holidays)

    # True for Saturdays and Sundays (weekday 5 or 6), False otherwise; a
    # missing weekday (no first booking) also maps to False.
    frame['Weekend_first_book'] = frame['DayOfWeek_first_book'].map(weekend)
    frame['Weekend_account_created'] = frame['DayOfWeek_account_created'].map(weekend)

In [None]:
# Number of training rows whose first booking is flagged as a US federal holiday.
train_data['Holiday_first_book'].sum()
#since there is no variance over this column,we can drop it
# NOTE(review): a sum of 0 can be an artifact rather than a property of the
# data: if date_first_booking holds unparsed date strings when it is tested
# against the Timestamp holidays, no row can ever match. Confirm the dtype
# before relying on this conclusion.

In [None]:
# Remove the first-booking holiday flag from both tables (in place).
for frame in (train_data, test_x):
    frame.drop(columns=['Holiday_first_book'], inplace=True)

### Plotting the distribution of age with each destination country

In [None]:
# One age histogram per destination country on a 6x2 grid; rows with an age
# above 100 (or a missing age) are left out of every histogram.
fig, axes = plt.subplots(nrows=6, ncols=2, figsize=(30, 50))
age_at_most_100 = train_data['age'] <= 100
for position, country in enumerate(train_data['country_destination'].unique()):
    grid_row, grid_col = divmod(position, 2)
    booked_here = train_data['country_destination'] == country
    panel = axes[grid_row, grid_col]
    panel.hist(train_data[age_at_most_100 & booked_here]['age'], bins=80)
    panel.set_title('age_distribution_in_{}'.format(country))
plt.xticks()
plt.show()

### The distribution of age does not vary significantly from one destination country to another. However, we will not decide on the significance of this column now; we will first try an ML model that determines the significance of each column, and then perform feature selection based on those results

### Here, we will try to inspect the categorical columns and see the unique values in each column

In [None]:
# Categorical columns to inspect; the target (country_destination) comes last.
cat_list = [
    'gender',
    'signup_method',
    'signup_flow',
    'language',
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'signup_app',
    'first_device_type',
    'first_browser',
    'month_first_book',
    'quarter_first_book',
    'DayOfWeek_first_book',
    'month_account_created',
    'quarter_account_created',
    'DayOfWeek_account_created',
    'Holiday_account_created',
    'Weekend_first_book',
    'Weekend_account_created',
    'country_destination',
]
# Print the distinct values of each column together with how many there are.
for column_name in cat_list:
    distinct_values = train_data[column_name].unique()
    print(column_name + ' has unique values {}'.format(distinct_values)
          + ' which are {} categories'.format(len(distinct_values)))
    print()

In [None]:
# For each of the first 19 entries of cat_list, count the ids per
# (country_destination, category) pair and split the result into one frame per
# destination country. ls_of_column_counts_df_lists[k] is the list of
# per-country frames for cat_list[k]; each frame has the columns
# country_destination, <feature>, <feature>_count.
destinations = train_data['country_destination'].unique()
ls_of_column_counts_df_lists = []
for feature in cat_list[0:19]:
    counts = (
        train_data[['country_destination', feature, 'id']]
        .groupby(['country_destination', feature])
        .count()
        .rename(columns={'id': '{}_count'.format(feature)})
        .reset_index()
    )
    ls_of_column_counts_df_lists.append(
        [counts[counts['country_destination'] == destination] for destination in destinations]
    )

In [None]:
# Per-destination bar charts of category counts for cat_list[0] ('gender' in
# this notebook's cat_list): one panel per destination country on a 3x4 grid.
country_frames = ls_of_column_counts_df_lists[0]
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(20, 12))
for position, counts in enumerate(country_frames):
    if counts.empty:
        # No rows for this destination: its panel stays blank but keeps its slot.
        continue
    feature = counts.columns[1]
    country = counts.iloc[0, 0]
    panel = axes.flat[position]  # row-major position in the grid
    panel.bar(counts.iloc[:, 1], counts.iloc[:, 2], color='maroon')
    panel.set_title('{}_count_in_'.format(feature) + country)
    # Label the x axis with the feature name (''.format(...) always gave '').
    panel.set_xlabel(feature)
    panel.set_ylabel('count_in_{}'.format(country))
print(country_frames[-1].columns[1])
plt.show()

### From the above plots, we can say that gender might be a factor that affects the destination country

In [None]:
# Per-destination bar charts of category counts for cat_list[1]
# ('signup_method' in this notebook's cat_list): one panel per destination
# country on a 3x4 grid.
country_frames = ls_of_column_counts_df_lists[1]
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(20, 12))
for position, counts in enumerate(country_frames):
    if counts.empty:
        # No rows for this destination: its panel stays blank but keeps its slot.
        continue
    feature = counts.columns[1]
    country = counts.iloc[0, 0]
    panel = axes.flat[position]  # row-major position in the grid
    panel.bar(counts.iloc[:, 1], counts.iloc[:, 2], color='maroon')
    panel.set_title('{}_count_in_'.format(feature) + country)
    # Label the x axis with the feature name (''.format(...) always gave '').
    panel.set_xlabel(feature)
    panel.set_ylabel('count_in_{}'.format(country))
print(country_frames[-1].columns[1])
plt.show()

### From the above plots we can see that the signup method does not vary significantly from one destination to another. However, we will not drop it; we will wait until the ML model gives us its significance and then decide

In [None]:
# Per-destination bar charts of category counts for cat_list[2]
# ('signup_flow' in this notebook's cat_list): one panel per destination
# country on a 4x3 grid.
country_frames = ls_of_column_counts_df_lists[2]
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(20, 20))
for position, counts in enumerate(country_frames):
    if counts.empty:
        # No rows for this destination: its panel stays blank but keeps its slot.
        continue
    feature = counts.columns[1]
    country = counts.iloc[0, 0]
    panel = axes.flat[position]  # row-major position in the grid
    panel.bar(counts.iloc[:, 1], counts.iloc[:, 2], color='maroon')
    panel.set_title('{}_count_in_'.format(feature) + country)
    # Label the x axis with the feature name (''.format(...) always gave '').
    panel.set_xlabel(feature)
    panel.set_ylabel('count_in_{}'.format(country))
print(country_frames[-1].columns[1])
plt.show()

### The distribution of the Signup_flow graphs does not vary from one destination to the other, however, we will not drop it, we will wait until the ML Model gives us its significance

In [None]:
# Per-destination bar charts of category counts for cat_list[3] ('language' in
# this notebook's cat_list): one panel per destination country on a 6x2 grid.
country_frames = ls_of_column_counts_df_lists[3]
fig, axes = plt.subplots(nrows=6, ncols=2, figsize=(30, 50))
for position, counts in enumerate(country_frames):
    if counts.empty:
        # Guard added: an empty frame has no first row to read the country from.
        continue
    feature = counts.columns[1]
    country = counts.iloc[0, 0]
    panel = axes.flat[position]  # row-major position in the grid
    panel.bar(counts.iloc[:, 1], counts.iloc[:, 2], color='maroon')
    panel.set_title('{}_count_in_'.format(feature) + country)
    # Label the x axis with the feature name (''.format(...) always gave '').
    panel.set_xlabel(feature)
    panel.set_ylabel('count_in_{}'.format(country))
print(country_frames[-1].columns[1])
plt.show()

### The distribution of the language graphs does not vary from one destination to the other, however, we will not drop it, we will wait until the ML Model gives us its significance

In [None]:
# Per-destination bar charts of category counts for cat_list[4]
# ('affiliate_channel' in this notebook's cat_list): one panel per destination
# country on a 6x2 grid.
country_frames = ls_of_column_counts_df_lists[4]
fig, axes = plt.subplots(nrows=6, ncols=2, figsize=(30, 50))
for position, counts in enumerate(country_frames):
    if counts.empty:
        # Guard added: an empty frame has no first row to read the country from.
        continue
    feature = counts.columns[1]
    country = counts.iloc[0, 0]
    panel = axes.flat[position]  # row-major position in the grid
    panel.bar(counts.iloc[:, 1], counts.iloc[:, 2], color='maroon')
    panel.set_title('{}_count_in_'.format(feature) + country)
    # Label the x axis with the feature name (''.format(...) always gave '').
    panel.set_xlabel(feature)
    panel.set_ylabel('count_in_{}'.format(country))
print(country_frames[-1].columns[1])
plt.show()

### The distribution of the Affiliate_Channel graphs does not vary from one destination to the other, however, we will not drop it, we will wait until the ML Model gives us its significance

In [None]:
# Per-destination bar charts of category counts for cat_list[5]
# ('affiliate_provider' in this notebook's cat_list): one full-width panel per
# destination country, stacked in a single column of 12 rows.
country_frames = ls_of_column_counts_df_lists[5]
fig, axes = plt.subplots(nrows=12, ncols=1, figsize=(30, 90))
for position, counts in enumerate(country_frames):
    if counts.empty:
        # Guard added: an empty frame has no first row to read the country from.
        continue
    feature = counts.columns[1]
    country = counts.iloc[0, 0]
    panel = axes.flat[position]
    panel.bar(counts.iloc[:, 1], counts.iloc[:, 2], color='maroon')
    panel.set_title('{}_count_in_'.format(feature) + country)
    # Label the x axis with the feature name (''.format(...) always gave '').
    panel.set_xlabel(feature)
    panel.set_ylabel('count_in_{}'.format(country))
print(country_frames[-1].columns[1])
plt.show()

### The distribution of the Affiliate_Provider graphs does not vary from one destination to the other, however, we will not drop it, we will wait until the ML Model gives us its significance

In [None]:
# Per-destination bar charts of category counts for cat_list[6]
# ('first_affiliate_tracked' in this notebook's cat_list): one panel per
# destination country on a 4x3 grid.
country_frames = ls_of_column_counts_df_lists[6]
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(25, 20))
for position, counts in enumerate(country_frames):
    if counts.empty:
        # No rows for this destination: its panel stays blank but keeps its slot.
        continue
    feature = counts.columns[1]
    country = counts.iloc[0, 0]
    panel = axes.flat[position]  # row-major position in the grid
    panel.bar(counts.iloc[:, 1], counts.iloc[:, 2], color='maroon')
    panel.set_title('{}_count_in_'.format(feature) + country)
    # Label the x axis with the feature name (''.format(...) always gave '').
    panel.set_xlabel(feature)
    panel.set_ylabel('count_in_{}'.format(country))
print(country_frames[-1].columns[1])
plt.show()

### The distribution of the first_affiliate_tracked graphs does not vary from one destination to the other, however, we will not drop it, we will wait until the ML Model gives us its significance

In [None]:
# Per-destination bar charts of category counts for cat_list[7] ('signup_app'
# in this notebook's cat_list): one panel per destination country on a 4x3 grid.
country_frames = ls_of_column_counts_df_lists[7]
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(20, 20))
for position, counts in enumerate(country_frames):
    if counts.empty:
        # No rows for this destination: its panel stays blank but keeps its slot.
        continue
    feature = counts.columns[1]
    country = counts.iloc[0, 0]
    panel = axes.flat[position]  # row-major position in the grid
    panel.bar(counts.iloc[:, 1], counts.iloc[:, 2], color='maroon')
    panel.set_title('{}_count_in_'.format(feature) + country)
    # Label the x axis with the feature name (''.format(...) always gave '').
    panel.set_xlabel(feature)
    panel.set_ylabel('count_in_{}'.format(country))
print(country_frames[-1].columns[1])
plt.show()

### The distribution of the Signup_app graphs does not vary from one destination to the other, however, we will not drop it, we will wait until the ML Model gives us its significance

In [None]:
# Per-destination bar charts of category counts for cat_list[8]
# ('first_device_type' in this notebook's cat_list): one panel per destination
# country on a 6x2 grid.
country_frames = ls_of_column_counts_df_lists[8]
fig, axes = plt.subplots(nrows=6, ncols=2, figsize=(35, 50))
for position, counts in enumerate(country_frames):
    if counts.empty:
        # No rows for this destination: its panel stays blank but keeps its slot.
        continue
    feature = counts.columns[1]
    country = counts.iloc[0, 0]
    panel = axes.flat[position]  # row-major position in the grid
    panel.bar(counts.iloc[:, 1], counts.iloc[:, 2], color='maroon')
    panel.set_title('{}_count_in_'.format(feature) + country)
    # Label the x axis with the feature name (''.format(...) always gave '').
    panel.set_xlabel(feature)
    panel.set_ylabel('count_in_{}'.format(country))
print(country_frames[-1].columns[1])
plt.show()

### The distribution of the First_Device_Type graphs does not vary from one destination to the other, however, we will not drop it, we will wait until the ML Model gives us its significance

In [None]:
# Per-destination bar charts of category counts for cat_list[9]
# ('first_browser' in this notebook's cat_list): one full-width panel per
# destination country, stacked in a single column of 12 rows.
country_frames = ls_of_column_counts_df_lists[9]
fig, axes = plt.subplots(nrows=12, ncols=1, figsize=(60, 100))
for position, counts in enumerate(country_frames):
    if counts.empty:
        # Guard added: an empty frame has no first row to read the country from.
        continue
    feature = counts.columns[1]
    country = counts.iloc[0, 0]
    panel = axes.flat[position]
    panel.bar(counts.iloc[:, 1], counts.iloc[:, 2], color='maroon')
    panel.set_title('{}_count_in_'.format(feature) + country)
    # Label the x axis with the feature name (''.format(...) always gave '').
    panel.set_xlabel(feature)
    panel.set_ylabel('count_in_{}'.format(country))
print(country_frames[-1].columns[1])
plt.show()

### The distribution of the first_Browser graphs does not vary from one destination to the other, however, we will not drop it, we will wait until the ML Model gives us its significance

In [None]:
# Per-destination bar charts of category counts for cat_list[10]
# ('month_first_book' in this notebook's cat_list): one panel per destination
# country on a 4x3 grid.
country_frames = ls_of_column_counts_df_lists[10]
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(20, 20))
for position, counts in enumerate(country_frames):
    if counts.empty:
        # No rows for this destination: its panel stays blank but keeps its slot.
        continue
    feature = counts.columns[1]
    country = counts.iloc[0, 0]
    panel = axes.flat[position]  # row-major position in the grid
    panel.bar(counts.iloc[:, 1], counts.iloc[:, 2], color='maroon')
    panel.set_title('{}_count_in_'.format(feature) + country)
    # Label the x axis with the feature name (''.format(...) always gave '').
    panel.set_xlabel(feature)
    panel.set_ylabel('count_in_{}'.format(country))
print(country_frames[-1].columns[1])
plt.show()

### From the above graphs we can see that the Month_First_Book distribution does not vary significantly across most of the destinations, except for the AU destination, which has a k-shaped distribution

In [None]:
# Per-destination bar charts of category counts for cat_list[11]
# ('quarter_first_book' in this notebook's cat_list): one panel per
# destination country on a 3x4 grid.
country_frames = ls_of_column_counts_df_lists[11]
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(20, 12))
for position, counts in enumerate(country_frames):
    if counts.empty:
        # No rows for this destination: its panel stays blank but keeps its slot.
        continue
    feature = counts.columns[1]
    country = counts.iloc[0, 0]
    panel = axes.flat[position]  # row-major position in the grid
    panel.bar(counts.iloc[:, 1], counts.iloc[:, 2], color='maroon')
    panel.set_title('{}_count_in_'.format(feature) + country)
    # Label the x axis with the feature name (''.format(...) always gave '').
    panel.set_xlabel(feature)
    panel.set_ylabel('count_in_{}'.format(country))
print(country_frames[-1].columns[1])
plt.show()

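### The distribution of the Year_first_Book graphs does not vary from one destination to the other, however, we will not drop it, we will wait until the ML Model gives us its significance. (Note: no year feature is created in this notebook; the cell above plots `quarter_first_book`, so this remark should be re-checked against that figure.)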

In [None]:
# Per-destination bar charts of category counts for cat_list[12]
# ('DayOfWeek_first_book' in this notebook's cat_list): one panel per
# destination country on a 3x4 grid.
country_frames = ls_of_column_counts_df_lists[12]
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(20, 12))
for position, counts in enumerate(country_frames):
    if counts.empty:
        # No rows for this destination: its panel stays blank but keeps its slot.
        continue
    feature = counts.columns[1]
    country = counts.iloc[0, 0]
    panel = axes.flat[position]  # row-major position in the grid
    panel.bar(counts.iloc[:, 1], counts.iloc[:, 2], color='maroon')
    panel.set_title('{}_count_in_'.format(feature) + country)
    # Label the x axis with the feature name (''.format(...) always gave '').
    panel.set_xlabel(feature)
    panel.set_ylabel('count_in_{}'.format(country))
print(country_frames[-1].columns[1])
plt.show()

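### The Quarter_First_Book count does not vary from one destination to the other, except for the AU distribution, where the graph has a k-shape. (Note: the cell above plots `DayOfWeek_first_book`, so this remark should be re-checked against that figure.)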

In [None]:
# Per-destination bar charts of category counts for cat_list[13]
# ('month_account_created' in this notebook's cat_list): one panel per
# destination country on a 4x3 grid.
country_frames = ls_of_column_counts_df_lists[13]
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(20, 20))
for position, counts in enumerate(country_frames):
    if counts.empty:
        # Guard added: an empty frame has no first row to read the country from.
        continue
    feature = counts.columns[1]
    country = counts.iloc[0, 0]
    panel = axes.flat[position]  # row-major position in the grid
    panel.bar(counts.iloc[:, 1], counts.iloc[:, 2], color='maroon')
    panel.set_title('{}_count_in_'.format(feature) + country)
    # Label the x axis with the feature name (''.format(...) always gave '').
    panel.set_xlabel(feature)
    panel.set_ylabel('count_in_{}'.format(country))
print(country_frames[-1].columns[1])
plt.show()

### The Month_Account_Created count does not vary from one destination to the other, except for the AU distribution, where the graph has a k-shape

In [None]:
# Per-destination bar charts of category counts for cat_list[14]
# ('quarter_account_created' in this notebook's cat_list): one panel per
# destination country on a 3x4 grid.
country_frames = ls_of_column_counts_df_lists[14]
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(20, 12))
for position, counts in enumerate(country_frames):
    if counts.empty:
        # Guard added: an empty frame has no first row to read the country from.
        continue
    feature = counts.columns[1]
    country = counts.iloc[0, 0]
    panel = axes.flat[position]  # row-major position in the grid
    panel.bar(counts.iloc[:, 1], counts.iloc[:, 2], color='maroon')
    panel.set_title('{}_count_in_'.format(feature) + country)
    # Label the x axis with the feature name (''.format(...) always gave '').
    panel.set_xlabel(feature)
    panel.set_ylabel('count_in_{}'.format(country))
print(country_frames[-1].columns[1])
plt.show()

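### The Quarter_First_Book count does not vary from one destination to the other. (Note: the cell above plots `quarter_account_created`, so this remark should be re-checked against that figure.)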

In [None]:
# Per-destination bar charts of category counts for cat_list[15]
# ('DayOfWeek_account_created' in this notebook's cat_list): one panel per
# destination country on a 4x3 grid.
country_frames = ls_of_column_counts_df_lists[15]
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(20, 20))
for position, counts in enumerate(country_frames):
    if counts.empty:
        # Guard added: an empty frame has no first row to read the country from.
        continue
    feature = counts.columns[1]
    country = counts.iloc[0, 0]
    panel = axes.flat[position]  # row-major position in the grid
    panel.bar(counts.iloc[:, 1], counts.iloc[:, 2], color='maroon')
    panel.set_title('{}_count_in_'.format(feature) + country)
    # Label the x axis with the feature name (''.format(...) always gave '').
    panel.set_xlabel(feature)
    panel.set_ylabel('count_in_{}'.format(country))
print(country_frames[-1].columns[1])
plt.show()

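### The Quarter_Account_Created count does not vary from one destination to the other, except for the AU distribution, where the graph has a k-shape. (Note: the cell above plots `DayOfWeek_account_created`, so this remark should be re-checked against that figure.)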

In [None]:
# Per-destination bar charts of category counts for cat_list[16]
# ('Holiday_account_created' in this notebook's cat_list): one panel per
# destination country on a 3x4 grid.
country_frames = ls_of_column_counts_df_lists[16]
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(20, 10))
for position, counts in enumerate(country_frames):
    if counts.empty:
        # Guard added: an empty frame has no first row to read the country from.
        continue
    feature = counts.columns[1]
    country = counts.iloc[0, 0]
    panel = axes.flat[position]  # row-major position in the grid
    panel.bar(counts.iloc[:, 1], counts.iloc[:, 2], color='maroon')
    panel.set_title('{}_count_in_'.format(feature) + country)
    # Label the x axis with the feature name (''.format(...) always gave '').
    panel.set_xlabel(feature)
    panel.set_ylabel('count_in_{}'.format(country))
print(country_frames[-1].columns[1])
plt.show()

In [None]:
# Per-destination bar charts of category counts for cat_list[17]
# ('Weekend_first_book' in this notebook's cat_list): one panel per
# destination country on a 3x4 grid.
country_frames = ls_of_column_counts_df_lists[17]
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(20, 10))
for position, counts in enumerate(country_frames):
    if counts.empty:
        # Guard added: an empty frame has no first row to read the country from.
        continue
    feature = counts.columns[1]
    country = counts.iloc[0, 0]
    panel = axes.flat[position]  # row-major position in the grid
    panel.bar(counts.iloc[:, 1], counts.iloc[:, 2], color='maroon')
    panel.set_title('{}_count_in_'.format(feature) + country)
    # Label the x axis with the feature name (''.format(...) always gave '').
    panel.set_xlabel(feature)
    panel.set_ylabel('count_in_{}'.format(country))
print(country_frames[-1].columns[1])
plt.show()

In [None]:
# Per-destination bar charts of category counts for cat_list[18]
# ('Weekend_account_created' in this notebook's cat_list): one panel per
# destination country on a 3x4 grid.
country_frames = ls_of_column_counts_df_lists[18]
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(20, 10))
for position, counts in enumerate(country_frames):
    if counts.empty:
        # Guard added: an empty frame has no first row to read the country from.
        continue
    feature = counts.columns[1]
    country = counts.iloc[0, 0]
    panel = axes.flat[position]  # row-major position in the grid
    panel.bar(counts.iloc[:, 1], counts.iloc[:, 2], color='maroon')
    panel.set_title('{}_count_in_'.format(feature) + country)
    # Label the x axis with the feature name (''.format(...) always gave '').
    panel.set_xlabel(feature)
    panel.set_ylabel('count_in_{}'.format(country))
print(country_frames[-1].columns[1])
plt.show()

In [None]:
# Drop the raw date/timestamp columns from both tables (in place), then
# display the test table.
raw_date_columns = ['date_account_created', 'timestamp_first_active', 'date_first_booking']
for frame in (train_data, test_x):
    frame.drop(columns=raw_date_columns, inplace=True)
test_x

## We need to merge the categories that rarely occur in the high-cardinality columns (signup_flow, language, affiliate_provider, first_browser) into a single category called "other"

In [None]:
# Cumulative share of rows (in percent) up to which categories, taken from the
# most frequent downwards, are kept as "frequent".
FREQUENT_COVERAGE_PCT = 99.1


def _category_count_table(frame, column):
    """Return a per-category count table for `column`, most frequent first.

    The returned frame has the columns:
      * `column`                    - the category value
      * '<column>_counts'           - number of ids in that category
      * 'Percentage_Contribution'   - counts / len(frame); a fraction, not a
                                      percentage, despite the name
      * 'accumulative_perc'         - running share of the counted rows, in
                                      percent (0-100)
    """
    counts_name = '{}_counts'.format(column)
    table = (
        frame[[column, 'id']]
        .groupby(column)
        .count()
        .sort_values(by='id', ascending=False)
        .rename(columns={'id': counts_name})
    )
    table['Percentage_Contribution'] = table[counts_name] / len(frame)
    table['accumulative_perc'] = 100 * (table[counts_name].cumsum() / table[counts_name].sum())
    return table.reset_index()


def _frequent_categories(count_table, column, max_cumulative_pct=FREQUENT_COVERAGE_PCT):
    """Return the categories whose cumulative share is <= `max_cumulative_pct`."""
    within_coverage = count_table['accumulative_perc'] <= max_cumulative_pct
    return count_table[within_coverage][column].tolist()


#signup_flow
signup_flow_cats_Counts = _category_count_table(train_data, 'signup_flow')
signup_flow_freq_cats = _frequent_categories(signup_flow_cats_Counts, 'signup_flow')

#language
language_cats_Counts = _category_count_table(train_data, 'language')
language_freq_cats = _frequent_categories(language_cats_Counts, 'language')

#affiliate_provider
affiliate_provider_cats_Counts = _category_count_table(train_data, 'affiliate_provider')
affiliate_provider_freq_cats = _frequent_categories(affiliate_provider_cats_Counts, 'affiliate_provider')

#first_browser
first_browser_cats_Counts = _category_count_table(train_data, 'first_browser')
first_browser_freq_cats = _frequent_categories(first_browser_cats_Counts, 'first_browser')

In [None]:
def signup_flow_Cats_merger(category):
    """Map a signup_flow value to its string form, or to 'other' if it is rare.

    A value is kept when it appears in `signup_flow_freq_cats`. Because this
    function returns strings while that list holds the original values, a value
    is also kept when its string form matches the string form of a listed
    category. This makes the mapping idempotent: applying it again to
    already-mapped data leaves that data unchanged instead of turning every
    value into 'other'.
    """
    if category in signup_flow_freq_cats:
        return str(category)
    if str(category) in {str(kept) for kept in signup_flow_freq_cats}:
        return str(category)
    return 'other'


def language_Cats_merger(category):
    """Return `category` if it is in `language_freq_cats`, otherwise 'other'."""
    return category if category in language_freq_cats else 'other'


def affiliate_provider_Cats_merger(category):
    """Return `category` if it is in `affiliate_provider_freq_cats`, otherwise 'other'."""
    return category if category in affiliate_provider_freq_cats else 'other'


def first_browser_Cats_merger(category):
    """Return `category` if it is in `first_browser_freq_cats`, otherwise 'other'."""
    return category if category in first_browser_freq_cats else 'other'

In [None]:
# Collapse the rare categories of the four high-cardinality columns, applying
# the same merger functions to the train and test tables.
category_mergers = {
    'signup_flow': signup_flow_Cats_merger,
    'language': language_Cats_merger,
    'affiliate_provider': affiliate_provider_Cats_merger,
    'first_browser': first_browser_Cats_merger,
}
for frame in (train_data, test_x):
    for column_name, merger in category_mergers.items():
        frame[column_name] = frame[column_name].map(merger)

In [None]:
# Categorical columns, grouped by origin; the target column comes last.
cat_list = (
    # user / acquisition attributes
    ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel',
     'affiliate_provider', 'first_affiliate_tracked', 'signup_app',
     'first_device_type', 'first_browser']
    # calendar features derived from the first-booking date
    + ['month_first_book', 'quarter_first_book', 'DayOfWeek_first_book']
    # calendar features derived from the account-creation date
    + ['month_account_created', 'quarter_account_created', 'DayOfWeek_account_created']
    # holiday / weekend flags
    + ['Holiday_account_created', 'Weekend_first_book', 'Weekend_account_created']
    # target
    + ['country_destination']
)
len(cat_list)

## Visualizing the distribution of the 4 categorical columns whose rare categories got merged:

In [None]:
# Rebuild the per-destination count tables from the current train_data: for
# each of the first 19 entries of cat_list, count the ids per
# (country_destination, category) pair and split the result into one frame per
# destination country.
destinations = train_data['country_destination'].unique()
ls_of_column_counts_df_lists = []
for feature in cat_list[0:19]:
    counts = (
        train_data[['country_destination', feature, 'id']]
        .groupby(['country_destination', feature])
        .count()
        .rename(columns={'id': '{}_count'.format(feature)})
        .reset_index()
    )
    ls_of_column_counts_df_lists.append(
        [counts[counts['country_destination'] == destination] for destination in destinations]
    )

In [None]:
# Per-destination bar charts of category counts for cat_list[9]
# ('first_browser' in this notebook's cat_list): one panel per destination
# country on a 4x3 grid.
country_frames = ls_of_column_counts_df_lists[9]
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(20, 20))
for position, counts in enumerate(country_frames):
    if counts.empty:
        # Guard added: an empty frame has no first row to read the country from.
        continue
    feature = counts.columns[1]
    country = counts.iloc[0, 0]
    panel = axes.flat[position]  # row-major position in the grid
    panel.bar(counts.iloc[:, 1], counts.iloc[:, 2], color='maroon')
    panel.set_title('{}_count_in_'.format(feature) + country)
    # Label the x axis with the feature name (''.format(...) always gave '').
    panel.set_xlabel(feature)
    panel.set_ylabel('count_in_{}'.format(country))
print(country_frames[-1].columns[1])
plt.show()

In [None]:
# Per-destination bar charts of category counts for cat_list[5]
# ('affiliate_provider' in this notebook's cat_list): one panel per
# destination country on a 4x3 grid.
country_frames = ls_of_column_counts_df_lists[5]
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(25, 20))
for position, counts in enumerate(country_frames):
    if counts.empty:
        # Guard added: an empty frame has no first row to read the country from.
        continue
    feature = counts.columns[1]
    country = counts.iloc[0, 0]
    panel = axes.flat[position]  # row-major position in the grid
    panel.bar(counts.iloc[:, 1], counts.iloc[:, 2], color='maroon')
    panel.set_title('{}_count_in_'.format(feature) + country)
    # Label the x axis with the feature name (''.format(...) always gave '').
    panel.set_xlabel(feature)
    panel.set_ylabel('count_in_{}'.format(country))
print(country_frames[-1].columns[1])
plt.show()

In [None]:
# Per-destination bar charts of category counts for cat_list[3] ('language' in
# this notebook's cat_list): one panel per destination country on a 4x3 grid.
country_frames = ls_of_column_counts_df_lists[3]
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(20, 20))
for position, counts in enumerate(country_frames):
    if counts.empty:
        # Guard added: an empty frame has no first row to read the country from.
        continue
    feature = counts.columns[1]
    country = counts.iloc[0, 0]
    panel = axes.flat[position]  # row-major position in the grid
    panel.bar(counts.iloc[:, 1], counts.iloc[:, 2], color='maroon')
    panel.set_title('{}_count_in_'.format(feature) + country)
    # Label the x axis with the feature name (''.format(...) always gave '').
    panel.set_xlabel(feature)
    panel.set_ylabel('count_in_{}'.format(country))
print(country_frames[-1].columns[1])
plt.show()

In [None]:
# Per-destination bar charts of category counts for cat_list[2]
# ('signup_flow' in this notebook's cat_list): one panel per destination
# country on a 4x3 grid.
country_frames = ls_of_column_counts_df_lists[2]
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(20, 20))
for position, counts in enumerate(country_frames):
    if counts.empty:
        # No rows for this destination: its panel stays blank but keeps its slot.
        continue
    feature = counts.columns[1]
    country = counts.iloc[0, 0]
    panel = axes.flat[position]  # row-major position in the grid
    panel.bar(counts.iloc[:, 1], counts.iloc[:, 2], color='maroon')
    panel.set_title('{}_count_in_'.format(feature) + country)
    # Label the x axis with the feature name (''.format(...) always gave '').
    panel.set_xlabel(feature)
    panel.set_ylabel('count_in_{}'.format(country))
print(country_frames[-1].columns[1])
plt.show()

In [None]:
# Categorical columns to inspect after the rare categories were merged.
# Here country_destination sits between first_browser and the calendar features.
cat_list = [
    'gender',
    'signup_method',
    'signup_flow',
    'language',
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'signup_app',
    'first_device_type',
    'first_browser',
    'country_destination',
    'month_first_book',
    'quarter_first_book',
    'DayOfWeek_first_book',
    'month_account_created',
    'quarter_account_created',
    'DayOfWeek_account_created',
    'Holiday_account_created',
    'Weekend_first_book',
    'Weekend_account_created',
]
# Print the distinct values of each column together with how many there are.
for column_name in cat_list:
    distinct_values = train_data[column_name].unique()
    print(column_name + ' has unique values {}'.format(distinct_values)
          + ' which are {} categories'.format(len(distinct_values)))
    print()

## Variables Scaling and Transformation :

### 1- we will perform Standardization on the age variable
### 2- drop the date_account_created, timestamp_first_active, date_first_booking
### 3- perform one hot encoding on all the categorical columns


#### Since the age in the histograms visualization was right skewed, then we can perform logarithmic transformation on it to make the distribution nearly normal 

In [None]:
#transofrming the age on a logarithmic scale
train_data.age=np.log(train_data.age+1)
test_x.age=np.log(test_x.age+1)
test_x

### The age distributions after the logarithmic transformation:

In [None]:
# One histogram of log-scaled age per destination country on a 6x2 grid.
# At this point in the notebook `age` holds log(age + 1), so the "at most 100
# years" cut-off has to be expressed on the same scale. The raw threshold of
# 100 would keep every row and no longer exclude implausible ages.
max_log_age = np.log(100 + 1)
fig, axes = plt.subplots(nrows=6, ncols=2, figsize=(30, 50))
plausible_age = train_data['age'] <= max_log_age  # missing ages compare False
for position, country in enumerate(train_data['country_destination'].unique()):
    grid_row, grid_col = divmod(position, 2)
    booked_here = train_data['country_destination'] == country
    panel = axes[grid_row, grid_col]
    panel.hist(train_data[plausible_age & booked_here]['age'], bins=80)
    panel.set_title('age_distribution_in_{}'.format(country))
    panel.set_xlabel('log(age + 1)')
plt.show()

### We need to drop the ID, and transform the features as follows:
#### 1- Cyclic features: month_first_book, quarter_first_book, DayOfWeek_first_book, month_account_created, quarter_account_created,
#### DayOfWeek_account_created
#### 2- Categorical features to be one hot encoded: 'gender', 'signup_method', 'signup_flow', 'language','affiliate_channel',
#### 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser','Holiday_account_created',
#### 'Weekend_first_book', 'Weekend_account_created'


In [None]:
# Remove the `id` column from both frames in place.
# The test ids are kept aside in test_x_id before the column disappears.
del train_data['id']
test_x_id = test_x.pop('id')

In [None]:
# Number of missing values in each column of train_data.
train_data.isna().sum()

Fill the NaN values in age with the mean of the log-transformed ages from the training data.

In [None]:
# Mean imputation of the age column in both frames.
# train_data is filled first; the value written into test_x is the mean of
# the training column taken after that fill.
age_train = train_data['age']
train_data['age'] = age_train.fillna(age_train.mean())
train_age_mean = train_data['age'].mean()
test_x['age'] = test_x['age'].fillna(train_age_mean)

In [None]:
# Missing first_affiliate_tracked entries get the explicit placeholder 'unk'
# in both the training and the test frame.
for frame in (train_data, test_x):
    frame['first_affiliate_tracked'] = frame['first_affiliate_tracked'].fillna('unk')

In [None]:
# Fill the missing first-booking calendar features with the most frequent
# training value (the first entry of the training mode), in both frames.
for col in ('month_first_book', 'quarter_first_book', 'DayOfWeek_first_book'):
    train_mode = train_data[col].mode()[0]
    train_data[col] = train_data[col].fillna(train_mode)
    test_x[col] = test_x[col].fillna(train_mode)


In [None]:
# Missing values left per column after imputation, train and test side by side.
# A notebook cell only renders its last expression, so evaluating the two
# counts one after the other would show the test counts alone. Combining them
# into one table keeps both visible. A column that exists in only one frame
# shows NaN for the other.
pd.concat(
    [train_data.isna().sum(), test_x.isna().sum()],
    axis=1,
    keys=['train_data', 'test_x'],
)

One Hot Encoding the Categorical Features:

In [None]:
# Columns that will be one-hot encoded; the same selection is taken from the
# training and the test frame.
one_hot_cols = [
    'gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel',
    'affiliate_provider', 'first_affiliate_tracked',
    'signup_app', 'first_device_type', 'first_browser',
    'Holiday_account_created', 'Weekend_first_book', 'Weekend_account_created',
]
cat_train_data = train_data[one_hot_cols]
cat_test_x = test_x[one_hot_cols]

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns.
# The encoder is fitted on the training frame only and then applied to the
# test frame, so column k means the same category in both outputs. Fitting a
# second time on the test frame would derive the columns from the test
# categories and break that correspondence.
# handle_unknown='ignore' encodes a category that appears only in the test
# frame as all zeros instead of raising.
try:
    cat_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    # scikit-learn < 1.2 only knows the older `sparse` keyword
    cat_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

cat_train_data = cat_encoder.fit_transform(cat_train_data)
cat_one_hot_df = pd.DataFrame(cat_train_data)

cat_test_x = cat_encoder.transform(cat_test_x)
cat_test_x_one_hot_df = pd.DataFrame(cat_test_x)
cat_test_x_one_hot_df

In [None]:
# Calendar features that are encoded cyclically further down.
cyclic_cols = ['month_first_book', 'quarter_first_book', 'DayOfWeek_first_book',
               'month_account_created', 'quarter_account_created', 'DayOfWeek_account_created']
# Explicit copies: these frames are assigned into column by column afterwards,
# and writing into a bare selection of another frame makes pandas emit
# SettingWithCopyWarning.
cyclic_cols_df = train_data[cyclic_cols].copy()
T_cyclic_cols_df = test_x[cyclic_cols].copy()

In [None]:
import math

# Cosine-encode each calendar column of the training frame:
# the value is scaled to an angle of 2*pi * value / (column maximum) and
# replaced by the cosine of that angle.
for col in ['month_first_book', 'quarter_first_book', 'DayOfWeek_first_book',
            'month_account_created', 'quarter_account_created', 'DayOfWeek_account_created']:
    angle = 2 * math.pi * cyclic_cols_df[col] / cyclic_cols_df[col].max()
    cyclic_cols_df[col] = np.cos(angle)

In [None]:
# Cosine-encode the calendar columns of the test frame.
# The angle is 2*pi * value / period, where the period of a column is its
# maximum in train_data, the same column the training features are scaled by.
# Dividing by the test frame's own maximum would put the two sets on different
# scales whenever the test rows do not reach the training maximum.
for col in ['month_first_book', 'quarter_first_book', 'DayOfWeek_first_book',
            'month_account_created', 'quarter_account_created', 'DayOfWeek_account_created']:
    period = train_data[col].max()
    T_cyclic_cols_df[col] = np.cos(2 * math.pi * T_cyclic_cols_df[col] / period)

In [None]:
# Standardize the age column.
from sklearn.preprocessing import StandardScaler

# One scaler, fitted on the training ages, is applied to both frames so that
# the test ages are centred and scaled with the training mean and standard
# deviation. Fitting a separate scaler on the test frame would standardize
# it against its own statistics.
age_scaler = StandardScaler()
st_cols = pd.DataFrame(age_scaler.fit_transform(train_data[['age']]), columns=['age'])

T_st_cols = pd.DataFrame(age_scaler.transform(test_x[['age']]), columns=['age'])
T_st_cols

### Merging the preprocessed dataframes:

In [None]:
# Assemble the model inputs by joining, on the row index, the standardized
# age, the cyclic features and the one-hot columns: first for the training
# data, then for the test data.
on_index = dict(left_index=True, right_index=True)

num_df = st_cols.merge(cyclic_cols_df, **on_index)
preprocessed_data = num_df.merge(cat_one_hot_df, **on_index)

T_num_df = T_st_cols.merge(T_cyclic_cols_df, **on_index)
T_preprocessed_data = T_num_df.merge(cat_test_x_one_hot_df, **on_index)
T_preprocessed_data

In [None]:
# Target variable: the destination country column of the training frame.
y= train_data['country_destination']

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the destination labels in y by integer codes; `le` keeps the
# mapping between codes and original labels.
le = LabelEncoder()
y = le.fit_transform(y)

### Train test split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the preprocessed training rows as a development set;
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(preprocessed_data, y, test_size=0.25, random_state=42)

## Modeling

In [None]:
from xgboost import XGBClassifier, plot_importance
# Classifier with default hyper-parameters apart from use_label_encoder.
# NOTE(review): use_label_encoder is presumably only meaningful for older
# xgboost releases — confirm against the installed version.
xgb = XGBClassifier(use_label_encoder=False)                  
# Fit on the training part of the split.
xgb.fit(X_train, y_train)

### Predictions for Development set

In [None]:
# Predicted class probabilities for the held-out development rows.
y_pred = xgb.predict_proba(X_test)

In [None]:
# Plot the importance of at most ten features of the fitted model.
plot_importance(xgb, max_num_features=10)
# Importance values as exposed by the fitted classifier.
f_importances= xgb.feature_importances_

In [None]:
from sklearn.preprocessing import LabelBinarizer

# Binarizer that turns the integer class codes into one indicator column per
# class. The codes produced by `le` run from 0 to len(le.classes_) - 1, so the
# number of columns is taken from the label encoder instead of being
# hard-coded.
lb = LabelBinarizer()
lb.fit(range(len(le.classes_)))

In [None]:
# Indicator-matrix form of the development labels, as used by ndcg_score below.
y_test_enc = lb.transform(y_test)

In [None]:
from sklearn import metrics

# NDCG over the five highest-scored classes for the first model, comparing
# the binarized development labels with the predicted probabilities.
model_1_score = metrics.ndcg_score(y_test_enc, y_pred, k=5)
model_1_score

In [None]:
# Class probabilities for the preprocessed test frame, from the model fitted
# on X_train above.
# NOTE(review): T_preprocessed_data has to expose the same feature columns,
# in the same order, as the training matrix — confirm.
test_predictions= xgb.predict_proba(T_preprocessed_data)

In [None]:
# Table of per-feature scores taken from the booster (importance type
# 'weight'): one row per feature, columns `feature` and `f_score`.
feature_important = xgb.get_booster().get_score(importance_type='weight')
f_importance = (
    pd.DataFrame.from_dict(feature_important, orient='index')
    .reset_index()
    .rename(columns={'index': 'feature', 0: 'f_score'})
)
# Show the 40 highest-scoring features.
f_importance.sort_values(by='f_score', ascending=False).head(40)

In [None]:
# Names of the 50 features with the highest f_score, best first.
imp_feature_ls= f_importance.sort_values(by= 'f_score', ascending=False).head(50)['feature'].tolist()

### Retrying the model with the top 50 important features:

In [None]:
# Give every column of X_train a string label: the named features already
# are strings, and the integer labels of the one-hot columns become
# '0', '1', '2', ...
# Mapping with `str` converts every label, whereas a hand-written
# {0: '0', ..., 70: '70'} dictionary leaves any integer label above 70
# unchanged.
X_train = X_train.rename(columns=str)

In [None]:
# Give every column of preprocessed_data a string label so that columns can
# be selected with the feature names in imp_feature_ls (presumably reported
# by the booster as strings such as '12' — hence this conversion).
# Mapping with `str` converts every label, whereas a hand-written
# {0: '0', ..., 70: '70'} dictionary leaves any integer label above 70
# unchanged, and a selected feature with such a label could then not be found.
preprocessed_data = preprocessed_data.rename(columns=str)

In [None]:
# Keep only the columns listed in imp_feature_ls, in that order.
Selected_features_data= preprocessed_data[imp_feature_ls]

In [None]:
# Same split parameters (test_size, random_state) as for the first model,
# applied to the reduced feature set.
X_train_, X_test_, y_train_, y_test_ = train_test_split(Selected_features_data, y, test_size=0.25, random_state=42)
# Refit the existing `xgb` object; from here on it refers to the model
# trained on the selected features.
xgb.fit(X_train_, y_train_)

In [None]:
# Predicted class probabilities for the development rows, selected features only.
y_pred_ = xgb.predict_proba(X_test_)

In [None]:
# Indicator-matrix form of the development labels for the second model.
y_test_enc_ = lb.transform(y_test_)

In [None]:
# NDCG over the five highest-scored classes for the model refitted on the
# selected features.
model_2_score = metrics.ndcg_score(y_test_enc_, y_pred_, k=5)
model_2_score