# Airbnb Customer Segments Analysis

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing Data

### Users Data

In [None]:
# Load the training users table from its zipped CSV, then preview the first five rows.
users_csv_path = "/kaggle/input/airbnb-recruiting-new-user-bookings/train_users_2.csv.zip"
data = pd.read_csv(users_csv_path)
data.head()

In [None]:
# Print a summary of the users table (columns, non-null counts, dtypes, memory usage).
data.info()

In [None]:
# Continue with a reproducible 10% random sample of the users table (seed fixed at 42).
# NOTE(review): presumably sampled to keep the clustering below tractable — confirm.
customers_data = data.sample(frac = 0.1, random_state = 42)

### Sessions Data

In [None]:
# Load the sessions table from its zipped CSV and print a column/dtype summary.
sessions_csv_path = "../input/airbnb-recruiting-new-user-bookings/sessions.csv.zip"
sessions_data = pd.read_csv(sessions_csv_path)
sessions_data.info()

In [None]:
# Preview the first five session rows (head() defaults to n=5).
sessions_data.head()

In [None]:
# Boolean mask: True where a row's user_id has already appeared in an earlier row,
# i.e. the sessions table holds several rows per user and must be aggregated before joining.
sessions_data['user_id'].duplicated()

### Datasets Aggregation and Join

In [None]:
# Collapse the session log to one row per user: the mean of secs_elapsed.
sessions_data = (
    sessions_data
    .groupby('user_id')['secs_elapsed']
    .mean()
    .to_frame()
    .reset_index()
)
sessions_data.head()

### Seconds Elapsed

In [None]:
# Convert seconds to minutes, then compress the scale with log(1 + x).
# NOTE: despite its name, 'minutes_elapsed' therefore stores log-scaled minutes.
elapsed_minutes = sessions_data['secs_elapsed'] / 60
sessions_data['minutes_elapsed'] = np.log1p(elapsed_minutes)
# Histogram of the transformed values
sessions_data['minutes_elapsed'].plot.hist()

In [None]:
# The raw seconds column is no longer needed once the transformed column exists.
sessions_data = sessions_data.drop(columns=['secs_elapsed'])

In [None]:
# Align the session column names with the users table, then left-join on 'id'
# so every sampled customer is kept even when no session data exists for them.
session_column_names = {'user_id': 'id', 'minutes_elapsed': 'mean_minutes_elapsed'}
sessions_data = sessions_data.rename(columns=session_column_names)
customers_data = customers_data.merge(sessions_data, how='left', on='id')
customers_data.head()

## Preprocessing

### Time Features

In [None]:
# Time-related features derived from date_account_created
# Parse the ISO-formatted date strings once and reuse the parsed series below.
account_created = pd.to_datetime(customers_data['date_account_created'], format='%Y-%m-%d')
customers_data['date_account_created'] = account_created

# Break the date into its year / month / day components
customers_data['yr_date_account_created'] = account_created.dt.year
customers_data['m_date_account_created'] = account_created.dt.month
customers_data['d_date_account_created'] = account_created.dt.day

# Disabled: the same treatment for timestamp_first_active
# customers_data['timestamp_first_active'] = pd.to_datetime(customers_data['timestamp_first_active'], format = '%Y%m%d%H%M%S')
# customers_data['m_timestamp_first_active'] = customers_data['timestamp_first_active'].dt.month

### Demographic and Event Features

In [None]:
# Demographic features: information about the customers themselves
demographic_features = ['gender', 'language']

# Event features: what happened on the website
event_features = [
    'signup_method',
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'signup_app',
    'first_device_type',
    'first_browser',
]

# The KPI: 'No' when country_destination is 'NDF', 'Yes' for every other value
is_ndf = customers_data['country_destination'] == 'NDF'
customers_data['Booked'] = np.where(is_ndf, 'No', 'Yes')

# Columns that are not used as clustering features
columns_to_drop = [
    'id',
    'date_account_created',
    'timestamp_first_active',
    'date_first_booking',
    'age',
    'signup_flow',
    'country_destination',
]
customers_data = customers_data.drop(columns=columns_to_drop)

# Flat list of every categorical feature, KPI included
categorical_features = demographic_features + event_features + ["Booked"]

customers_data.info()

### Filling NaNs

In [None]:
# Impute missing values: the most frequent value for the categorical column,
# the column mean for the (log-scaled) session-time column.
# The filled columns are assigned back to the frame on purpose: calling
# fillna(inplace=True) on a column selection is chained assignment, which is
# deprecated and leaves the frame unchanged under pandas Copy-on-Write, so the
# NaNs would survive into the clustering matrix.
most_common_affiliate = customers_data['first_affiliate_tracked'].mode()[0]
mean_session_time = customers_data['mean_minutes_elapsed'].mean()
customers_data['first_affiliate_tracked'] = customers_data['first_affiliate_tracked'].fillna(most_common_affiliate)
customers_data['mean_minutes_elapsed'] = customers_data['mean_minutes_elapsed'].fillna(mean_session_time)

In [None]:
# Re-print the frame summary to check the non-null counts after imputation.
customers_data.info()

In [None]:
# Integer column positions of the categorical features, in categorical_features order;
# the clustering model is told which matrix columns are categorical by position.
cat_columns_pos = list(map(customers_data.columns.get_loc, categorical_features))
# Names of all object-dtype columns, listed in frame order
object_column_names = customers_data.select_dtypes('object').columns.tolist()
print(f"Categorical columns: {object_column_names}")
print(f"Categorical columns position: {cat_columns_pos}")

## Modeling on the Dataset

### K-Prototype Clustering Model

In [None]:
# Credit goes to: 
# https://towardsdatascience.com/the-k-prototype-as-clustering-algorithm-for-mixed-data-type-categorical-and-numerical-fe7c50538ebb

from kmodes.kprototypes import KPrototypes

from plotnine import *
import plotnine

from tqdm.notebook import tqdm

# Feature matrix (numeric and categorical columns together) handed to the model
df_matrix = customers_data.to_numpy()

# Elbow method: fit one model per candidate k and record its final cost
candidate_ks = list(range(1, 10))
cost = []
for cluster in tqdm(candidate_ks):
    kprototype = KPrototypes(n_clusters=cluster, init='Huang', n_jobs=-1, random_state=42)
    kprototype.fit_predict(df_matrix, categorical=cat_columns_pos)
    cost.append(kprototype.cost_)
    print(f'Cluster initiation: {cluster}')

# Collect the (k, cost) pairs in a frame for plotting
df_cost = pd.DataFrame({'Cluster': candidate_ks, 'Cost': cost})

# Cost curve: line + points, each point labelled with its k
plotnine.options.figure_size = (8, 4.8)
elbow_plot = (
    ggplot(df_cost, aes(x='Cluster', y='Cost'))
    + geom_line()
    + geom_point()
    + geom_label(aes(label='Cluster'), size=10, nudge_y=1000)
    + labs(title='Optimal number of cluster with Elbow Method',
           x='Number of Clusters k',
           y='Cost')
    + theme_minimal()
)
# Last expression of the cell so the notebook renders the figure
elbow_plot

In [None]:
# Modeling using the best K
# Refit K-Prototypes with 6 clusters, same initialisation and seed as the search above.
# NOTE(review): 6 is presumably the elbow read off the cost curve — confirm against the plot.
kprototype = KPrototypes(n_jobs = -1, n_clusters = 6, init = 'Huang', random_state = 42)
# As the cell's last expression, the value returned by fit_predict is displayed by the notebook.
kprototype.fit_predict(df_matrix, categorical = cat_columns_pos)

### Clustering Interpretation

In [None]:
# Display the fitted model's cluster centroids.
kprototype.cluster_centroids_

In [None]:
# Attach each customer's cluster label and bring the session time back from
# the log1p scale to plain minutes (expm1 is the inverse of log1p).
cluster_labels = kprototype.labels_
session_minutes = np.expm1(customers_data['mean_minutes_elapsed'])
customers_data['cluster'] = cluster_labels
customers_data['mean_minutes_elapsed'] = session_minutes
customers_data.head()

In [None]:
# Save the labelled customers (index included) to customers_data.csv in the working directory.
customers_data.to_csv('customers_data.csv')

In [None]:
# Share of 'Yes' / 'No' bookings inside each cluster (proportions sum to 1 per cluster).
customers_data.groupby('cluster')['Booked'].value_counts(normalize = True)

In [None]:
import seaborn as sns

# Plot, for every cluster, the share of customers that booked vs. did not book.
# normalize='index' makes each row sum to 1 and reports 0 (rather than NaN) for a
# cluster/outcome combination with no customers, so no heatmap cell is left blank.
booked_share_by_cluster = pd.crosstab(customers_data['cluster'],
                                      customers_data['Booked'],
                                      normalize = 'index')
sns.heatmap(booked_share_by_cluster, annot = True)

### Customer Segment 1 Properties

In [None]:
# Write the labelled customers to customers_data.csv.
# NOTE(review): this repeats the export done right after the cluster labels were attached,
# with no change to the frame in between — looks redundant; confirm before removing.
customers_data.to_csv('customers_data.csv')

In [None]:
# Relative size of each cluster: fraction of all sampled customers assigned to it.
customers_data['cluster'].value_counts(normalize = True)