# 1. Business Understanding

# 2. Data Integration

## 2.1 Libraries Import

In [1]:
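# Necessary installations (uncomment only if a package is missing;
# %pip, unlike !pip, installs into the environment of the running kernel)

# %pip install seaborn
# %pip install matplotlib
# %pip install numpy
# %pip install pandas
# %pip install scikit-learn
# %pip install scipy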


# Necessary imports

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA


# Render matplotlib figures inline, directly in the notebook output
%matplotlib inline

# Request 'retina' (high-DPI) figures from the inline backend for sharper plots
%config InlineBackend.figure_format = 'retina'

# Apply seaborn's theme and set the default figure size to 11.7 x 8.27 inches
# (presumably chosen to match A4 landscape -- confirm if the size matters)
sns.set(rc={'figure.figsize':(11.7,8.27)})

## 2.2 Data Load

In [8]:
# Load the hotel customer dataset. The file is semicolon-delimited and the
# 'ID' column is used as the row index. set_index() is chained instead of
# being called with inplace=True, so `df` is bound exactly once and no
# already-assigned frame is mutated by a later statement in this cell.
df = pd.read_csv('Case1_HotelCustomerSegmentation.csv', sep=';').set_index('ID')

### 2.2.1 Metadata

ID: Customer ID

Nationality: Nationality of the customer in ISO 3166-1 (Alpha 3) format

Age: The age of the customer

DaysSinceCreation: Number of elapsed days since the customer was created

NameHash: Hash of the customer's name

DocIDHash: Hash of the customer’s personal document identification number (usually a passport or ID card)

AverageLeadTime: Average number of days before arrival date the customer makes bookings

LodgingRevenue: Total amount of lodging revenue paid by the customer so far

OtherRevenue: Total amount of other revenue (e.g., food & beverage, spa, etc.) paid by the customer so far

BookingsCanceled: Number of bookings the customer made but subsequently canceled

BookingsNoShowed: Number of bookings the customer made but then failed to show up for ("no-show")

BookingsCheckedin: Number of bookings the customer made that actually resulted in a stay

PersonNights: Total person/nights the customer has stayed at the hotel so far. Person/nights are the sum of adults and children in
each booking, multiplied by the number of nights (length of stay) of the booking

RoomNights: Total room/nights the customer has stayed at the hotel so far. Room/nights are the number of rooms of each booking
multiplied by the number of nights (length of stay) of the booking

DistributionChannel: Distribution channel normally used by the customer to make bookings at the hotel

MarketSegment: Current market segment of the customer

SRHighFloor: Indication if the customer usually asks for a room on a higher floor (0: No, 1: Yes)

SRLowFloor: Indication if the customer usually asks for a room on a lower floor (0: No, 1: Yes)

SRAccessibleRoom: Indication if the customer usually asks for an accessible room (0: No, 1: Yes)

SRMediumFloor: Indication if the customer usually asks for a room on a middle floor (0: No, 1: Yes)

SRBathtub: Indication if the customer usually asks for a room with a bathtub (0: No, 1: Yes)

SRShower: Indication if the customer usually asks for a room with a shower (0: No, 1: Yes)

SRCrib: Indication if the customer usually asks for a crib (0: No, 1: Yes)

SRKingSizeBed: Indication if the customer usually asks for a room with a king size bed (0: No, 1: Yes)

SRTwinBed: Indication if the customer usually asks for a room with a twin bed (0: No, 1: Yes)

SRNearElevator: Indication if the customer usually asks for a room near the elevator (0: No, 1: Yes)

SRAwayFromElevator: Indication if the customer usually asks for a room away from the elevator (0: No, 1: Yes)

SRNoAlcoholInMiniBar: Indication if the customer usually asks for a room with no alcohol in the mini bar (0: No, 1: Yes)

SRQuietRoom: Indication if the customer usually asks for a room away from the noise (0: No, 1: Yes)

NOTE: All time-based columns (e.g., Age or DaysSinceCreation) were calculated at the dataset extraction date.

### 2.2.2 Data Overview

In [9]:
# Preview the first five rows to sanity-check the load
df.head(n=5)

Unnamed: 0_level_0,Nationality,Age,DaysSinceCreation,NameHash,DocIDHash,AverageLeadTime,LodgingRevenue,OtherRevenue,BookingsCanceled,BookingsNoShowed,...,SRMediumFloor,SRBathtub,SRShower,SRCrib,SRKingSizeBed,SRTwinBed,SRNearElevator,SRAwayFromElevator,SRNoAlcoholInMiniBar,SRQuietRoom
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,PRT,52.0,440,0x2C371FD6CE12936774A139FD7430C624F1C4D5109CE6...,0x434FD3D59469C73AFEA087017FAF8CA2296493AEABDE...,59,292.0,82.3,1,0,...,0,0,0,0,0,0,0,0,0,0
2,PRT,,1385,0x198CDB98BF37B6E23F9548C56A88B00912D65A9AA0D6...,0xE3B0C44298FC1C149AFBF4C8996FB92427AE41E4649B...,61,280.0,53.0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,DEU,32.0,1385,0xDA46E62F66936284DF2844EC4FC542D0DAD780C0EE0C...,0x27F5DF762CCDA622C752CCDA45794923BED9F1B66300...,0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,FRA,61.0,1385,0xC45D4CD22C58FDC5FD0F95315F6EFA5A6E7149187D49...,0x8E59572913BB9B1E6CAA12FA2C8B7BF387B1D1F3432E...,93,240.0,60.0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,FRA,52.0,1385,0xD2E3D5BFCA141865669F98D64CDA85AD04DEFF47F8A0...,0x42BDEE0E05A9441C94147076EDDCC47E604DA5447DD4...,0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0


# 3. Data Exploration