In [None]:
import numpy as np
import pandas as pd

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")
plt.rcParams['axes.labelsize'] = 16
plt.rcParams['font.size'] = 15

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

[1: Exploratory DATA Analysis](#1)
- [1.1: Load the data](#1.1)
- [1.2: Add a few columns based on data description](#1.2)
- [1.3: Basic info on data](#1.3)
- [1.4: Stat numerical data](#1.4)
- [1.5: Stat categorical data](#1.5)

[2: Data extraction](#2)
- [2.1: Homonyms](#2)
- [2.2: Number of members per family](#2.2)
- [2.3: Number of kids in the family](#2.3)
- [2.4: Number of passengers per group](#2.4)
- [2.5: Occurrence of the same first name](#2.5)
- [2.6: Cabin number](#2.6)
- [2.7: Number of options](#2.7)

[3: Research of dependency between variables](#3)
- [3.1: Dependency and relation with: **'Age'**](#3.1)
    - [3.1.1: Relations with categorical variables](#3.1)
    - [3.1.2: Relations with categorical and bool variables](#3.1.2)
- [3.2: Dependency and relation with: **'Group'**](#3.2)
    - [3.2.1: Relations with categorical variables](#3.2)
    - [3.2.2: Relations with categorical and bool variables](#3.2.2)
- [3.3: Dependency and relation with: **'Number within group'**](#3.3)
    - [3.3.1: Relations with categorical variables](#3.3)
    - [3.3.2: Relations with categorical and bool variables](#3.3.2)
- [3.4: Dependency and relation with: **'Occurence_First_Name'**](#3.4)
    - [3.4.1: Relations with categorical variables](#3.4)
    - [3.4.2: Relations with categorical and bool variables](#3.4.2)
- [3.5: Dependency and relation with: **'Len_cab2'**](#3.5)
    - [3.5.1: Relations with categorical variables](#3.5)
    - [3.5.2: Relations with categorical and bool variables](#3.5.2)
- [3.6: Dependency and relation with: **'Number_of_option'**](#3.6)
    - [3.6.1: Relations with categorical variables](#3.6)
    - [3.6.2: Relations with categorical and bool variables](#3.6.2)

[4: Replacing missing data](#4)

   - [4.1: Replacing missing data based on observed relations](#4.1)
   - [4.2: Replacing missing data with median for specific group](#4.2)
   - [4.3: Replacing missing values with the median calculated on the entire population](#4.3)
   - [4.4: Add one last new feature](#4.4)

[5: Correlations between variables](#5)

[6: Machine learning](#6)

  - [6.1: Data preprocessing](#6.1)
  - [6.2: Model selection](#6.2)
  - [6.3: Feature selection](#6.3)
  - [6.4: Hyperparameter tuning](#6.4)

## <a id="1"></a>
<div style="
           border-radius:50px;
           background-color:#7ca4cd;
           font-size:200%;
           font-family:Arial;
           letter-spacing:0.10px">
<p style="padding: 10px;
          color:white;
          text-align:center;">1: Exploratory Data Analysis
</p>
</div>

## <a id="1.1"></a>
## 1.1: Load the data

In [None]:
# Read the training split and tag each row with its origin, so the two splits
# can still be told apart once they are stacked into a single frame.
train_csv = r"../input/spaceship-titanic/train.csv"
df_train = pd.read_csv(train_csv).assign(df='train')
print(df_train.shape)
df_train.head(2)

In [None]:
# Read the test split and tag each row with its origin (same scheme as train).
test_csv = r"../input/spaceship-titanic/test.csv"
df_test = pd.read_csv(test_csv).assign(df='test')
print(df_test.shape)
df_test.head(2)

In [None]:
# Split the label off the training frame, then stack train and test so that the
# feature engineering below is applied to both splits at once.
target = df_train['Transported']
df_train = df_train.drop(columns=['Transported'])

# reset_index() is deliberately called without drop=True: the per-split row
# number is kept in an 'index' column, which later cells list in col_to_drop.
stacked = pd.concat([df_train, df_test])
all_data = stacked.reset_index()
print(all_data.shape)
all_data.head(2)

## <a id="1.2"></a>
## 1.2: Add a few columns based on data description.

In [None]:
# 'PassengerId' has the form gggg_pp, where gggg is the group the passenger is
# travelling with and pp is their number within the group. Both are stored as numbers.
id_parts = all_data['PassengerId'].str.split('_', expand=True).apply(pd.to_numeric)
all_data[['group', 'nb_within_group']] = id_parts

# Separate first name and last name. n=1 splits on the first space only, so the
# result always has at most two columns even if a name contains extra spaces
# (an unlimited split would then yield 3+ columns and the assignment would raise).
all_data[['first_name', 'last_name']] = all_data['Name'].str.split(' ', n=1, expand=True)

# PassengerId is fully described by the two new columns, so drop it.
all_data = all_data.drop(['PassengerId'], axis=1)
print(all_data.shape)

all_data.head(2)

## <a id="1.3"></a>
## 1.3: Basic info on data

In [None]:
# Column dtypes and non-null counts of the combined train + test frame.
all_data.info()

In [None]:
# Number of missing values in each column.
all_data.isnull().sum()

## <a id="1.4"></a>
## 1.4: Stat numerical data

In [None]:
# Basic summary statistics of the numeric columns.
all_data.describe()

In [None]:
# Overview of the numeric columns on a 3x3 grid: density histograms for Age and
# group, a count plot for nb_within_group (bars annotated with their share of
# all rows) and one box plot per spending column. The ninth panel stays empty.
fig, axes = plt.subplots(3, 3, figsize=(15, 10))
fig.subplots_adjust(wspace=0.4, hspace=0.3)
panels = axes.flatten()

fig.suptitle("Distribution numerical vaiables:", fontsize=18)

hist_style = dict(kde=True, stat="density", linewidth=1.0)
sns.histplot(data=all_data, x='Age', color="darkblue", ax=panels[0], **hist_style)
sns.histplot(data=all_data, x='group', color="red", ax=panels[1], **hist_style)

count_ax = panels[2]
sns.countplot(data=all_data, x='nb_within_group', ax=count_ax)
total_rows = len(all_data)
for bar in count_ax.patches:
    share = int(bar.get_height() * 100 / total_rows)
    top_center = (bar.get_x() + bar.get_width() / 2., bar.get_height())
    count_ax.annotate("{}%".format(share), top_center,
                      ha='center', va='center', xytext=(0, 10), textcoords='offset points')

spending_columns = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
for spend_col, spend_ax in zip(spending_columns, panels[3:8]):
    sns.boxplot(data=all_data, x=spend_col, ax=spend_ax)
plt.show()

It seems that the values of 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa' and 'VRDeck' are mostly outliers, but:
- How many passengers took these options?
- Did they take several options?
- How are these outliers distributed?

In [None]:
# For each paid option: report how many passengers used it, then replace the
# raw amount by a log-scaled amount and plot the distribution of the spenders.
# NOTE: this cell transforms all_data in place; re-running it without
# reloading the data applies the log transform a second time.
list_option = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for col in list_option:
    # passengers who spent something (counted before missing values are filled)
    df_option = all_data[all_data[col]>0]
    # a missing amount is treated as "nothing spent"
    all_data[col] = all_data[col].fillna(0)
    # log10(1 + x) keeps 0 at 0 and keeps every positive amount strictly
    # positive. A plain log10 maps an amount of exactly 1 to 0, which every
    # later "> 0" test would wrongly read as "did not take the option".
    all_data[col] = np.log10(1 + all_data[col])
    print("There are only {} passenger on {} who took the option {}".format(df_option.shape[0],all_data.shape[0],col))

fig = plt.figure(figsize=(15,5))
fig.subplots_adjust(hspace = 0.3, wspace=0.5)
plt.suptitle('Amount (higer than 0) spent by passengers for:')
for i, col in enumerate(list_option, start=1):
    # only the passengers who actually spent something on this option
    df_col = all_data[all_data[col]>0]
    ax = fig.add_subplot(1,5,i)
    sns.histplot(data=df_col, x=col, ax=ax)
plt.show()

## <a id="1.5"></a>
## 1.5: Stat categorical data

In [None]:
# Summary of the non-numeric columns (numeric dtypes are excluded).
all_data.describe(exclude=[np.number])

- There are 12969 passengers distributed into 9825 cabins.
- There are possibly 2406 families, but maybe more because 46 passengers have the same name (12629 names + 294 missing values + 46 = total passengers).

In [None]:
# One pie chart per categorical column, showing the share of each category
# (missing values are not counted by value_counts).
fig = plt.figure(figsize=(15,8))
fig.subplots_adjust(wspace=0.4)

fig.suptitle("Distribution categorical vaiables present in 'all_data':", fontsize=18, y=0.95)

pie_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
for position, cat_col in enumerate(pie_columns, start=1):
    pie_ax = fig.add_subplot(2, 2, position)
    shares = all_data[cat_col].value_counts(normalize=True)
    shares.plot(kind='pie', autopct="%.1f", ax=pie_ax)
plt.show()

## <a id="2"></a>
<div style="
           border-radius:50px;
           background-color:#7ca4cd;
           font-size:200%;
           font-family:Arial;
           letter-spacing:0.10px">
<p style="padding: 10px;
          color:white;
          text-align:center;">2: Data extraction
</p>
</div>

## 2.1: homonyme

To deal with passengers having the same name, we make the following assumptions:
- if they have a different home planet or a different destination, they are not from the same family.
- if they have the same home planet, the same destination and a large age difference (kid and parent), they are from the same family.


In [None]:
# Step 1: collect every passenger whose full name occurs more than once.
# Rows without a name are excluded; sorting by name puts homonyms on adjacent rows.
has_repeated_name = all_data.duplicated(['Name'], keep=False)
df_duplicate = (
    all_data[has_repeated_name]
    .dropna(subset=['Name'])
    .sort_values(by=['Name'])
)
print(df_duplicate.shape)

In [None]:
# Step 2: decide which homonyms do NOT belong to the same family.
# df_duplicate is sorted by Name, so each row is compared with the row above it.
# A passenger judged to be from another family gets the suffix '-2' on both
# 'Name' and 'last_name': the family features are computed from 'last_name',
# so renaming 'Name' alone would have no effect on them.

# Rule 1: same name but different destination -> different families.
same_name = df_duplicate['Name'] == df_duplicate['Name'].shift(1)
other_destination = df_duplicate['Destination'] != df_duplicate['Destination'].shift(1)
condition1 = same_name & other_destination
df_duplicate.loc[condition1, 'Name'] += '-2'
df_duplicate.loc[condition1, 'last_name'] += '-2'

# Rule 2: same name and an age gap too small for a parent/child pair
# (you may name your child after yourself, but not give two of your kids the same name).
# The gap is taken in absolute value because rows are sorted by name, not by age:
# the signed difference would flag any older-then-younger pair as "too small".
same_name = df_duplicate['Name'] == df_duplicate['Name'].shift(1)
age_gap = (df_duplicate['Age'] - df_duplicate['Age'].shift(1)).abs()
condition2 = same_name & (age_gap < 16)
df_duplicate.loc[condition2, 'Name'] += '-2'
df_duplicate.loc[condition2, 'last_name'] += '-2'

# let's check how many names we changed
nb_name_change = df_duplicate['Name'].str.endswith('-2').sum()
print("We changed {} names on {} duplicates".format(nb_name_change, len(df_duplicate)))

In [None]:
# Step 3: write the renamed homonyms back into all_data by removing their
# original rows and appending the edited copies held in df_duplicate.
row_to_drop = all_data.index[df_duplicate.index]
print('Number Homonyne (rows to drop)', len(row_to_drop))

print('Shape df before dropping', all_data.shape)
all_data = all_data.drop(index=row_to_drop)
print('Shape df after dropping', all_data.shape)

all_data = pd.concat([all_data, df_duplicate])
print('Shape df after updating', all_data.shape)

## <a id="2.2"></a> 2.2: Number of member per familly

In [None]:
# Family size = number of passengers sharing the same last name.
# Counting the groups directly avoids summing every column of the frame, which
# only worked while pandas silently discarded the non-numeric columns.
# Passengers without a last name get NaN, because groupby ignores missing keys.
df_familly = all_data.groupby('last_name').size().to_frame('nb_in_familly')
all_data['nb_in_familly'] = all_data['last_name'].map(df_familly['nb_in_familly'])

# Numeric columns that the groupby(...).sum() aggregations in the following
# cells have to discard; the list is extended as new features are added.
col_to_drop = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck','group','nb_within_group','index']
col_to_drop.append('nb_in_familly')

# Show results: share of passengers per family size
fig = plt.figure(figsize=(10,5))
ax = sns.countplot(data=all_data, x='nb_in_familly',color='green')
for p in ax.patches:
    ax.annotate("{}%".format(int(p.get_height()*100/len(all_data))), (p.get_x() + p.get_width() / 2., p.get_height()), 
               ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
plt.title('Dictribution of number of familly')
plt.show()

## <a id="2.3"></a>
## 2.3: Number of kids in the familly

In [None]:
# Number of babies, kids and teenagers in each passenger's family
# (family = passengers sharing the same last name).
age = all_data['Age']
age_bands = {
    'nb_baby': age < 3,
    'nb_kid': (age > 2) & (age < 13),
    # teenagers are 13 to 18; the band must start above 12, otherwise it
    # overlaps the two younger bands and excludes every actual teenager
    'nb_teenager': (age > 12) & (age < 19),
}
for counter_col, in_band in age_bands.items():
    # members of this age band per last name (missing last names are ignored)
    per_family = all_data.loc[in_band, 'last_name'].value_counts()
    # families without such a member, and passengers without a name, get 0
    all_data[counter_col] = all_data['last_name'].map(per_family).fillna(0)

In [None]:
# Boolean flags: does the passenger's family include a baby / kid / teenager,
# and does it include a child of any of those three kinds?
all_data['fam_with_baby'] = all_data['nb_baby'] > 0
all_data['fam_with_kid'] = all_data['nb_kid'] > 0
all_data['fam_with_teenager'] = all_data['nb_teenager'] > 0
all_data['fam_with_child'] = all_data[['fam_with_baby', 'fam_with_kid', 'fam_with_teenager']].any(axis=1)

# The new columns are numeric/boolean, so the groupby sums below must discard them too.
col_to_drop.extend(['fam_with_baby','fam_with_kid','fam_with_teenager','fam_with_child','nb_baby','nb_kid','nb_teenager'])

In [None]:
# One pie chart per family flag: share of passengers whose family includes a
# baby, a kid, a teenager, or any child.
fig, axes = plt.subplots(1, 4, figsize=(15, 4))
fig.suptitle("Distribution familly travelling with:baby, kids, teenager, or just child':", fontsize=18, y=0.95)
flag_columns = ['fam_with_baby', 'fam_with_kid', 'fam_with_teenager', 'fam_with_child']
for flag_col, pie_ax in zip(flag_columns, axes.flatten()):
    shares = all_data[flag_col].value_counts(normalize=True)
    shares.plot(kind='pie', autopct="%.1f", ax=pie_ax)
plt.show()

## <a id="2.4"></a>
**2.4: Number of passenger per group**

In [None]:
# 'group' is used as the groupby key in the next cells and therefore becomes
# the index of the aggregated frame, so it can no longer be dropped as a column.
# Guarded so that re-running this cell does not raise ValueError.
if 'group' in col_to_drop:
    col_to_drop.remove('group')

In [None]:
# Number of passengers travelling in each passenger's group.
# The group sizes are counted directly instead of summing a helper column of
# ones over the whole frame, which relied on pandas silently discarding the
# non-numeric columns.
group_sizes = all_data['group'].value_counts()
all_data['nb_passenger_in_grp'] = all_data['group'].map(group_sizes)

# Show results: share of passengers per group size
ax = sns.countplot(data=all_data, x='nb_passenger_in_grp')
for p in ax.patches:
    ax.annotate("{}%".format(int(p.get_height()*100/len(all_data))), (p.get_x() + p.get_width() / 2., p.get_height()), 
               ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
plt.show()

## <a id="2.5"></a>
**2.5: Occurrence of same first name**

In [None]:
# (Re)compute the number of passengers per group so that 'nb_passenger_in_grp'
# is present before the first-name aggregation below. The sizes are counted
# directly rather than by summing a column of ones over every column, which
# relied on pandas silently discarding the non-numeric columns.
group_sizes = all_data['group'].value_counts()
all_data['nb_passenger_in_grp'] = all_data['group'].map(group_sizes)

In [None]:
# How many passengers share each first name.
# A helper column of ones is summed per first name. numeric_only=True makes the
# sum skip text columns explicitly instead of relying on pandas discarding them
# silently (recent pandas versions no longer do).
all_data['occurence_FN'] = 1
df_group_FN = all_data.groupby(['first_name']).sum(numeric_only=True)
df_group_FN = df_group_FN.drop(col_to_drop,axis=1)
all_data = all_data.drop(['occurence_FN'],axis=1)
# Merge the counts back. Besides 'occurence_FN', the aggregated frame still
# holds the per-first-name sums of 'group' and 'nb_passenger_in_grp'; they come
# in as 'group_r' and 'nb_passenger_in_grp_r' and are removed by the clean-up
# cell at the end of section 2.
all_data = all_data.join(df_group_FN, on="first_name",rsuffix="_r")

# Show results
fig = plt.figure(figsize=(10,4))
ax = sns.countplot(data = all_data, x='occurence_FN',color='purple')
ax.bar_label(ax.containers[0])
plt.title('Dictribution of first name occurence')
plt.show()

## <a id="2.6"></a>
**2.6: Cabin number**

In [None]:
# Check that every cabin code has the same structure, then split it on '/'
# into three parts and record the code lengths.
separator = '-' * 30
slash_counts = all_data["Cabin"].str.count("/")
print(separator)
print("Number of'/' in cabin name: max {} and min {}.".format(slash_counts.max(), slash_counts.min()))

# the three parts of the cabin code
cabin_parts = all_data['Cabin'].str.split('/', expand=True)
all_data[['cab1','cab2','cab3']] = cabin_parts

# length of the full code and of its middle part
all_data["len_Cabin"] = all_data["Cabin"].str.len()
print(separator)
print("The code for the cabin has between {} and {} characters".format(all_data["len_Cabin"].min(), all_data["len_Cabin"].max()))
all_data["len_cab2"] = all_data["cab2"].str.len()
print(separator)
print("The code for the cab2 has between {} and {} characters".format(all_data["len_cab2"].min(), all_data["len_cab2"].max()))

# distinct values found in each part
print(separator)
print('Characters in first part:', all_data['cab1'].unique())
print('Characters in second part:', all_data['cab2'].unique())
print('Characters in third part:', all_data['cab3'].unique())

In [None]:
# One pie chart per feature derived from 'Cabin'.
fig, axes = plt.subplots(1, 4, figsize=(15, 5))
fig.suptitle("Distribution categorical vaiables obtained from 'Cabin':", fontsize=18, y=0.95)
cabin_features = ['cab1', 'len_Cabin', 'len_cab2', 'cab3']
for feature, pie_ax in zip(cabin_features, axes.flatten()):
    shares = all_data[feature].value_counts(normalize=True)
    shares.plot(kind='pie', autopct="%.1f", ax=pie_ax)
plt.show()


`len_Cabin` and `len_cab2` carry the same information, so we can drop `len_Cabin`.

## <a id="2.7"></a>
**2.7: Number of option**

In [None]:
# One boolean column per option: True when the passenger spent a positive
# amount on it, False otherwise (including when the amount is missing).
for option in ('RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'):
    all_data['Have_' + option] = all_data[option] > 0

In [None]:
# Number of options per passenger = number of True values across the Have_* flags.
lis_col_with_option = ['Have_' + option for option in ('RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck')]
all_data['nb_option'] = all_data[lis_col_with_option].sum(axis=1)

# share of passengers for each number of options
option_shares = all_data['nb_option'].value_counts(normalize=True)
option_shares.plot(kind='pie', autopct="%.1f")
plt.show()

- 42% of the passengers have no option, while at the other extreme 2.8% have all 5 options.

In [None]:
# Remove the raw text columns that have been decomposed above and the
# by-product '_r' columns left over from the first-name aggregation.
obsolete_columns = ['len_Cabin', 'cab2', 'Cabin', 'Name', 'group_r', 'nb_passenger_in_grp_r']
all_data = all_data.drop(columns=obsolete_columns)

## <a id="3"></a>
<div style="
           border-radius:50px;
           background-color:#7ca4cd;
           font-size:200%;
           font-family:Arial;
           letter-spacing:0.10px">
<p style="padding: 10px;
          color:white;
          text-align:center;">3: Research of dependency between variables
</p>
</div>

In [None]:
# Lists of variables that are plotted against each other in this section.
list_col_numeric = all_data.select_dtypes(include=[np.number]).columns.tolist()
list_col_category = ['HomePlanet', 'Destination', 'cab1','cab3']
list_col_binary = [
    'VIP', 'CryoSleep',
    'fam_with_baby', 'fam_with_kid', 'fam_with_teenager', 'fam_with_child',
    'Have_RoomService', 'Have_FoodCourt', 'Have_ShoppingMall', 'Have_Spa', 'Have_VRDeck',
]
print('There are {} numerical variables'.format(len(list_col_numeric)))
print('There are {} object variables'.format(len(list_col_category)))
print('There are {} binary variables'.format(len(list_col_binary)))

## <a id="3.1"></a>
## 3.1: Dependency and relation with: 'Age'
### 3.1.1: relation with categorical variables

In [None]:
# Stacked density of Age, split by each categorical variable (one panel each).
fig = plt.figure(figsize=(15,8))
fig.subplots_adjust(hspace=0.4, wspace=0.3)
plt.suptitle('Age vs categorical data')

for position, cat_col in enumerate(list_col_category[:4], start=1):
    panel = fig.add_subplot(2, 2, position)
    sns.kdeplot(data=all_data, x="Age", hue=cat_col, multiple="stack", ax=panel)
plt.show()

We can see that:
- proportionally, there are more children coming from Earth than from the other planets.
- the typical (central) age varies with cab1.

## <a id="3.1.2"></a>
### 3.1.2: relation with categorical and bool variables

In [None]:
# Age by HomePlanet, one box plot per binary variable (used as hue).
fig = plt.figure(figsize=(15,15))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
plt.suptitle('Age - HomePlaney and binary variables:',y=0.93)
x_variable = list_col_category[0]
for i, col in enumerate(list_col_binary[:12], start=1):
    ax = fig.add_subplot(3, 4, i)
    sns.boxplot(data=all_data, x=x_variable, y="Age", hue=col, ax=ax)
    ax.set_title('{}) {}'.format(i, col), fontsize=15)
    ax.set(xlabel=None)
    # keep the legend only on the last panel of each row, placed outside the axes
    if i in (4, 8, 12):
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    else:
        ax.get_legend().remove()
plt.show()

It seems that:
- in subplot 1, no passenger from Earth is in VIP, and no passenger below 18 or 25 (depending on the home planet) is in VIP.
- in subplots 7 to 11, the kids (the age limit varies with HomePlanet) don't have the options RoomService, FoodCourt, ShoppingMall, Spa, and VRDeck.

Let's check that!


In [None]:
# Disabled check: count the kids (younger than 13) that were NOT transported, per home planet.
# NOTE(review): this reads a frame named `df` with a 'Transported' column; in the
# cells shown here the label was split off into `target` before all_data was
# built and no `df` is defined, so it would need rewriting before being re-enabled.
# df_not_transported_kid = df[(df['Transported']==False)&(df['Age']<13)]
# df_not_transported_kid = df_not_transported_kid[['Transported','HomePlanet']]
# df_not_transported_kid['count']=1
# df_tk_result = df_not_transported_kid.groupby(['HomePlanet']).sum()
# df_tk_result

**There is no kid (below 13 years old) coming from Mars that has not been transported, and only 3 from Europa.**

In [None]:
# VIP passengers from Europa younger than 25 (an empty result means there are none).
from_europa = all_data['HomePlanet'] == 'Europa'
is_vip = all_data['VIP'] == True
under_25 = all_data['Age'] < 25
df_min_age = all_data[from_europa & is_vip & under_25]
df_min_age

**There is no passenger younger than 25 years old coming from Europa in VIP.**

In [None]:
# VIP passengers from Mars younger than 18 (an empty result means there are none).
from_mars = all_data['HomePlanet'] == 'Mars'
is_vip = all_data['VIP'] == True
under_18 = all_data['Age'] < 18
df_min_age = all_data[from_mars & is_vip & under_18]
df_min_age

**There is no passenger younger than 18 years old coming from Mars in VIP.**

In [None]:
# Passengers younger than 13 who spent something on at least one option
# (an empty result means that no kid has any of the five options).
# The spending columns hold amounts, not booleans, so "has the option" is
# tested with "> 0"; comparing an amount with True only matches the value 1.
spending_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
has_any_option = (all_data[spending_columns] > 0).any(axis=1)
df_min_age = all_data[has_any_option & (all_data['Age'] < 13)]
df_min_age

**There is no passenger younger than 13 years with Roomservice, Foodcourt, ShoppingMall, Spa, and VRDeck.**

In [None]:
# Age by Destination, one box plot per binary variable (used as hue).
fig = plt.figure(figsize=(15,15))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
plt.suptitle('Age - Destination and binary variables:',y=0.93)
x_variable = list_col_category[1]
for i, col in enumerate(list_col_binary[:12], start=1):
    ax = fig.add_subplot(3, 4, i)
    sns.boxplot(data=all_data, x=x_variable, y="Age", hue=col, ax=ax)
    ax.set_title('{}) {}'.format(i, col), fontsize=15)
    ax.set(xlabel=None)
    ax.tick_params(labelrotation=30)
    # keep the legend only on the last panel of each row, placed outside the axes
    if i in (4, 8, 12):
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    else:
        ax.get_legend().remove()
plt.show()

We can see that:
- in subplot 1, there are only 1 and 2 VIP passengers above 50 and 60 travelling toward 'PSO J318 5_22' and '55 Cancri e', respectively.
- in subplot 3, there is no passenger above 60 with a baby in their family travelling toward '55 Cancri e'.

In [None]:
# Age by first cabin letter (cab1), one box plot per binary variable (used as hue).
fig = plt.figure(figsize=(15,15))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
plt.suptitle('Age - letters in cab1 and binary variables:',y=0.93)
x_variable = list_col_category[2]
for i, col in enumerate(list_col_binary[:12], start=1):
    ax = fig.add_subplot(4, 3, i)
    sns.boxplot(data=all_data, x=x_variable, y="Age", hue=col, ax=ax)
    ax.set_title('{}) {}'.format(i, col), fontsize=15)
    ax.set(xlabel=None)
    # legend kept on panels 3, 6 and 9 only, placed outside the axes
    if i in (3, 6, 9):
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    else:
        ax.get_legend().remove()
plt.show()

We observe that:
- in subplot 1, passengers who have the letter 'G' or 'T' are not VIP members.
- in subplot 7 to 11, passengers who have a service (food, spa, etc,,,) have the letter 'T'.
- in subplot 7, passengers over 50 with a kid in their familly don't have the letter 'B'.
- in subplot 8 to 12, passengers in VIP younger than 18 have no option. 

In [None]:
# Age by last cabin part (cab3), one box plot per binary variable (used as hue).
fig = plt.figure(figsize=(15,15))
fig.subplots_adjust(hspace = 0.4, wspace=0.4)
# the x variable of this figure is cab3 (list_col_category[3]), not Destination
plt.suptitle('Age - cab3 and binary variables:',y=0.93)
for i,col in zip (range(1,13),list_col_binary):
    ax = fig.add_subplot(3,4,i)  
    ax = sns.boxplot(data=all_data, x=list_col_category[3], y="Age", hue=col)
    ax.set_title('{}) {}'.format(i,col),fontsize=15)
    ax.set(xlabel=None)
    ax.tick_params(labelrotation=30)
    # keep the legend only on the last panel of each row, placed outside the axes
    if (i== 4)|(i== 8)|(i==12):
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    else:
        ax.get_legend().remove()  
plt.show()

In subplot 1, passengers in VIP younger than 21 have the letter P.

## <a id="3.2"></a>
## 3.2: Dependency and relation with 'group'
### 3.2.1: relation with categorical variables

In [None]:
# Stacked density of the group number, split by each categorical variable.
fig = plt.figure(figsize=(15,8))
fig.subplots_adjust(hspace=0.4, wspace=0.3)
plt.suptitle('group vs categorical data')

for position, cat_col in enumerate(list_col_category[:4], start=1):
    panel = fig.add_subplot(2, 2, position)
    sns.kdeplot(data=all_data, x="group", hue=cat_col, multiple="stack", palette="Set2", ax=panel)
plt.show()


## <a id="3.2.2"></a>
### 3.2.2: relation with categorical and binary variables

In [None]:
# Group number by HomePlanet, one box plot per binary variable (used as hue).
fig = plt.figure(figsize=(15,15))
fig.subplots_adjust(hspace = 0.4, wspace=0.4)
# the y variable of this figure is 'group', not 'Age'
plt.suptitle('Group - HomePlanet and binary variables:',y=0.93)
for i,col in zip (range(1,13),list_col_binary):
    ax = fig.add_subplot(3,4,i)
    ax = sns.boxplot(data=all_data, x=list_col_category[0], y="group", hue=col,palette="Set2")
    ax.set_title('{}) {}'.format(i,col),fontsize=15)
    ax.set(xlabel=None)
    # keep the legend only on the last panel of each row, placed outside the axes
    if (i== 4)|(i== 8)|(i==12):
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    else:
        ax.get_legend().remove()  
plt.show()

In [None]:
# Group number by Destination, one box plot per binary variable (used as hue).
fig = plt.figure(figsize=(15,15))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
plt.suptitle('Group - Destination and binary variables:',y=0.93)
x_variable = list_col_category[1]
for i, col in enumerate(list_col_binary[:12], start=1):
    ax = fig.add_subplot(3, 4, i)
    sns.boxplot(data=all_data, x=x_variable, y="group", hue=col, palette="Set2", ax=ax)
    ax.set_title('{}) {}'.format(i, col), fontsize=15)
    ax.set(xlabel=None)
    ax.tick_params(labelrotation=30)
    # keep the legend only on the last panel of each row, placed outside the axes
    if i in (4, 8, 12):
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    else:
        ax.get_legend().remove()
plt.show()

- Nothing special here

In [None]:
# Group number by first cabin letter (cab1), one box plot per binary variable (used as hue).
fig = plt.figure(figsize=(15,15))
fig.subplots_adjust(hspace = 0.4, wspace=0.4)
# the x variable of this figure is cab1 (list_col_category[2]), not Destination
plt.suptitle('Group - letters in cab1 and binary variables:',y=0.93)
for i,col in zip (range(1,13),list_col_binary):
    ax = fig.add_subplot(4,3,i)
    ax = sns.boxplot(data=all_data, x=list_col_category[2], y="group", hue=col,palette="Set2")
    ax.set_title('{}) {}'.format(i,col),fontsize=15)
    ax.set(xlabel=None)
    # legend kept on panels 3, 6 and 9 only, placed outside the axes
    if (i== 3)|(i== 6)|(i==9):
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    else:
        ax.get_legend().remove()  
plt.show()

We observe that:
- in subplots 4 to 7, passengers with kids and with the letter 'B' don't have a group number higher than 8000.
- Passengers with an option have the letter 'T'.

In [None]:
# Group number by last cabin part (cab3), one box plot per binary variable (used as hue).
fig = plt.figure(figsize=(15,15))
fig.subplots_adjust(hspace = 0.4, wspace=0.4)
# the x variable of this figure is cab3 (list_col_category[3]), not Destination
plt.suptitle('Group - cab3 and binary variables:',y=0.93)
for i,col in zip (range(1,13),list_col_binary):
    ax = fig.add_subplot(3,4,i) 
    ax = sns.boxplot(data=all_data, x=list_col_category[3], y="group", hue=col,palette="Set2")
    ax.set_title('{}) {}'.format(i,col),fontsize=15)
    ax.set(xlabel=None)
    # keep the legend only on the last panel of each row, placed outside the axes
    if (i== 4)|(i== 8)|(i==12):
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    else:
        ax.get_legend().remove()  
plt.show()

## <a id="3.3"></a>
## 3.3: Dependency and relation with: "Number within group"
### 3.3.1: relation with categorical variables

In [None]:
# Number of passengers for each number-within-group, split by each categorical variable.
fig = plt.figure(figsize=(15,8))
fig.subplots_adjust(hspace = 0.4, wspace=0.3)
# the plotted variable is 'nb_within_group', not 'group'
plt.suptitle('nb_within_group vs categorical data')

for i,col in zip (range(1,5),list_col_category):
    ax = fig.add_subplot(2,2,i)
    ax = sns.countplot(data=all_data, x="nb_within_group", hue=col, palette="Accent")  
plt.show()

It seems that:
- cab 3 is distributed 50/50 between groups.

Let's check that.

In [None]:
# Running tallies of the P and S counts, one entry per number-within-group.
list_P = []
list_S = []
def count_value(nb):
    """Print the cab3 value counts for one number-within-group and record them.

    Parameters
    ----------
    nb : number
        Value of 'nb_within_group' to select in all_data.

    Side effects
    ------------
    Appends the number of 'P' and 'S' values among the selected passengers to
    the module-level lists list_P and list_S (0 when a value does not occur).
    """
    counts = all_data.loc[all_data['nb_within_group'] == nb, 'cab3'].value_counts()
    print(counts)
    # .get(..., 0) instead of attribute access: a subset without any 'P' (or 'S')
    # must contribute 0 rather than raise AttributeError
    list_P.append(counts.get('P', 0))
    list_S.append(counts.get('S', 0))
    print('-'*30)

# Loop over every number that actually occurs, so that no value is skipped
# (a fixed range(1, 8) stops at 7 and leaves out number 8).
for i in sorted(all_data['nb_within_group'].dropna().unique()):
    count_value(i)
    
print('Total P:', sum(list_P))
print('Total S:', sum(list_S))

Yes, P and S seem to be equally distributed for each 'nb_within_group' value, if we take the missing values into account.

## <a id="3.3.2"></a>
### 3.3.2: relation with categorical and binary variables

In [None]:
# Box plots of "nb_within_group" against list_col_category[0], one panel per binary variable (hue).
fig = plt.figure(figsize=(15,15))
fig.subplots_adjust(hspace = 0.4, wspace=0.4)
# The title is built from the plotted column so it always matches the x axis
# (it used to read 'Destination' although list_col_category[0] is plotted).
plt.suptitle('nb within group - {} and binary variables:'.format(list_col_category[0]),y=0.93)
for i,col in zip (range(1,13),list_col_binary):
    ax = fig.add_subplot(3,4,i)
    ax = sns.boxplot(data=all_data, x=list_col_category[0], y="nb_within_group", hue=col,palette='Accent')
    ax.set_title('{}) {}'.format(i,col),fontsize=15)
    ax.set(xlabel=None)
    ax.tick_params(labelrotation=30)
    # 3x4 grid: only the right-hand column (panels 4, 8, 12) keeps its legend.
    if (i== 4)|(i== 8)|(i==12):
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    else:
        ax.get_legend().remove()
plt.show()

We can see that:
- only passengers from Earth may have the number 8. 
- Numbers 1 and 2 are the most common 
- In subplots 8 to 12, number 2 and 3 become more common for the passengers from Earth without any option
- In subplots 4 to 7, number 2 and 3 become more common for the passengers from Europa travelling with a familly.
- In subplot 12, passengers from Mars having the option VRDeck have mostly the number 1. 


In [None]:
# "nb_within_group" box plots with list_col_category[1] on the x axis;
# each of the (up to) 12 panels uses a different binary variable as hue.
fig = plt.figure(figsize=(15, 15))
fig.subplots_adjust(wspace=0.4, hspace=0.4)
plt.suptitle('nb within group - Destination and binary variables:', y=0.93)
for panel, binary_col in zip(range(1, 13), list_col_binary):
    ax = fig.add_subplot(3, 4, panel)
    sns.boxplot(
        data=all_data,
        x=list_col_category[1],
        y="nb_within_group",
        hue=binary_col,
        palette='Accent',
        ax=ax,
    )
    ax.set_title(f'{panel}) {binary_col}', fontsize=15)
    ax.set_xlabel(None)
    ax.tick_params(labelrotation=30)
    # Legend is kept on the last panel of each row (4, 8, 12) and dropped elsewhere.
    if panel % 4 == 0:
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    else:
        ax.get_legend().remove()
plt.show()

We can see that:
- In subplot 1, passengers not transported and in destination of Trappist-1e have mostly number 1
- In subplots 1 to 12, passengers whose destination is PSO J318.5-22 mostly have number 1.
- In subplots 4 to 6, passengers with kids have higher numbers (2 to 3) whatever the destination (especially if they have kids).
- In subplot 12, passengers from Mars having the option VRDeck have mostly the number 1. 


In [None]:
# Box plots of "nb_within_group" against list_col_category[2], one panel per binary variable (hue).
fig = plt.figure(figsize=(15,15))
fig.subplots_adjust(hspace = 0.4, wspace=0.4)
# The title is built from the plotted column so it always matches the x axis
# (it used to read 'Destination' although list_col_category[2] is plotted).
plt.suptitle('nb within group - {} and binary variables:'.format(list_col_category[2]),y=0.93)
for i,col in zip (range(1,13),list_col_binary):
    ax = fig.add_subplot(3,4,i)
    ax = sns.boxplot(data=all_data, x=list_col_category[2], y="nb_within_group", hue=col,palette='Accent')
    ax.set_title('{}) {}'.format(i,col),fontsize=15)
    ax.set(xlabel=None)
    # 3x4 grid: only the right-hand column (panels 4, 8, 12) keeps its legend.
    if (i== 4)|(i== 8)|(i==12):
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    else:
        ax.get_legend().remove()
plt.show()

It seems that large famillies:
- In subplot 4 to 7, passengers with kids and with the letters 'B', 'A', 'G', 'C' have larger numbers.
- passengers with the letter 'F' have mostly the number 1. However, if Passengers are Transported, Cryosleep, Have kids, don't have FoodCourt and Spa their number will be higher. 
- passengers with the letter 'E' have mostly the number 1. However, if Passengers are in VIP, Cryosleep, or have kids or no options their number will be higher. 
- passengers with the letter 'T', VIP, cryosleep and kids don't have a number.
- Passenegers with the letter 'T', but without FoodCourt, Spa and VRDeck don't have a number.
- passenger with letter G or T are not in VIP

In [None]:
# Box plots of "nb_within_group" against list_col_category[3], one panel per binary variable (hue).
fig = plt.figure(figsize=(15,15))
fig.subplots_adjust(hspace = 0.4, wspace=0.4)
# The title is built from the plotted column so it always matches the x axis
# (it used to read 'Destination' although list_col_category[3] is plotted).
plt.suptitle('nb within group - {} and binary variables:'.format(list_col_category[3]),y=0.93)
for i,col in zip (range(1,13),list_col_binary):
    ax = fig.add_subplot(3,4,i)
    ax = sns.boxplot(data=all_data, x=list_col_category[3], y="nb_within_group", hue=col,palette='Accent')
    ax.set_title('{}) {}'.format(i,col),fontsize=15)
    ax.set(xlabel=None)
    ax.tick_params(labelrotation=30)
    # 3x4 grid: only the right-hand column (panels 4, 8, 12) keeps its legend.
    if (i== 4)|(i== 8)|(i==12):
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    else:
        ax.get_legend().remove()
plt.show()

It seems that:
- In subplots 1 and 7, passengers who were transported and/or have kids have higher numbers.
- In subplots 8 and 11, passengers with an option (except VRDeck) have higher numbers.

## <a id="3.4"></a>
## 3.4: Dependency and relation with: "Occurence_First_Name"
### 3.4.1: relation with categorical variables

In [None]:
# Count plots of "occurence_FN", one 2x2 panel per categorical variable used as hue.
fig = plt.figure(figsize=(15, 8))
fig.subplots_adjust(wspace=0.3, hspace=0.4)

for panel, category_col in zip(range(1, 5), list_col_category):
    plt.suptitle('Occurence first name vs categorical data')
    ax = fig.add_subplot(2, 2, panel)
    sns.countplot(
        data=all_data,
        x="occurence_FN",
        hue=category_col,
        palette="Dark2",
        ax=ax,
    )
plt.show()

- Based on a passenger's first name and how often it occurs, it might be possible to determine their HomePlanet: all passengers with a name occurrence above 7 are from Earth.

## <a id="3.4.2"></a>
### 3.4.2: relation with categorical and binary variables

In [None]:
# Box plots of "occurence_FN" against list_col_category[0], one panel per binary variable (hue).
fig = plt.figure(figsize=(15,15))
fig.subplots_adjust(hspace = 0.4, wspace=0.4)
# The title is built from the plotted column so it always matches the x axis
# (it used to read 'Destination' although list_col_category[0] is plotted).
plt.suptitle('Occurence first name - {} and binary variables:'.format(list_col_category[0]),y=0.93)
for i,col in zip (range(1,13),list_col_binary):
    ax = fig.add_subplot(3,4,i)
    ax = sns.boxplot(data=all_data, x=list_col_category[0], y="occurence_FN", hue=col,palette='Dark2')
    ax.set_title('{}) {}'.format(i,col),fontsize=15)
    ax.set(xlabel=None)
    ax.tick_params(labelrotation=30)
    # 3x4 grid: only the right-hand column (panels 4, 8, 12) keeps its legend.
    if (i== 4)|(i== 8)|(i==12):
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    else:
        ax.get_legend().remove()
plt.show()

Nothing special to see here...

In [None]:
# "occurence_FN" box plots with list_col_category[1] on the x axis;
# each of the (up to) 12 panels uses a different binary variable as hue.
fig = plt.figure(figsize=(15, 15))
fig.subplots_adjust(wspace=0.4, hspace=0.4)
plt.suptitle('Occurence first name - Destination and binary variables:', y=0.93)
for panel, binary_col in zip(range(1, 13), list_col_binary):
    ax = fig.add_subplot(3, 4, panel)
    sns.boxplot(
        data=all_data,
        x=list_col_category[1],
        y="occurence_FN",
        hue=binary_col,
        palette='Dark2',
        ax=ax,
    )
    ax.set_title(f'{panel}) {binary_col}', fontsize=15)
    ax.set_xlabel(None)
    ax.tick_params(labelrotation=30)
    # Legend is kept on the last panel of each row (4, 8, 12) and dropped elsewhere.
    if panel % 4 == 0:
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    else:
        ax.get_legend().remove()
plt.show()

Passenger with a common name do not travel in VIP.

In [None]:
# Box plots of "occurence_FN" against list_col_category[2], one panel per binary variable (hue).
fig = plt.figure(figsize=(15,15))
fig.subplots_adjust(hspace = 0.4, wspace=0.4)
# The title is built from the plotted column so it always matches the x axis
# (it used to read 'Destination' although list_col_category[2] is plotted).
plt.suptitle('Occurence first name - {} and binary variables:'.format(list_col_category[2]),y=0.93)
for i,col in zip (range(1,13),list_col_binary):
    ax = fig.add_subplot(3,4,i)
    ax = sns.boxplot(data=all_data, x=list_col_category[2], y="occurence_FN", hue=col,palette='Dark2')
    ax.set_title('{}) {}'.format(i,col),fontsize=15)
    ax.set(xlabel=None)
    ax.tick_params(labelrotation=30)
    # 3x4 grid: only the right-hand column (panels 4, 8, 12) keeps its legend.
    if (i== 4)|(i== 8)|(i==12):
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    else:
        ax.get_legend().remove()
plt.show()

We can see that:
- the letters 'B', 'A', 'D', 'C' and 'T' in cab1 are not given to passengers with a very common first name (occurrence > 7).
- and so, passengers with a common name (occurrence > 7) received the letters 'F', 'G' and 'E'.


In [None]:
# Box plots of "occurence_FN" against list_col_category[3], one panel per binary variable (hue).
fig = plt.figure(figsize=(15,15))
fig.subplots_adjust(hspace = 0.4, wspace=0.4)
# The title is built from the plotted column so it always matches the x axis
# (it used to read 'Destination' although list_col_category[3] is plotted).
plt.suptitle('Occurence first name - {} and binary variables:'.format(list_col_category[3]),y=0.93)
for i,col in zip (range(1,13),list_col_binary):
    ax = fig.add_subplot(3,4,i)
    ax = sns.boxplot(data=all_data, x=list_col_category[3], y="occurence_FN", hue=col,palette='Dark2')
    ax.set_title('{}) {}'.format(i,col),fontsize=15)
    ax.set(xlabel=None)
    ax.tick_params(labelrotation=30)
    # 3x4 grid: only the right-hand column (panels 4, 8, 12) keeps its legend.
    if (i== 4)|(i== 8)|(i==12):
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    else:
        ax.get_legend().remove()
plt.show()

Nothing special to see here...

## <a id="3.5"></a>
## 3.5: Dependency and relation with: "Len_cab2"
### 3.5.1: relation with categorical variables

In [None]:
# Count plots of "len_cab2", one 2x2 panel per categorical variable (hue), with the
# count written on top of every bar.
fig = plt.figure(figsize=(15,8))
fig.subplots_adjust(hspace = 0.4, wspace=0.3)
plt.suptitle('Len_cab2 vs categorical data')

for i,col in zip (range(1,5),list_col_category):
    ax = fig.add_subplot(2,2,i)
    ax = sns.countplot(data=all_data, x="len_cab2", hue=col, palette="cividis")
    # Label whatever bar containers seaborn created instead of hard-coding how many
    # hue levels each variable has (3/3/8/2): a fixed count raises IndexError when a
    # level is missing and silently leaves bars unlabeled when there are more.
    for container in ax.containers:
        ax.bar_label(container)

plt.show()

- No passenger with len_cab2 equal to 4 comes from Europa.
- When len_cab2 is equal to or higher than 2, the letter 'T' is not present in cab1.
- When len_cab2 is equal to 4, only the letters 'F' and 'G' are present in cab1.

## <a id="3.5.2"></a>
### 3.5.2: relation with categorical and binary variables

In [None]:
# "len_cab2" box plots with list_col_category[0] on the x axis;
# each of the (up to) 12 panels uses a different binary variable as hue.
fig = plt.figure(figsize=(15, 15))
fig.subplots_adjust(wspace=0.4, hspace=0.4)
plt.suptitle('Len_cab2 - HomePlanet and binary variables:', y=0.93)
for panel, binary_col in zip(range(1, 13), list_col_binary):
    ax = fig.add_subplot(3, 4, panel)
    sns.boxplot(
        data=all_data,
        x=list_col_category[0],
        y="len_cab2",
        hue=binary_col,
        palette='cividis',
        ax=ax,
    )
    ax.set_title(f'{panel}) {binary_col}', fontsize=15)
    ax.set_xlabel(None)
    # Legend is kept on the last panel of each row (4, 8, 12) and dropped elsewhere.
    if panel % 4 == 0:
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    else:
        ax.get_legend().remove()
plt.show()

We observe that:
- len_cab2 is larger for passenger coming from Earth and Mars.
- in subplot 2, the very large majority of passenger coming from Mars and with a len_cab2 of 3 have not been transported.
- in subplot 3, the very large majority of passenger coming from Mars and with a len_cab2 of 3 are in CryoSleep.
- in subplot 8 to 12, the very large majority of passenger coming from Mars and with a len_cab2 of 3 have at least one option.


In [None]:
# "len_cab2" box plots with list_col_category[1] on the x axis;
# each of the (up to) 12 panels uses a different binary variable as hue.
fig = plt.figure(figsize=(15, 15))
fig.subplots_adjust(wspace=0.4, hspace=0.4)
plt.suptitle('Len-cab2 - Destination and binary variables:', y=0.93)
for panel, binary_col in zip(range(1, 13), list_col_binary):
    ax = fig.add_subplot(3, 4, panel)
    sns.boxplot(
        data=all_data,
        x=list_col_category[1],
        y="len_cab2",
        hue=binary_col,
        palette='cividis',
        ax=ax,
    )
    ax.set_title(f'{panel}) {binary_col}', fontsize=15)
    ax.set_xlabel(None)
    ax.tick_params(labelrotation=30)
    # Legend is kept on the last panel of each row (4, 8, 12) and dropped elsewhere.
    if panel % 4 == 0:
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    else:
        ax.get_legend().remove()
plt.show()

We can see that:
- in subplot 1, the very large majority of passengers whose destination is '55 Cancri e' and who have a len_cab2 of 3 have not been transported.

In [None]:
# "len_cab2" box plots with list_col_category[2] on the x axis;
# each of the (up to) 12 panels uses a different binary variable as hue.
fig = plt.figure(figsize=(15, 15))
fig.subplots_adjust(wspace=0.4, hspace=0.4)
plt.suptitle('Len-cab2 - cab1 and binary variables:', y=0.93)
for panel, binary_col in zip(range(1, 13), list_col_binary):
    ax = fig.add_subplot(3, 4, panel)
    sns.boxplot(
        data=all_data,
        x=list_col_category[2],
        y="len_cab2",
        hue=binary_col,
        palette='cividis',
        ax=ax,
    )
    ax.set_title(f'{panel}) {binary_col}', fontsize=15)
    ax.set_xlabel(None)
    # Legend is kept on the last panel of each row (4, 8, 12) and dropped elsewhere.
    if panel % 4 == 0:
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    else:
        ax.get_legend().remove()
plt.show()

## <a id="3.6"></a>
## 3.6: Dependency and relation with: 'Number of option'
### 3.6.1: relation with categorical variables

In [None]:
# Count plots of "nb_option", one 2x2 panel per categorical variable used as hue.
fig = plt.figure(figsize=(15, 8))
fig.subplots_adjust(wspace=0.3, hspace=0.4)

for panel, category_col in zip(range(1, 5), list_col_category):
    plt.suptitle('Number of option vs categorical data')
    ax = fig.add_subplot(2, 2, panel)
    sns.countplot(
        data=all_data,
        x="nb_option",
        hue=category_col,
        palette="viridis",
        ax=ax,
    )

plt.show()

We can see that in general:
- passengers with one option only come from Earth and goes to TRAPPIST-1e.
- passengers with 5 option don't go PSOJ318-22.

## <a id="3.6.2"></a>
### 3.6.2: relation with categorical and binary variables

In [None]:
# Box plots of "nb_option" against list_col_category[0], one panel per binary variable (hue).
fig = plt.figure(figsize=(15,15))
fig.subplots_adjust(hspace = 0.4, wspace=0.4)
# The title is built from the plotted column, which also removes the 'HomePlaney' typo.
plt.suptitle('number of option - {} and binary variables:'.format(list_col_category[0]),y=0.93)
for i,col in zip (range(1,13),list_col_binary):
    ax = fig.add_subplot(3,4,i)
    ax = sns.boxplot(data=all_data, x=list_col_category[0], y="nb_option", hue=col,palette="viridis")
    ax.set_title('{}) {}'.format(i,col),fontsize=15)
    ax.set(xlabel=None)
    # 3x4 grid: only the right-hand column (panels 4, 8, 12) keeps its legend.
    if (i== 4)|(i== 8)|(i==12):
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    else:
        ax.get_legend().remove()
plt.show()

We can see that:
- in subplot 1, there are few passengers from Mars with options who have been transported.
- in subplot 2, in general passengers in VIP have more options than other passengers.
- in subplot 2, passengers with 1 option and in VIP come mostly from Mars.
- in subplot 3, passengers in Cryosleep don't have any option.
- in subplots 8 and 10, passengers from Mars who don't have RoomService or ShoppingMall usually have 0 options.
- in subplots 9 and 11, passengers from Europa who don't have FoodCourt or Spa or VRDeck usually have 0 options.

In [None]:
# "nb_option" box plots with list_col_category[1] on the x axis;
# each of the (up to) 12 panels uses a different binary variable as hue.
fig = plt.figure(figsize=(15, 15))
fig.subplots_adjust(wspace=0.4, hspace=0.4)
plt.suptitle('number of option - Destination and binary variables:', y=0.93)
for panel, binary_col in zip(range(1, 13), list_col_binary):
    ax = fig.add_subplot(3, 4, panel)
    sns.boxplot(
        data=all_data,
        x=list_col_category[1],
        y="nb_option",
        hue=binary_col,
        palette="viridis",
        ax=ax,
    )
    ax.set_title(f'{panel}) {binary_col}', fontsize=15)
    ax.set_xlabel(None)
    ax.tick_params(labelrotation=30)
    # Legend is kept on the last panel of each row (4, 8, 12) and dropped elsewhere.
    if panel % 4 == 0:
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    else:
        ax.get_legend().remove()
plt.show()

We can see that the very large majority of passengers going to:
- 'TRAPPIST-1e' have between 1 and 4 options (subplot 1).
- 'PSO J318-22' and '55-Cancri-e' have more than 1 option (subplot 2).
- 'PSO J318-22' have between 1 and 4 options (subplot 3).
- '55-Cancri-e' and who don't have the option Spa don't have any other options (subplot 9).
- 'PSO J318-22' and who don't have the option ShoppingMall don't have more than two options (subplot 10).

and in general 
- in subplot 8 to 12, the very large majority of passengers having an option have at least another one.

In [None]:
# Box plots of "nb_option" against list_col_category[2], one panel per binary variable (hue).
fig = plt.figure(figsize=(15,15))
fig.subplots_adjust(hspace = 0.4, wspace=0.4)
# The title is built from the plotted column so it always matches the x axis
# (it used to read 'Destination' although list_col_category[2] is plotted).
plt.suptitle('number of option - {} and binary variables:'.format(list_col_category[2]),y=0.93)
for i,col in zip (range(1,13),list_col_binary):
    ax = fig.add_subplot(3,4,i)
    ax = sns.boxplot(data=all_data, x=list_col_category[2], y="nb_option", hue=col,palette="viridis")
    ax.set_title('{}) {}'.format(i,col),fontsize=15)
    ax.set(xlabel=None)
    # 3x4 grid: only the right-hand column (panels 4, 8, 12) keeps its legend.
    if (i== 4)|(i== 8)|(i==12):
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    else:
        ax.get_legend().remove()
plt.show()

- in subplot 1, there are very few passenger transported with the letter B, G and T.
- in subplot 2, passenger in VIP with the letter F have at least 2 options.
- in subplot 2, the very large majority of passenger with the letter A and  have at least 3 options
- in subplot 7, familly with kids having the letter G have no option.
- in subplot 8, passengers with the letter G who don't have RoomService have no option.
- in subplot 8, passengers with the letter C who have RoomService have 4 options.
- in subplot 9, passengers with the letter B, A, G, C,T who don't have FoodCourt have zero option.
- in subplot 10, passengers with the letter G who don't have ShoppingMall have no option.
- in subplot 10, passenger with the letter C who have ShoppingMall  have four options.
- in subplot 10, passenger with the letter T who have ShoppingMall  have four options.
- in subplot 11  passengers with the letter B, A, G, C who don't have Spa have zero option.
- in subplot 12, passengers with the letter B, A, G, C who don't have VRDeck have zero option.

In [None]:
# Box plots of "nb_option" against list_col_category[3], one panel per binary variable (hue).
fig = plt.figure(figsize=(15,15))
fig.subplots_adjust(hspace = 0.4, wspace=0.4)
# The title is built from the plotted column so it always matches the x axis
# (it used to read 'Destination' although list_col_category[3] is plotted).
plt.suptitle('number of option - {} and binary variables:'.format(list_col_category[3]),y=0.93)
for i,col in zip (range(1,13),list_col_binary):
    ax = fig.add_subplot(3,4,i)
    ax = sns.boxplot(data=all_data, x=list_col_category[3], y="nb_option", hue=col,palette="viridis")
    ax.set_title('{}) {}'.format(i,col),fontsize=15)
    ax.set(xlabel=None)
    ax.tick_params(labelrotation=30)
    # 3x4 grid: only the right-hand column (panels 4, 8, 12) keeps its legend.
    if (i== 4)|(i== 8)|(i==12):
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    else:
        ax.get_legend().remove()
plt.show()

- There is nothing special to see here...

## <a id="3.7"></a>
## 3.7: Dependency and relation with: 'Number_in_familly'
### 3.7.1: relation with categorical variables

In [None]:
# Count plots of "nb_in_familly", one 2x2 panel per categorical variable used as hue.
fig = plt.figure(figsize=(15, 8))
fig.subplots_adjust(wspace=0.3, hspace=0.4)

for panel, category_col in zip(range(1, 5), list_col_category):
    plt.suptitle('nb_in_familly vs categorical data')
    ax = fig.add_subplot(2, 2, panel)
    sns.countplot(
        data=all_data,
        x="nb_in_familly",
        hue=category_col,
        palette="cubehelix",
        ax=ax,
    )
plt.show()

## <a id="3.7.2"></a>
### 3.7.2: relation with categorical and binary variables

In [None]:
# "nb_in_familly" box plots with list_col_category[0] on the x axis;
# each of the (up to) 12 panels uses a different binary variable as hue.
fig = plt.figure(figsize=(15, 15))
fig.subplots_adjust(wspace=0.4, hspace=0.4)
plt.suptitle('number in familly - HomePlanet and binary variables:', y=0.93)
for panel, binary_col in zip(range(1, 13), list_col_binary):
    ax = fig.add_subplot(3, 4, panel)
    sns.boxplot(
        data=all_data,
        x=list_col_category[0],
        y="nb_in_familly",
        hue=binary_col,
        palette="cubehelix",
        ax=ax,
    )
    ax.set_title(f'{panel}) {binary_col}', fontsize=15)
    ax.set_xlabel(None)
    # Legend is kept on the last panel of each row (4, 8, 12) and dropped elsewhere.
    if panel % 4 == 0:
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    else:
        ax.get_legend().remove()
plt.show()

We can see that:
- in subplot 4 to 7, 'number_in_familly' depend mostly on the Homeplanet and if child are present in the familly or not.

In [None]:
# "nb_in_familly" box plots with list_col_category[1] on the x axis;
# each of the (up to) 12 panels uses a different binary variable as hue.
fig = plt.figure(figsize=(15, 15))
fig.subplots_adjust(wspace=0.4, hspace=0.4)
plt.suptitle('number in familly - Destination and binary variables:', y=0.93)
for panel, binary_col in zip(range(1, 13), list_col_binary):
    ax = fig.add_subplot(3, 4, panel)
    sns.boxplot(
        data=all_data,
        x=list_col_category[1],
        y="nb_in_familly",
        hue=binary_col,
        palette="cubehelix",
        ax=ax,
    )
    ax.set_title(f'{panel}) {binary_col}', fontsize=15)
    ax.set_xlabel(None)
    ax.tick_params(labelrotation=30)
    # Legend is kept on the last panel of each row (4, 8, 12) and dropped elsewhere.
    if panel % 4 == 0:
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    else:
        ax.get_legend().remove()
plt.show()

- There is nothing special to see here...

In [None]:
# "nb_in_familly" box plots with list_col_category[2] on the x axis;
# each of the (up to) 12 panels uses a different binary variable as hue.
fig = plt.figure(figsize=(15, 15))
fig.subplots_adjust(wspace=0.4, hspace=0.4)
plt.suptitle('number in familly - cab1 and binary variables:', y=0.93)
for panel, binary_col in zip(range(1, 13), list_col_binary):
    ax = fig.add_subplot(3, 4, panel)
    sns.boxplot(
        data=all_data,
        x=list_col_category[2],
        y="nb_in_familly",
        hue=binary_col,
        palette="cubehelix",
        ax=ax,
    )
    ax.set_title(f'{panel}) {binary_col}', fontsize=15)
    ax.set_xlabel(None)
    # Legend is kept on the last panel of each row (4, 8, 12) and dropped elsewhere.
    if panel % 4 == 0:
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    else:
        ax.get_legend().remove()
plt.show()

- There is nothing special to see here...

In [None]:
# "nb_in_familly" box plots with list_col_category[3] on the x axis;
# each of the (up to) 12 panels uses a different binary variable as hue.
fig = plt.figure(figsize=(15, 15))
fig.subplots_adjust(wspace=0.4, hspace=0.4)
plt.suptitle('number in familly - cab3 and binary variables:', y=0.93)
for panel, binary_col in zip(range(1, 13), list_col_binary):
    ax = fig.add_subplot(3, 4, panel)
    sns.boxplot(
        data=all_data,
        x=list_col_category[3],
        y="nb_in_familly",
        hue=binary_col,
        palette="cubehelix",
        ax=ax,
    )
    ax.set_title(f'{panel}) {binary_col}', fontsize=15)
    ax.set_xlabel(None)
    # Legend is kept on the last panel of each row (4, 8, 12) and dropped elsewhere.
    if panel % 4 == 0:
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    else:
        ax.get_legend().remove()
plt.show()

- There is nothing special to see here...

# SUMMARY

Parameters related to **Homeplanet**:
- in proportion there are more child coming from Earth than from the other planet.
- no passenger from Earth in VIP  --ok
- based on the first name's passenger and its occurrence it might be possible to determine its HomePlanet. All passenger with name occurence above 7 are from Earth. --ok
- len_cab2 is larger for passenger coming from Earth and Mars. -- ok
- the very large majority of passenger coming from Mars and with a len_cab2 of 3 are in CryoSleep.--ok
- the very large majority of passenger coming from Mars and with a len_cab2 of 3 have at least one option.
- there are few passengers from Mars with options who have been transported.
- No passenger with len_cab2 equal to 4 comes from Europa.
- passengers from Mars who don't have Roomservice or ShoppingMall have usually 0 option. --ok
- passengers from Europa who don't have FoodCourt or Spa or VRDeck have usually 0 option. --ok
- passengers with 1 option and in VIP comes mostly from Mars.
- passengers with 1 option and not in VIP come from Earth.


Parameters related to **nb-within-group**:
- only passengers from Earth may have the number 8.
- number 2 and 3 become more common for the passengers from Earth without any option
- number 2 and 3 become more common for the passengers from Europa travelling with a familly.--ok
- passengers from Mars having the option VRDeck have mostly the number 1.
- passengers with kids have higher number (2 to 3) whatever the destination (expecially if they have kids).

Parameters related to **TRANSPORTED**:
- passengers not transported and in destination of Trappist-1e have mostly number 1
- There is no kid (below 13 years old) coming from Mars than have been transported, and only 3 from Europa.--ok
- The very large majority of passenger coming from Mars and with a len_cab2 of 3 have not been transported.--ok
- the very large majority of passenger at destination of '55 Cancri e' and with a len_cab2 of 3 have not been transported.--ok
- there are very few passenger transported with the letter B, G and T.


Parameters related to **AGE**:
- the middle age vary with cab1.
- There is no passenger younger than 25 years old coming from Mars in VIP. --ok
- There is no passenger younger than 18 years old coming from Europa in VIP.  --ok
- There is no passenger younger than 13 years with Roomservice, Foodcourt, ShoppingMall, Spa, and VRDeck.

Parameters related to **DESTINATION**:
- there is no passenger above 60 travelling toward '55 Cancri e'.
- there is no passenger above 50 IN VIP travelling toward 'PSO J318 5_22'.-- ok
- number 2 and 3 become more common for the passengers from Europa travelling with a familly.
- Passengers in destination of PSOJ318.5-22 have mostly number 1.
- passengers with 5 option don't go PSOJ318-22.
- passengers with 1 option don't go TRAPPIST-1e.

The very large majority of passengers going to:
- 'TRAPPIST-1e' have beetween 1 and 4 options (subplot 1).
- 'PSO J318-22' and '55-Cancri-e' have more than 1 option  (subplot 2).
- 'PSO J318-22' have between 1 and 4 options (subplot 3).
- '55-Cancri-e' and who have not the option Spa don't have any other opions (subplot 9).     
- 'PSO J318-22' and who don't have the option ShoppingMall don't have more than two options (subplot 10).

Parameters related to **CAB1**:
- passengers who have the letter 'G' are not VIP members.
- passengers who have a service (Transported, food, spa, etc,,,) have the letter 'T'.--ok
- passengers over 50 with a kid in their familly don't have the letter 'B'.
- passengers with kids and with the letter 'B' don't have a group number higher than 8000.
- Passengers with an option have the T.
- passengers with kids and with the letters 'B', 'A', 'G', 'C' have larger numbers
- passengers with the letter 'F'have mostly the number 1. However, if Passengers are Transported, Cryosleep, Have kids, don't have FoodCourt and Spa their number will be higher. 
- 'E'have mostly the number 1. However, if Passengers are in VIP, Cryosleep, or have kids or no options their number will be higher. 
- 'T', VIP, cryosleep and kids don't have a number.
- 'T', but without FoodCourt, Spa and VRDeck don't have a number.
- the letters 'B', 'A', 'D', 'C' and 'T' in cab1 are not given to passengers with a very common first name (occurrence > 7).
- passengers with a common name (occurrence > 7) received the letters 'F', 'G' and 'E'.
- When len_cab2 is equal to or higher than 2, the letter 'T' is not present in cab1.
- When len_cab2 is equal to 4, only the letter 'F' and 'G' are not present in cab1.
- the very large majority of passenger with the letter A and  have at least 3 options.

Parameters related to **CAB3**:
- cab 3 is distributed 50/50 between groups.

Parameters related to **CRYO AND VIP**:
- in general passengers in VIP have more option than other passengers.
- passenger in VIP with the letter F have at least 2 options.
- passengers in Cryosleep don't have option.

Others:
- 'number_in_familly' depend mostly on the Homeplanet and if child are present in the familly or not.


## <a id="4"></a>
<div style="
           border-radius:50px;
           background-color:#7ca4cd;
           font-size:200%;
           font-family:Arial;
           letter-spacing:0.10px">
<p style="padding: 10px;
          color:white;
          text-align:center;">4: Replacing missing data
</p>
</div>

## <a id="4.1"></a>
## 4.1: replacing missing data based on observed relations:
**The strategy applied here is to replace the missing data based on the observations we made during the EDA. For example, we saw that no passenger coming from Earth was in VIP; here we make sure this still holds after filling the missing values.**

In [None]:
# Missing values per column before the rule-based imputation.
all_data.isna().sum()

In [None]:
# Rule-based imputation driven by the EDA observations.
# Rules run top to bottom and later rules read values filled by earlier ones, so the
# order matters. Rules that fill a column only touch rows where it is still missing.
SPEND_COLS = ['RoomService','Spa','VRDeck','ShoppingMall','FoodCourt']
HAVE_COLS = ['Have_RoomService','Have_Spa','Have_VRDeck','Have_ShoppingMall','Have_FoodCourt']

# All passenger with name occurence above 7 are from Earth.
all_data.loc[(all_data['HomePlanet'].isnull())&(all_data['occurence_FN']>7), 'HomePlanet'] = 'Earth'

# passengers in Cryosleep don't have option.
all_data.loc[(all_data['CryoSleep'].isnull())&(all_data['nb_option'] == 0), 'CryoSleep'] = True
all_data.loc[(all_data['CryoSleep'].isnull())&(all_data['nb_option'] != 0), 'CryoSleep'] = False
all_data.loc[(all_data['CryoSleep'] == True), SPEND_COLS] = 0
all_data.loc[(all_data['CryoSleep'] == True), HAVE_COLS] = False
# NOTE(review): nb_option is not recomputed after the Have_* flags are forced to False
# (here and in the rules below) — confirm whether it should be refreshed.

# no passenger from Earth in VIP
all_data.loc[(all_data['VIP'].isnull()) & (all_data['HomePlanet']=='Earth'), 'VIP'] = False

# Passengers in VIP younger than 18 have no option.
all_data.loc[((all_data['VIP'] == True)& (all_data['Age']<18)), HAVE_COLS] = False

# There is no passenger younger than 25 years old coming from Mars in VIP.
all_data.loc[(all_data['VIP'].isnull()) & (all_data['HomePlanet'] == 'Mars')& (all_data['Age']<25), 'VIP'] = False

# There is no passenger younger than 18 years old coming from Europa in VIP.
all_data.loc[(all_data['VIP'].isnull()) & (all_data['HomePlanet'] == 'Europa')& (all_data['Age']<18), 'VIP'] = False

# Passengers over 60 in a family flagged 'fam_with_baby' are set to non-VIP.
# NOTE(review): the EDA observation behind this rule is not stated in this section.
all_data.loc[(all_data['VIP'].isnull()) & (all_data['Age']>60)&(all_data['fam_with_baby']==True), 'VIP'] = False

# there is no passenger above 50 IN VIP travelling toward 'PSO J318.5-22'
# The notes spell this destination 'PSOJ318.5-22' while the rule used 'PSO J318 5_22';
# both the dotted label and the previous literal are accepted so the rule cannot
# silently match nothing. NOTE(review): confirm with all_data['Destination'].unique().
all_data.loc[(all_data['VIP'].isnull()) & (all_data['Destination'].isin(['PSO J318.5-22', 'PSO J318 5_22']))&(all_data['Age'] >50), 'VIP'] = False

# (The "name occurence above 7 -> Earth" rule was repeated here; nothing changes
# HomePlanet or occurence_FN since its first application, so the repeat was a no-op.)

#  passengers from Mars who don't have Roomservice or ShoppingMall have usually 0 option.
# The spend columns are compared with 0 (the previous `== False` is the same test on numbers).
all_data.loc[(all_data['HomePlanet'].isnull())&(all_data['RoomService']== 0)&(all_data['nb_option']== 0), 'HomePlanet'] = 'Mars'
all_data.loc[(all_data['HomePlanet'].isnull())&(all_data['ShoppingMall']== 0)&(all_data['nb_option']== 0), 'HomePlanet'] = 'Mars'

#  passengers from Europa who don't have FoodCourt or Spa or VRDeck have usually 0 option.
# NOTE(review): these run after the Mars rules above, so they only reach rows the Mars
# rules did not already fill — confirm this precedence is intended.
all_data.loc[(all_data['HomePlanet'].isnull())&(all_data['FoodCourt']== 0)&(all_data['nb_option']== 0), 'HomePlanet'] = 'Europa'
all_data.loc[(all_data['HomePlanet'].isnull())&(all_data['VRDeck']== 0)&(all_data['nb_option']== 0), 'HomePlanet'] = 'Europa'

# len_cab2 is larger for passenger coming from Earth and Mars.
all_data.loc[(all_data['HomePlanet'].isnull())&(all_data['len_cab2']== 1), 'HomePlanet'] = 'Europa'

# There is no kid (below 13 years old) coming from Mars than have been transported, and only 3 from Europa.
# all_data.loc[(all_data['HomePlanet'].isnull())&(all_data['Transported']== True)&(all_data['Age']< 13), 'HomePlanet'] = 'Earth'

# There is no passenger younger than 13 years with Roomservice, Foodcourt, ShoppingMall, Spa, and VRDeck.
all_data.loc[(all_data['Age']< 13), SPEND_COLS] = 0
all_data.loc[(all_data['Age']< 13), HAVE_COLS] = False

# the very large majority of passenger coming from Mars and with a len_cab2 of 3 are in CryoSleep.
# Only fill len_cab2 where it is missing: without the isnull() guard this overwrote the
# observed len_cab2 of every Mars passenger in CryoSleep.
all_data.loc[(all_data['len_cab2'].isnull())&(all_data['HomePlanet']== 'Mars')&(all_data['CryoSleep']== True), 'len_cab2'] = 3

# The very large majority of passenger coming from Mars and with a len_cab2 of 3 have not been transported.
# all_data.loc[(all_data['HomePlanet']== 'Mars')&(all_data['Transported']== False), 'len_cab2'] = 3

# the very large majority of passenger at destination of '55 Cancri e' and with a len_cab2 of 3 have not been transported.
# all_data.loc[(all_data['Destination']== '55 Cancri e')&(all_data['Transported']== False), 'len_cab2'] = 3

# number 2 and 3 become more common for the passengers from Europa travelling with a familly.
# Only fill HomePlanet where it is missing: without the isnull() guard this relabelled
# passengers with a known HomePlanet as 'Europa'.
all_data.loc[(all_data['HomePlanet'].isnull())&(all_data['nb_within_group']>2)&(all_data['fam_with_child']== True ), 'HomePlanet'] = 'Europa'

# - there are few passengers from Mars with options who have been transported.
# - No passenger with len_cab2 equal to 4 comes from  europa (if len_cab=2: earth or mars)
# all_data.loc[(all_data['HomePlanet'].isnull())&(all_data['len_cab2']== 4)&(all_data['Transported']== True)&(all_data['nb_option']== 0), 'HomePlanet'] = 'Mars'
# all_data.loc[(all_data['HomePlanet'].isnull())&(all_data['len_cab2']== 4)&(all_data['Transported']== True)&(all_data['nb_option']> 0), 'HomePlanet'] = 'Earth'

# no passenger from Earth in VIP
# (applied a second time on purpose: HomePlanet has been filled further since the first pass)
all_data.loc[(all_data['VIP'].isnull()) & (all_data['HomePlanet']=='Earth'), 'VIP'] = False

# passengers who have the letter 'G' are not VIP members.
all_data.loc[(all_data['VIP'].isnull()) & (all_data['cab1']=='G'), 'VIP'] = False

In [None]:
# Missing values per column after the rule-based imputation.
all_data.isna().sum()

## <a id="42"></a>
## 4.2: Replacing missing data with median for specific group.
**We saw that some variables depend on others. For example, age vary with cab1.** 

In [None]:
def calculate_median(col_for_selection,col_for_median, category):
    """Print and return a group median computed on the notebook-level ``all_data``.

    Parameters
    ----------
    col_for_selection : str
        Column used to select the rows of the group.
    col_for_median : str
        Column whose median is computed over the selected rows.
    category : object
        Value that ``col_for_selection`` must equal for a row to be selected.

    Returns
    -------
    The median of ``col_for_median`` over the selected rows (NaN when no
    selected row has a value).
    """
    in_group = all_data[col_for_selection] == category
    median = all_data.loc[in_group, col_for_median].median()
    print("the {} median when {} equal to {} is {}".format(col_for_median,col_for_selection, category,median))
    return median

# replace missing value in Age based on cab1
# step1: calculate the median age for each value of cab1 (same order as the variables below)
(
    median_age_cab1_A,
    median_age_cab1_B,
    median_age_cab1_C,
    median_age_cab1_D,
    median_age_cab1_E,
    median_age_cab1_F,
    median_age_cab1_G,
    median_age_cab1_T,
) = [calculate_median("cab1", "Age", deck) for deck in "ABCDEFGT"]

In [None]:
# step2: replace each missing age with the median age of the passenger's cab1 value
median_age_by_cab1 = {
    'A': median_age_cab1_A,
    'B': median_age_cab1_B,
    'C': median_age_cab1_C,
    'D': median_age_cab1_D,
    'E': median_age_cab1_E,
    'F': median_age_cab1_F,
    'G': median_age_cab1_G,
    'T': median_age_cab1_T,
}
for deck, deck_median in median_age_by_cab1.items():
    age_missing_on_deck = all_data['Age'].isnull() & (all_data['cab1'] == deck)
    all_data.loc[age_missing_on_deck, 'Age'] = deck_median

In [None]:
# replace missing value in occurence_FN based on HomePlanet
# step1: calculate the median for each home planet (Earth, Mars, Europa — in that order)
median_occurence_FN_earth, median_occurence_FN_mars, median_occurence_FN_europa = [
    calculate_median("HomePlanet", "occurence_FN", planet)
    for planet in ("Earth", "Mars", "Europa")
]

In [None]:
# step 2: replace nan values with the median of the passenger's home planet
median_occurence_FN_by_planet = {
    'Earth': median_occurence_FN_earth,
    'Mars': median_occurence_FN_mars,
    'Europa': median_occurence_FN_europa,
}
for planet, planet_median in median_occurence_FN_by_planet.items():
    missing_for_planet = all_data['occurence_FN'].isnull() & (all_data['HomePlanet'] == planet)
    all_data.loc[missing_for_planet, 'occurence_FN'] = planet_median

In [None]:
# 'number_in_familly' depend mostly on the Homeplanet and if child are present in the familly or not.
def calculate_median2(col_for_selection,col_for_median, category,child):
    """Print and return a group median restricted by the ``fam_with_child`` flag.

    Rows of the notebook-level ``all_data`` are selected when
    ``col_for_selection`` equals ``category`` AND ``fam_with_child`` equals
    ``child``.

    Parameters
    ----------
    col_for_selection : str
        Column used to select the rows of the group.
    col_for_median : str
        Column whose median is computed over the selected rows.
    category : object
        Value that ``col_for_selection`` must equal.
    child : bool
        Required value of the ``fam_with_child`` column.

    Returns
    -------
    The median of ``col_for_median`` over the selected rows.
    """
    df_selection =  all_data[(all_data[col_for_selection]==category)&(all_data['fam_with_child']== child)]
    median = df_selection[col_for_median].median() 
    # The message reports the actual value of `child`; it used to claim
    # "with a child(s)" even for the child=False groups.
    print("the {} median when {} equal to {} and fam_with_child equal to {} is {}".format(col_for_median,col_for_selection, category,child,median))
    return median

# one median per (HomePlanet, fam_with_child) pair; the comprehension yields them
# in the same order as the variables on the left-hand side
(
    median_number_Familly_earth_child_true,
    median_number_Familly_earth_child_false,
    median_number_Familly_mars_child_true,
    median_number_Familly_mars_child_false,
    median_number_Familly_europa_child_true,
    median_number_Familly_europa_child_false,
) = [
    calculate_median2("HomePlanet", "nb_in_familly", planet, has_child)
    for planet in ("Earth", "Mars", "Europa")
    for has_child in (True, False)
]


In [None]:
# Fill each missing 'nb_in_familly' with the median of its (HomePlanet, fam_with_child) group.
median_nb_in_familly_by_group = {
    ('Earth', True): median_number_Familly_earth_child_true,
    ('Earth', False): median_number_Familly_earth_child_false,
    ('Europa', True): median_number_Familly_europa_child_true,
    ('Europa', False): median_number_Familly_europa_child_false,
    ('Mars', True): median_number_Familly_mars_child_true,
    # Mars families without children use the Mars median (the Europa median was used here by mistake).
    ('Mars', False): median_number_Familly_mars_child_false,
}
for (planet, has_child), group_median in median_nb_in_familly_by_group.items():
    missing_in_group = (
        all_data['nb_in_familly'].isnull()
        & (all_data['HomePlanet'] == planet)
        & (all_data['fam_with_child'] == has_child)
    )
    all_data.loc[missing_in_group, 'nb_in_familly'] = group_median

In [None]:
service_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# masks shared by several rules (none of these columns is modified in this cell)
travels_in_pair = all_data['nb_within_group'] == 2
deck_T_with_3_options = (all_data['cab1'] == 'T') & (all_data['nb_option'] == 3)

# passengers with the letter 'T' and exactly 3 options: a missing amount is
# replaced by the median of that service column
for service in service_columns:
    all_data.loc[all_data[service].isnull() & deck_T_with_3_options, service] = all_data[service].median()

# if nb_within_group equal 2 and cab3 = S then have VRDeck
all_data.loc[all_data['VRDeck'].isnull() & (all_data['cab3'] == 'S') & travels_in_pair, 'VRDeck'] = all_data['VRDeck'].median()

# for these cab1 letters and nb_within_group = 2, a missing amount is set to 0
# (i.e. the passenger is assumed not to have used the service)
decks_without_spending = {
    'RoomService': ['G', 'E'],
    'FoodCourt': ['F', 'E', 'G'],
    'ShoppingMall': ['E', 'G'],
    'Spa': ['E', 'F'],
    'VRDeck': ['E', 'G'],
}
for service, decks in decks_without_spending.items():
    all_data.loc[all_data[service].isnull() & all_data['cab1'].isin(decks) & travels_in_pair, service] = 0

# if HomePlanet = Earth and nb_within_group = 2, a missing amount is set to 0 for every service
from_earth = all_data['HomePlanet'] == 'Earth'
for service in ['FoodCourt', 'RoomService', 'ShoppingMall', 'Spa', 'VRDeck']:
    all_data.loc[all_data[service].isnull() & from_earth & travels_in_pair, service] = 0

# the same rule also holds for VRDeck when HomePlanet = Mars
all_data.loc[all_data['VRDeck'].isnull() & (all_data['HomePlanet'] == 'Mars') & travels_in_pair, 'VRDeck'] = 0


In [None]:
all_data.isnull().sum()

## <a id="4.3"></a>
## 4.3: Replacing the remaining missing values with the mode or median calculated on the entire population

In [None]:
# categorical columns: remaining gaps take the most frequent value of the column
for categorical_col in ["HomePlanet", "Destination", "cab1", "cab3"]:
    most_frequent = all_data[categorical_col].mode().iloc[0]
    all_data[categorical_col] = all_data[categorical_col].fillna(most_frequent)

# passengers with an unknown VIP status are treated as non-VIP
all_data['VIP'] = all_data['VIP'].fillna(False)

# numerical columns: remaining gaps take the median of the column
for numerical_col in ['Age', 'nb_in_familly', 'len_cab2', 'occurence_FN']:
    still_missing = all_data[numerical_col].isnull()
    all_data.loc[still_missing, numerical_col] = all_data[numerical_col].median()

In [None]:
all_data.isnull().sum()

## <a id="4.4"></a>
## 4.4: Add one last new feature

In [None]:
# Total amount billed across the five on-board services.
# Element-wise `+` propagates NaN: a row with any missing amount gets a NaN total.
all_data['TotalBilled'] = all_data['RoomService'] + all_data['FoodCourt'] + all_data['ShoppingMall'] + all_data['Spa'] + all_data['VRDeck']

## <a id="5"></a>
<div style="
           border-radius:50px;
           background-color:#7ca4cd;
           font-size:200%;
           font-family:Arial;
           letter-spacing:0.10px">
<p style="padding: 10px;
          color:white;
          text-align:center;">5: Correlations between variables
</p>
</div>

**Now that there is no missing value, let's have a look at the correlations between numeric variables.**

In [None]:
# Pearson correlation between the numeric and boolean columns only.
# The text columns are excluded explicitly: recent pandas versions raise when
# DataFrame.corr meets non-numeric columns instead of silently dropping them.
corr = all_data.select_dtypes(include=['number', 'bool']).corr(method='pearson')

# a single explicit axes object, passed to seaborn and reused for the labels
fig, ax0 = plt.subplots(figsize=(18,18))
sns.heatmap(corr,annot=True,linewidths=.5, annot_kws={"size": 10},vmin=-1.0, vmax=1.0,square=True,cbar=True,ax=ax0)
# bottom, top = ax0.get_ylim()
# ax0.set_ylim(bottom + 0.5, top - 0.5)
ax0.set_title('correlations between numerical variables',size=18,y=1.05)
ax0.set_yticklabels(ax0.get_yticklabels(), rotation=0,size=14) 
ax0.set_xticklabels(ax0.get_xticklabels(), rotation=90,size=14) 
plt.show()

We can observe a correlation of 1 between:
- 'nb_kid' and 'nb_teenager'.

In [None]:
all_data =all_data.drop(['nb_kid'],axis=1)

## <a id="6"></a>
<div style="
           border-radius:50px;
           background-color:#7ca4cd;
           font-size:200%;
           font-family:Arial;
           letter-spacing:0.10px">
<p style="padding: 10px;
          color:white;
          text-align:center;">6: Machine Learning
</p>
</div>

In [None]:
from sklearn.model_selection import train_test_split

# classification model
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import ExtraTreesClassifier
from xgboost  import XGBClassifier

# scale data
from sklearn.preprocessing import StandardScaler

# hyperparameter tunning
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import optuna

# model evaluation
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import plot_confusion_matrix


In [None]:
# split all_data back into the training and testing data sets, using the 'df' marker column
is_train_row = all_data['df'].eq('train')
is_test_row = all_data['df'].eq('test')

train_data = all_data.loc[is_train_row]
print('shape train_data: ',train_data.shape)
test_data = all_data.loc[is_test_row]
print('shape test_data: ',test_data.shape)

In [None]:
# attach the target to the training rows: the 'index' column of train_data is
# matched against the index of `target` (left join, row order of train_data kept)
train_data = train_data.merge(
    target,
    how="left",
    left_on="index",
    right_index=True,
    sort=False,
)
train_data.tail(2)

In [None]:
test_data.tail(2)

## <a id="6.1"></a>
## 6.1: Data preprocessing

In [None]:
# Here we convert the True/False columns to 1/0.
# astype returns new frames, so the frames sliced from all_data are never modified in place.
binary_as_int = {col: int for col in list_col_binary}
train_data = train_data.astype(binary_as_int)
test_data = test_data.astype(binary_as_int)

# drop unwanted columns
unwanted_columns = ['first_name','last_name','index','df']
train_data = train_data.drop(unwanted_columns,axis=1)
test_data = test_data.drop(unwanted_columns,axis=1)

# define explanatory data and target
target = train_data['Transported']
X = train_data.drop(['Transported'],axis=1)

# get dummies: train and test are encoded together so that both end up with exactly
# the same dummy columns (and the same dropped reference level), even when a
# category is present in only one of the two sets
nb_train_rows = len(X)
encoded = pd.get_dummies(pd.concat([X, test_data[X.columns]], axis=0), drop_first = True)
X = encoded.iloc[:nb_train_rows]
test_data = encoded.iloc[nb_train_rows:]

In [None]:
# hold out 20% of the labelled data as a local test set
X_train, X_test, y_train, y_test = train_test_split(
    X, target, test_size=0.2, random_state=42
)

# scale the data: the scaler is fitted on the training part only,
# then the same transformation is applied to both parts
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## <a id="6.2"></a>
## 6.2: Model selection
The idea here is to run several models using different classification techniques and select the best ones.

In [None]:
# create empty list to store the results
dic_result_simple_model = []

# create function to do cross validation test, calculate score and plot results
def test_simple_model(model,name,X_train,X_test,y_train,y_test):
    """Cross-validate a classifier, plot its results and record its scores.

    Steps:
    1. 5-fold cross-validated accuracy, computed separately on the training
       set and on the testing set.
    2. Fit ``model`` on the training set and predict the testing set.
    3. Plot the per-fold accuracies and the confusion matrix on the testing set.
    4. Print accuracy / recall / precision / F1 on the testing set and append
       one summary row to the notebook-level list ``dic_result_simple_model``.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible classifier; it is fitted by this call.
    name : str
        Label stored in the ``'Model'`` field of the summary row.
    X_train, X_test : array-like
        Explanatory variables of the training and testing sets.
    y_train, y_test : array-like
        Targets of the training and testing sets.

    Returns
    -------
    None
    """
    # 5-fold cross-validated accuracy on the training set ...
    cv_scores_train = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    # ... and, independently, on the testing set
    cv_scores_test = cross_val_score(model, X_test, y_test, cv=5, scoring='accuracy')
    model_mean_train = np.mean(cv_scores_train)
    model_mean_test = np.mean(cv_scores_test)
    print('mean scores on training set: {:2f}, and testing set: {:2f}'.format(model_mean_train, model_mean_test))

    # fit on the training set; the predictions feed the scores below
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    fig = plt.figure(figsize=(15,3))
    fig.subplots_adjust(hspace=0.4, wspace=0.3)

    # left panel: accuracy of each cross-validation fold
    ax_folds = fig.add_subplot(1, 2, 1)
    ax_folds.plot(cv_scores_train, 'go-', label='CV score on training set')
    ax_folds.plot(cv_scores_test, 'ro-', label='CV score on testing set')
    ax_folds.set_xlabel('nb of fold cross-validation')
    ax_folds.set_ylabel('accuracy')
    ax_folds.legend()

    # right panel: confusion matrix on the testing set
    ax_matrix = fig.add_subplot(1, 2, 2)
    plot_confusion_matrix(model, X_test, y_test, cmap=plt.cm.Blues, ax=ax_matrix)

    # scores on the testing set
    scores = {
        'Accuracy': accuracy_score(y_test, pred),
        'Recall': recall_score(y_test, pred),
        'Precision': precision_score(y_test, pred),
        'F1': f1_score(y_test, pred),
    }
    print('accuracy: ', scores['Accuracy'], '  |  recall: ', scores['Recall'],
          '  |  precision: ', scores['Precision'], '  |  f1: ', scores['F1'])

    # store one summary row for this model
    summary_row = {'Model': name, 'cv_scores_train': model_mean_train, 'cv_scores_test': model_mean_test}
    summary_row.update(scores)
    dic_result_simple_model.append(summary_row)
    plt.show()
    
def highlight_max(s):
    """Return one CSS string per value of ``s``, highlighting the largest values.

    A value is highlighted (light green background) when it equals one of the
    4 largest values of the Series; every other value gets an empty style.
    Intended to be passed to ``DataFrame.style.apply``.
    """
    top_values = s.nlargest(4).values
    is_among_top = s.isin(top_values)
    return ['background-color: lightgreen' if among_top else '' for among_top in is_among_top]

In [None]:
# clf_svm = svm.SVC()
# test_simple_model(clf_svm,'SVC', X_train, X_test, y_train, y_test)

In [None]:
# clf_SGD = SGDClassifier()
# test_simple_model(clf_SGD,'SGDC', X_train, X_test, y_train, y_test)

In [None]:
# clf_MLP = MLPClassifier()
# test_simple_model(clf_MLP,'MLP', X_train, X_test, y_train, y_test)

In [None]:
clf_log_reg = LogisticRegression()
test_simple_model(clf_log_reg,'LOG_REG', X_train, X_test, y_train, y_test)

In [None]:
clf_Perc = Perceptron()
test_simple_model(clf_Perc,'PERC', X_train, X_test, y_train, y_test)

In [None]:
# clf_CVcv = CalibratedClassifierCV()
# test_simple_model(clf_CVcv, 'CALI_CV', X_train, X_test, y_train, y_test)

In [None]:
# clf_pas_agg_cla = PassiveAggressiveClassifier()
# test_simple_model(clf_pas_agg_cla,'PAS_AGG', X_train, X_test, y_train, y_test)

In [None]:
# clf_rf = RandomForestClassifier()
# test_simple_model(clf_rf,'RF', X_train, X_test, y_train, y_test)

In [None]:
# clf_label_prop =  LabelPropagation()
# test_simple_model(clf_label_prop,'LABEL_PROP', X_train, X_test, y_train, y_test)

In [None]:
# clf_label_spread =  LabelSpreading()
# test_simple_model(clf_label_spread,'LABEL_SPEAD', X_train, X_test, y_train, y_test)

In [None]:
clf_gbc = GradientBoostingClassifier()
test_simple_model(clf_gbc,'GBC', X_train, X_test, y_train, y_test)

In [None]:
# clf_quad_drisc_anal = QuadraticDiscriminantAnalysis()
# test_simple_model(clf_quad_drisc_anal,'QUAD_DRISC', X_train, X_test, y_train, y_test)

In [None]:
clf_hist_grad_boost = HistGradientBoostingClassifier()
test_simple_model(clf_hist_grad_boost,'HIST_GRAD_BOOST', X_train, X_test, y_train, y_test)

In [None]:
# clf_ridgecv = RidgeClassifierCV()
# test_simple_model(clf_ridgecv,'RIDGE_CV', X_train, X_test, y_train, y_test)

In [None]:
# clf_ridge = RidgeClassifier()
# test_simple_model(clf_ridge,'RIDGE', X_train, X_test, y_train, y_test)

In [None]:
# clf_AdaBoost = AdaBoostClassifier()
# test_simple_model(clf_AdaBoost,'ADABOOST', X_train, X_test, y_train, y_test)

In [None]:
# clf_ExtraTrees = ExtraTreesClassifier()
# test_simple_model(clf_ExtraTrees,'EXTRA_TREES', X_train, X_test, y_train, y_test)

In [None]:
# clf_knn = KNeighborsClassifier()
# test_simple_model(clf_knn,'KNN', X_train, X_test, y_train, y_test)

In [None]:
# clf_BernoulliNB = BernoulliNB()
# test_simple_model(clf_BernoulliNB,'BERNOULLI', X_train, X_test, y_train, y_test)

In [None]:
# clf_linear_dis_ana = LinearDiscriminantAnalysis()
# test_simple_model(clf_linear_dis_ana,'LINEAR_DIS', X_train, X_test, y_train, y_test)

In [None]:
# clf_Gaussian = GaussianNB()
# test_simple_model(clf_Gaussian,'GAUSS', X_train, X_test, y_train, y_test)

In [None]:
# clf_dec_tree_clas = DecisionTreeClassifier(random_state=0)
# test_simple_model(clf_dec_tree_clas,'DEC_TREE_CLASS', X_train, X_test, y_train, y_test)

In [None]:
clf_XGB = XGBClassifier(objective='binary:logistic')
test_simple_model(clf_XGB,'XGB', X_train, X_test, y_train, y_test)

In [None]:
# one row per tested model, best accuracy first; the 4 largest values of each
# column are highlighted
result_simple_model = (
    pd.DataFrame(dic_result_simple_model)
    .set_index('Model')
    .sort_values(by='Accuracy', ascending=False)
)
result_simple_model.style.apply(highlight_max)

We are looking for models where cv_scores_train is close to cv_scores_test (neither over- nor under-fitting) and with a high accuracy. So, we can select:
- hist grad boost, 
- XGB, 
- GBC,
- and log_reg. 

In [None]:
# select the four best
best_result_simple_model =result_simple_model[:4]

## <a id="6.3"></a>
## 6.3: Feature selection

In [None]:
def plot_feature_importance(clf,name):
    """Plot the feature importances of a fitted model and return them as a table.

    Parameters
    ----------
    clf : estimator
        Fitted model exposing ``feature_importances_``.
    name : str
        Model name inserted in the title of the chart.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature_names`` and ``feature_importance``, sorted by
        decreasing importance.

    Notes
    -----
    The names are taken from the columns of the notebook-level ``X``; this
    assumes ``clf`` was fitted on all the columns of ``X`` in that order —
    TODO confirm for each model passed in.
    """
    # pair each column of X with its importance, most important feature first
    fi_df = pd.DataFrame({
        'feature_names': np.array(X.columns),
        'feature_importance': np.array(clf.feature_importances_),
    }).sort_values(by=['feature_importance'], ascending=False)

    # horizontal bar chart: one bar per feature
    fig, ax = plt.subplots(figsize=(15,10))
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'], ax=ax)
    ax.set_title(' {} FEATURE IMPORTANCE'.format(name))
    ax.set_xlabel('FEATURE IMPORTANCE')
    ax.set_ylabel('FEATURE NAMES')
    plt.show()
    return fi_df
    

def feature_selection(model, list_variable,list_to_store_result):
    """Score ``model`` on a subset of the columns of the notebook-level ``X``.

    The selected columns are split 80/20 (``random_state=0``), standardised
    with a scaler fitted on the training part, and ``model`` is evaluated with
    5-fold cross-validated accuracy, separately on the training part and on
    the testing part.

    Parameters
    ----------
    model : estimator
        scikit-learn compatible classifier (cloned internally by
        ``cross_val_score``).
    list_variable : list of str
        Names of the columns of ``X`` to keep.
    list_to_store_result : list
        One dict ``{'cv_scores_train': ..., 'cv_scores_test': ...}`` holding
        the mean accuracies is appended to this list.

    Returns
    -------
    None
    """
    # select the requested variables
    X_1 = X[list_variable]
    # create new set of data for training and testing
    X_train_1,X_test_1,y_train_1,y_test_1=train_test_split(X_1,target,test_size=0.2,random_state=0)
    # A scaler local to this call: the notebook-level `scaler` is left untouched,
    # so calling this helper no longer refits it as a hidden side effect.
    subset_scaler = StandardScaler()
    X_train_1 = subset_scaler.fit_transform(X_train_1)
    X_test_1 = subset_scaler.transform(X_test_1)
    # accuracy on the training part with 5-fold cross-validation
    cv_scores_train = cross_val_score(model,X_train_1, y_train_1,cv=5,scoring = 'accuracy')
    # accuracy on the testing part with 5-fold cross-validation
    cv_scores_test = cross_val_score(model,X_test_1, y_test_1,cv=5,scoring = 'accuracy')
    model_mean_train = np.mean(cv_scores_train)
    model_mean_test = np.mean(cv_scores_test)
    list_to_store_result.append({'cv_scores_train': model_mean_train, 'cv_scores_test': model_mean_test})

# empty dictionary to store results
dic_result_simple_model = []

## <a id="6.3.1"></a>
### 6.3.1: Feature importance for GradientBoostingClassifier

In [None]:
fi_df_gbc = plot_feature_importance(clf_gbc,'GradientBoostingClassifier')

In [None]:
# feature names ordered by decreasing importance
list_feature_names_gbc = fi_df_gbc['feature_names'].tolist()

list_result_bgc = []

# Evaluate the k most important features for every k from 1 up to AND INCLUDING
# the full feature set (the upper bound used to stop one feature short).
for i in range(1,len(list_feature_names_gbc) + 1):
    list_variables = list_feature_names_gbc[:i]
    feature_selection(clf_gbc, list_variables,list_result_bgc)

In [None]:
list_result_bgc = pd.DataFrame(list_result_bgc)
# Label each row with the number of features it was computed with (the first row
# uses 1 feature), so the table and the plot can be read directly as "k features".
list_result_bgc.index = pd.RangeIndex(1, len(list_result_bgc) + 1, name='nb_features')
list_result_bgc.plot()

In [None]:
list_result_bgc.style.apply(highlight_max)

15 features seem to give the best results...

In [None]:
# Keep the first 15 names of list_feature_names_gbc, in both the Kaggle test set and X.
test_data_gbc = test_data[list_feature_names_gbc[:15]]
X_gbc = X[list_feature_names_gbc[:15]]

# 80/20 split of the labelled data; the shared `scaler` is refitted on this training part
X_train_1,X_test_1,y_train_1,y_test_1=train_test_split(X_gbc,target,test_size=0.2,random_state=0)
X_train_1 = scaler.fit_transform(X_train_1)
X_test_1 = scaler.transform(X_test_1)
    
# apply the scaling fitted just above to the Kaggle test set
test_data_gbc = scaler.transform(test_data_gbc)
# fit on the training part only (the 20% hold-out is not used for fitting), then predict the Kaggle test set
clf_gbc.fit(X_train_1,y_train_1)
pred = clf_gbc.predict(test_data_gbc)

# NOTE(review): `pred` is an array, so it is paired with df_test['PassengerId'] by position.
# This assumes the rows of test_data are still in the order of df_test — TODO confirm;
# misaligned rows would be consistent with the ~0.50 scores recorded below.
submission = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Transported': pred})
submission['Transported']=submission['Transported'].astype(bool)
submission.to_csv("submission_gbc4.csv", index=False)

Score: 0.50339
Score: 0.50081

## <a id="6.3.2"></a>
### 6.3.2: Feature importance for XGBClassifier()

In [None]:
fi_df_xgb = plot_feature_importance(clf_XGB,'XGBClassifier')

In [None]:
# feature names ordered by decreasing importance
list_feature_names_xgb = fi_df_xgb['feature_names'].tolist()

list_result_XGB = []

# Evaluate the k most important features for every k from 1 up to AND INCLUDING
# the full feature set (the upper bound used to stop one feature short).
for i in range(1,len(list_feature_names_xgb) + 1):
    list_variables = list_feature_names_xgb[:i]
    feature_selection(clf_XGB, list_variables,list_result_XGB)

list_result_XGB = pd.DataFrame(list_result_XGB)
# Label each row with the number of features it was computed with (the first row
# uses 1 feature), so the table and the plot can be read directly as "k features".
list_result_XGB.index = pd.RangeIndex(1, len(list_result_XGB) + 1, name='nb_features')
list_result_XGB.plot()

In [None]:
list_result_XGB.style.apply(highlight_max)

22 features seem to be the best.

In [None]:
# Keep the first 22 names of list_feature_names_xgb, in both the Kaggle test set and X.
test_data_xgb = test_data[list_feature_names_xgb[:22]]
X_xgb = X[list_feature_names_xgb[:22]]

# 80/20 split of the labelled data; the shared `scaler` is refitted on this training part
X_train_1,X_test_1,y_train_1,y_test_1=train_test_split(X_xgb,target,test_size=0.2,random_state=0)
X_train_1 = scaler.fit_transform(X_train_1)
X_test_1 = scaler.transform(X_test_1)
    
# apply the scaling fitted just above to the Kaggle test set
test_data_xgb = scaler.transform(test_data_xgb)
# fit on the training part only (the 20% hold-out is not used for fitting), then predict the Kaggle test set
clf_XGB.fit(X_train_1,y_train_1)
pred = clf_XGB.predict(test_data_xgb)


# NOTE(review): `pred` is an array, so it is paired with df_test['PassengerId'] by position.
# This assumes the rows of test_data are still in the order of df_test — TODO confirm;
# misaligned rows would be consistent with the ~0.50 scores recorded below.
submission = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Transported': pred})
submission['Transported']=submission['Transported'].astype(bool)
submission.to_csv("submission_xgb4.csv", index=False)

Score: 0.50619

Score: 0.50268

## <a id="6.3.3"></a>
### 6.3.3: Feature importance for log_reg()

In [None]:
# list_feature_names = X.columns.tolist()

# list_result_log_reg = []

# for i in range(1,len(list_feature_names)):
#     list_variables = list_feature_names[:i]
#     feature_selection2(clf_log_reg, list_variables,list_result_log_reg)
    
# list_result_log_reg = pd.DataFrame(list_result_log_reg)
# list_result_log_reg.plot()

In [None]:
# list_result_log_reg.style.apply(highlight_max)

34 features seem to be the best in this case.

## <a id="6.3.4"></a>
### 6.3.4: Feature importance for Hist_Grad_Boost Classifier()

In [None]:
# No importance ranking is used for this model: the features are added in the column order of X.
list_feature_names = X.columns.tolist()

list_result_hist = []

# Evaluate the first k columns for every k from 1 up to AND INCLUDING the full
# feature set (the upper bound used to stop one feature short).
for i in range(1,len(list_feature_names) + 1):
    list_variables = list_feature_names[:i]
    feature_selection(clf_hist_grad_boost, list_variables,list_result_hist)

list_result_hist = pd.DataFrame(list_result_hist)
# Label each row with the number of features it was computed with (the first row
# uses 1 feature), so the table and the plot can be read directly as "k features".
list_result_hist.index = pd.RangeIndex(1, len(list_result_hist) + 1, name='nb_features')
list_result_hist.plot()

In [None]:
list_result_hist.style.apply(highlight_max)

37 features seem to be the best.

## <a id="6.4"></a>
## 6.4: Hyperparameter tuning
### 6.4.1: HistGradientBoostingClassifier

In [None]:
# def objective(trial,df_train = X,target=target):    
#     train_x, test_x, train_y, test_y = train_test_split(df_train, target, test_size=0.2,random_state=42)
#     # scale the data
#     scaler = StandardScaler()
#     # fit and transform "x_train"
#     train_x = scaler.fit_transform(train_x)
#     # transform test
#     test_x = scaler.transform(test_x)
    
#     params = {
#         'l2_regularization': trial.suggest_loguniform('l2_regularization',10e-10,10.0),
#         'early_stopping': trial.suggest_categorical('early_stopping', ['False']),
#         'learning_rate': trial.suggest_loguniform('learning_rate', 0.001,0.1),
#         'max_iter': trial.suggest_categorical('max_iter', [1000]),
#         'max_depth': trial.suggest_int('max_depth', 2,100),
#         'max_bins': trial.suggest_int('max_bins', 5,255),
#         'min_samples_leaf': trial.suggest_int('min_samples_leaf', 15,25),
#         'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 20,80),
#     }

#     model = HistGradientBoostingClassifier(**params)
#     model.fit(train_x, train_y)
#     pred = model.predict(test_x)
#     accuracy = accuracy_score(y_test,pred)
    
#     return accuracy


In [None]:
# %%time
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=300)
# print('Number of finished trials:', len(study.trials))
# print('Best trial:', study.best_trial.params)

0.8257619321449109.

Number of finished trials: 300
Best trial: {'l2_regularization': 3.636737375134583e-07, 'early_stopping': 'False', 'learning_rate': 0.0581246392997384, 'max_iter': 1000, 'max_depth': 54, 'max_bins': 143, 'min_samples_leaf': 16, 'max_leaf_nodes': 21}

In [None]:
# train_x, test_x, train_y, test_y = train_test_split(X, target, test_size=0.2,random_state=42)

# # scale the data
# scaler = StandardScaler()
# # fit and transform "x_train"
# train_x = scaler.fit_transform(train_x)
# # transform test
# test_x = scaler.transform(test_x)
# test_data_scale = scaler.transform(test_data)

# params = {
#     'l2_regularization' : 3.636737375134583e-07,
#     'early_stopping' : 'False',
#     'learning_rate' : 0.0581246392997384,
#     'max_iter'  : 1000,
#     'max_depth' : 54,
#     'max_bins'  : 143,
#     'min_samples_leaf' : 16,
#     'max_leaf_nodes'  : 21,
# }

# model = HistGradientBoostingClassifier(**params)
# model.fit(train_x, train_y)
# pred = model.predict(test_data_scale)

# submission = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Transported': pred})
# submission['Transported']=submission['Transported'].astype(bool)
# submission.to_csv("submission_hist.csv", index=False)

Score: 0.50526

In [None]:
# clf_bagging = BaggingClassifier(HistGradientBoostingClassifier(**params))
# clf_bagging.fit(train_x, train_y)
# pred = clf_bagging.predict(test_data_scale)

# submission = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Transported': pred})
# submission['Transported']=submission['Transported'].astype(bool)
# submission.to_csv("submission_bag_hist.csv", index=False)

Score: 0.50268

### 6.4.2: XGBClassifier

In [None]:
# test_data_xgb = test_data[list_feature_names_xgb[:22]]
# X_xgb = X[list_feature_names_xgb[:22]]

# def objective(trial,df_train = X_xgb,target=target):    
#     train_x, test_x, train_y, test_y = train_test_split(df_train, target, test_size=0.2,random_state=42)
#     # scale the data
#     scaler = StandardScaler()
#     # fit and transform "x_train"
#     train_x = scaler.fit_transform(train_x)
#     # transform test
#     test_x = scaler.transform(test_x)
    
#     params = {
#         'max_depth': trial.suggest_int('max_depth', 2, 15),
#         'subsample': trial.suggest_discrete_uniform('subsample', 0.6, 1.0, 0.05),
#         'n_estimators': trial.suggest_int('n_estimators', 600, 900,50),
#         'eta': trial.suggest_loguniform('learning_rate', 0.001,0.1),
#         'reg_alpha': trial.suggest_int('reg_alpha', 1, 50),
#         'reg_lambda': trial.suggest_int('reg_lambda', 5, 100),
#         'min_child_weight': trial.suggest_int('min_child_weight', 2, 20),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
#     }

#     model = XGBClassifier(**params,objective='binary:logistic')
#     model.fit(train_x, train_y)
#     pred = model.predict(test_x)
#     accuracy = accuracy_score(y_test,pred)
    
#     return accuracy


In [None]:
# %%time
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials= 300)
# print('Number of finished trials:', len(study.trials))
# print('Best trial:', study.best_trial.params)

 0.8297872340425532.
 
Number of finished trials: 300
Best trial: {'max_depth': 8, 'subsample': 0.9, 'n_estimators': 600, 'learning_rate': 0.08813054449895896, 'reg_alpha': 11, 'reg_lambda': 73, 'min_child_weight': 15, 'colsample_bytree': 0.929698611129216}
CPU times: user 2h 10min 58s, sys: 43.2 s, total: 2h 11min 42s
Wall time: 33min 45s