In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import missingno as msno
import cufflinks as cf
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from plotly.offline import iplot, init_notebook_mode

import seaborn as sns
import plotly.figure_factory as ff 
import matplotlib.pyplot as plt
from tqdm import tqdm

#machine learning libraries:
from sklearn.model_selection import StratifiedKFold, cross_validate, cross_val_score, train_test_split
from sklearn.preprocessing  import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier,GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import warnings
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### <h1 align="center">Thanks for reading</h1>
<h2 align="center" style='color:red' > If you like the notebook or learned something please upvote! </h2>
<b><li>You can also check out this beautiful notebook from the Titanic competition, which helped me a lot with the plot ideas:</li></b>
<ul>
<li><b><a href='https://www.kaggle.com/code/alaasedeeq/predicting-the-survival-of-titanic-top-6'>Predicting the survival of Titanic (top 6)</a></b></li>

</ul>

# File and Data Field Descriptions 


train.csv - Personal records for about two-thirds (~8700) of the passengers, to be used as training data.

PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.

HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.

CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.

Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.

Destination - The planet the passenger will be debarking to.

Age - The age of the passenger.

VIP - Whether the passenger has paid for special VIP service during the voyage.

RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.

Name - The first and last names of the passenger.

Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

test.csv - Personal records for the remaining one-third (~4300) of the passengers, to be used as test data. Your task is to predict the value of Transported for the passengers in this set.

sample_submission.csv - A submission file in the correct format.
PassengerId - Id for each passenger in the test set.
Transported - The target. For each passenger, predict either True or False

# CheckList
Shape Analysis : 

    - target variable : 'Transported'
    - shape of your dataset : row : 8693, columns : 14
    - Features types : bool(1), float64(6), object(7)

Idea :

    - Group by RoomService, FoodCourt, ShoppingMall, Spa, VRDeck to a variable "Total_billed"
    - Split Cabin data to get deck, num_cabin, side

In [None]:
df_train = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
print(f'Number of rows/columns : {df_train.shape}')
df_train.head()

In [None]:
df_test = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
print(f'Number of rows/columns : {df_test.shape}')
df_test.head()

In [None]:
df = pd.concat([df_train,df_test])

#make a copy of the original data
train_df_orig = df_train.copy()
test_df_orig = df_test.copy()


In [None]:
df.info()

In [None]:
%matplotlib inline
msno.matrix(df_train)

In [None]:
def missing_data(data):
    """Summarise missing values for every column of ``data``.

    Returns a transposed frame with one column per column of ``data`` and
    three rows: 'Total' (number of nulls), 'Percent' (share of nulls, in
    percent) and 'Types' (the column dtype as a string).
    """
    null_mask = data.isnull()
    null_total = null_mask.sum()
    null_percent = null_total / null_mask.count() * 100
    summary = pd.concat([null_total, null_percent], axis=1, keys=['Total', 'Percent'])
    summary['Types'] = [str(data[name].dtype) for name in data.columns]
    return summary.transpose()

missing_data(df_train)

In [None]:
def unique_values(data):
    """Count non-null and distinct values for every column of ``data``.

    Returns a transposed frame with one column per column of ``data`` and
    two rows: 'Total' (non-null count) and 'Uniques' (number of distinct
    non-null values).
    """
    summary = data.count().to_frame(name='Total')
    summary['Uniques'] = [data[name].nunique() for name in data.columns]
    return summary.transpose()

unique_values(df_train)

In [None]:
def most_frequent_values(data):
    """Report the most frequent value of every column of ``data``.

    Returns a transposed frame with one column per column of ``data`` and
    four rows:

    - 'Total': non-null count,
    - 'Most frequent item': the modal non-null value (NaN if the column has
      no non-null value),
    - 'Frequence': how many times that value occurs (0 for an all-null column),
    - 'Percent from total': 'Frequence' as a percentage of 'Total', rounded
      to 3 decimals (NaN for an all-null column).
    """
    total = data.count()
    tt = pd.DataFrame(total)
    tt.columns = ['Total']
    items = []
    vals = []
    for col in data.columns:
        # value_counts() is sorted by descending frequency and ignores NaN.
        counts = data[col].value_counts()
        if counts.empty:
            # All-null column: there is no modal value to report.
            items.append(np.nan)
            vals.append(0)
        else:
            items.append(counts.index[0])
            vals.append(counts.iloc[0])
    tt['Most frequent item'] = items
    tt['Frequence'] = vals
    tt['Percent from total'] = np.round(pd.Series(vals, index=total.index) / total * 100, 3)
    return np.transpose(tt)

most_frequent_values(df_train)

In [None]:
fig = px.pie(df,values=df['Transported'].value_counts(), names=df['Transported'].value_counts().index.tolist(), color_discrete_sequence=px.colors.sequential.RdBu, title='Transported Pie Chart')
fig.show()

In [None]:
#describe our data
df[df.select_dtypes(exclude='object').columns].describe().\
style.background_gradient(axis=1,cmap=sns.light_palette('green', as_cmap=True))

In [None]:
# Correlation between each numeric/boolean column and the target column.
# Object columns (ids, names, categories) are excluded explicitly because
# recent pandas versions no longer drop them silently inside `.corr()`.
corr = df_train.select_dtypes(include=['number', 'bool']).corr()
# Drop the target's correlation with itself by label instead of by position.
corr['Transported'].drop('Transported').sort_values(ascending=False).to_frame()\
.style.background_gradient(axis=1,cmap=sns.light_palette('green', as_cmap=True))

In [None]:
# Side-by-side table of column skewness: sorted descending on the left,
# sorted ascending on the right.

# Every non-object column except the target
num_cols = [col for col in df.select_dtypes(exclude='object') if col != 'Transported']

skew_descending = df[num_cols].skew().sort_values(ascending=False).reset_index()
skew_ascending = df[num_cols].skew().sort_values(ascending=True).reset_index()

skewness = pd.concat([skew_descending, skew_ascending], axis=1)
skewness.columns = pd.MultiIndex.from_tuples([
    ('Positive Skewness', 'Columns'),
    ('Positive Skewness', 'Skewness'),
    ('Negative Skewness', 'Columns'),
    ('Negative Skewness', 'Skewness'),
])
skewness

In [None]:
# Skewness of every numeric column (target excluded), sorted ascending.
pd.set_option("display.float_format", "{:.4f}".format)
# Restrict to numeric columns: `.skew()` cannot be computed on object columns
# and recent pandas versions raise instead of silently skipping them.
df.drop('Transported',axis =1).select_dtypes(include='number').skew().to_frame().rename(columns={0:'Skewness'}).sort_values('Skewness')

In [None]:
#Visualize columns have highest Skewness
fig, axes = plt.subplots(1,3, figsize=(20, 8));
fig.suptitle('Highest Skewness', fontsize=20);

sns.kdeplot(df_train['ShoppingMall'], ax=axes[0],hue=df_train['Transported']);
sns.kdeplot(df_train['VRDeck'], ax=axes[1],hue=df_train['Transported']);
sns.kdeplot(df_train['Spa'], ax=axes[2],hue=df_train['Transported']);

<h1>Conclusions</h1><br>
<li>Almost every column has some missing values (around 2% each); only "PassengerId" and "Transported" are complete.
<li>The Transported column has its highest correlation with:
    <ul>
        <li>FoodCourt <b> 0.046566</b>
        <li>ShoppingMall <b> 0.010141</b>
    </ul>
<li>We have some Columns with a high Skewness:
    <ul>
        <li>ShoppingMall <b> 11.0091</b>
        <li>VRDeck <b> 8.0598</b>
        <li>Spa <b> 7.6532</b>   
    </ul>

# I - BASIC EDA 
<a id="Phase I : Basic EDA"></a>

## Background Analysis

In [None]:
# Take a look on the numerical distributions : float type
sns.set_style('whitegrid')

# Size the grid from the actual number of float columns so the cell still runs
# after later cells add float columns (e.g. total_bill) to df_train.
float_cols = df_train.select_dtypes('float').columns
n_cols = 3
n_rows = max(1, -(-len(float_cols) // n_cols))  # ceiling division

# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 4 * n_rows), squeeze=False);
plt.subplots_adjust(hspace = 0.7, wspace=0.2)
fig.suptitle('Numerical Float Distributions', fontsize=20)

for ax, col in zip(axes.flat, float_cols):
    sns.kdeplot(df_train[col], ax=ax, fill= True);
    ax.set_title(col+' Distribution')

# Hide the grid cells that have no column to show.
for ax in axes.flat[len(float_cols):]:
    ax.set_visible(False)

## A - Look on total_bill

In [None]:
# Total amount billed across the five amenities. Chained `+` propagates NaN,
# so total_bill is missing whenever any single amenity amount is missing.
df_train['total_bill'] = df_train['RoomService'] + df_train['FoodCourt'] + df_train['ShoppingMall'] + df_train['Spa'] + df_train['VRDeck']

In [None]:
fig = px.scatter(df_train.fillna('null'), x="total_bill", y="Age",color="HomePlanet", hover_name='Cabin',text='Transported')
fig.show()

In [None]:
fig = px.scatter(df_train.fillna('null'), x="total_bill", y="Age",color='HomePlanet', hover_data = df_train.columns,text='VIP', symbol = 'Transported')
fig.show()

In [None]:
fig = px.sunburst(df_train.dropna(), path=['HomePlanet', 'Destination'], values='Age',
                  color='total_bill', title='HomePlanet - Destination by total_bill')
fig.show()

In [None]:
fig = px.scatter(df_train.dropna(), x="Age", y="total_bill", size="RoomService", color='HomePlanet',facet_row="Transported",
                 hover_data=df_train.columns, log_x=False, size_max=60)
fig.show()

In [None]:
fig = px.scatter(df_train.dropna(), x="Age", y="total_bill", size="RoomService", color='HomePlanet',facet_row="Transported", facet_col="Destination",
                 hover_data=df_train.columns, log_x=False, size_max=60)
fig.show()

In [None]:
fig = px.box(df_train.fillna(0), x="HomePlanet", y="total_bill", color="Transported", notched=True)
fig.show()

**Hypothesis: passengers who were transported had a lower total bill than non-transported passengers, especially those whose home planet is Europa.**

## B - Categorical Variable


In [None]:
# new feature :
df_train['total_bill_bins'] = pd.cut(df_train['total_bill'],bins=6, labels=False)
df_train['total_bill_bins'].value_counts()

In [None]:
sns.scatterplot(x='total_bill_bins',y='total_bill',data=df_train)

In [None]:
# Human-readable labels for the six integer bin codes (0-5) stored in
# total_bill_bins.
# NOTE(review): the thresholds below are hard-coded and unevenly spaced, so
# they are approximations rather than the equal-width edges pd.cut computes —
# confirm against the actual data range.
# NOTE: this cell is not idempotent — mapping the already-labelled column a
# second time yields NaN, because the string labels are not keys of the dict.
dico_bins_bill = {
    0 : '<= 5000',
    1 : '<= 12000',
    2 : "<= 17000",
    3 : "<= 25000",
    4 : "<= 30000",
    5 : "<= 35000"
}

df_train['total_bill_bins'] = df_train['total_bill_bins'].map(dico_bins_bill)

In [None]:
def visualisation_data(dataset,xlabel):
    """Draw a heatmap and a scatter plot for every non-float column of ``dataset``.

    For each non-float column other than 'total_bill_bins', one grid row is
    drawn:

    - left: a heatmap of the cross-tabulation of 'total_bill_bins' against
      that column;
    - right: a scatter plot of ``xlabel`` (x) against 'total_bill' (y),
      coloured by that column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns 'total_bill', 'total_bill_bins' and ``xlabel``.
    xlabel : str
        Name of the column used on the x axis of the scatter plots.
    """
    # Columns to visualise: everything that is not float, except the bin
    # column itself (it is the row variable of every heatmap).
    plot_cols = [col for col in dataset.select_dtypes(exclude='float')
                 if col != 'total_bill_bins']

    # One grid row per plotted column, so the grid always fits the plots.
    n_rows = max(len(plot_cols), 1)
    n_cols = 2

    plt.figure(figsize=(22,28))

    for row, col in enumerate(tqdm(plot_cols)):
        plt.subplot(n_rows, n_cols, 2 * row + 1)
        plt.xlabel(xlabel)
        # Read the bins from `dataset` itself rather than from a global frame.
        sns.heatmap(pd.crosstab(dataset['total_bill_bins'], dataset[col]), annot=True, fmt='d')

        plt.subplot(n_rows, n_cols, 2 * row + 2)
        plt.xlabel(xlabel)
        sns.scatterplot(x=xlabel, y="total_bill", hue=col, alpha=.5, palette="muted", data=dataset)

    plt.show()

In [None]:
visu_df = df_train.loc[:,['HomePlanet','CryoSleep','Destination','VIP','Transported','Age','total_bill','total_bill_bins']]
visu_df.info()

In [None]:
visualisation_data(visu_df,'Age')

## C - Cabin

We will extract some information about the cabin area like 

deck / num of cabin / side

*G/734/S --> Cabin  deck/num/side*

In [None]:
# Split Cabin ("deck/num/side") into three columns.
# Missing cabins are first replaced by the string 'null', so for those rows
# deck and side are both 'null' and num_cabin is the empty string.
# NOTE: num_cabin is obtained by slicing off the first two and the last two
# characters, which assumes deck and side are each a single character.
df_train['deck'] = df_train.Cabin.fillna('null').apply(lambda x: x.split('/')[0])
df_train['num_cabin'] = df_train.Cabin.fillna('null').apply(lambda x: x[2:][:-2])
df_train['side'] = df_train.Cabin.fillna('null').apply(lambda x: x.split('/')[-1])

In [None]:
df_train.head()

In [None]:
df_train.deck.value_counts()

In [None]:
# Deck labels ordered from the most to the least frequent.
names_deck = df_train['deck'].value_counts().index.tolist()
names_deck

In [None]:
fig = px.bar(df_train, x=names_deck, y=df_train.deck.value_counts(), title="Zoom on Deck")
fig.show()

## D - Grouped Analysis

In [None]:
# lets define a function to plot a bar plot easily

def bar_plot(df,x,x_title,y,title,colors=None,text=None):
    """Show a grouped Plotly bar chart with the bar values printed inside the bars.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data to plot.
    x : str
        Column of ``df`` used for the x axis.
    x_title : str
        Axis label that replaces the default "index" label.
    y : list of str
        Columns of ``df`` to draw, one group of bars per column.
    title : str
        Figure title.
    colors : optional
        Forwarded to ``px.bar`` as ``color``.
    text : optional
        Forwarded to ``px.bar`` as ``text``; the per-bar values written below
        replace it on the traces that correspond to ``y``.
    """
    fig = px.bar(x=x,
                 y=y,
                 text=text,
                 labels={"index": x_title},                             # replaces default labels by column name
                 data_frame=df,
                 color=colors,
                 barmode='group',
                 template="simple_white",
                 color_discrete_sequence=px.colors.qualitative.Prism)

    # Print each bar's value inside it. Values come from the frame that was
    # passed in (not from a notebook-level global); zip() stops at the shorter
    # of the two sequences, so a trace/column count mismatch cannot raise.
    for trace, col in zip(fig.data, y):
        trace.text = df[col].values
        trace.textposition = 'inside'

    fig.update_layout(title=title)

    # Prettify legend entries: "total_sum" -> "Total sum".
    for trace in fig.data:
        trace.name = trace.name.replace('_',' ').capitalize()

    fig.update_yaxes(tickprefix="", showgrid=True)

    fig.show()

In [None]:
# Human-readable label column derived from the boolean Transported column:
# False -> 'Not Transported', True -> 'Transported'.
df_train['target'] = df_train['Transported'].map({False:'Not Transported',True:'Transported'})

In [None]:
df_train.head()

### VIP

In [None]:
# Transported / Not Transported counts for each VIP value.
temp = pd.DataFrame()

# One column per non-missing VIP value; missing values are tagged 'null' and skipped.
for vip in pd.unique(df_train['VIP'].fillna('null')).tolist():
    if vip != 'null':
        temp[vip] = df_train[df_train['VIP']==vip]['target'].value_counts()
 
# Rename the column labels to strings, then transpose so each VIP value is a row.
# NOTE(review): presumably the labels are the booleans False/True, which the
# 0/1 keys match because False == 0 and True == 1 — confirm against the data.
temp = temp.rename(columns={0:'False',1:'True'}).T
temp['Total sum'] = temp.sum(axis=1)

print(temp)

bar_plot(temp.reset_index(),
         'index',
         'VIP',
         ['Total sum','Transported','Not Transported'],
         title='Transported and Not transported grouped by VIP')

<h1>Conclusions</h1><br>
<li>As we saw earlier, very few passengers have VIP status.
<li>The transported rate is close between VIP and non-VIP passengers.

### Cabin

#### Deck

In [None]:
# Transported / Not Transported counts for each deck.
temp = pd.DataFrame()

# One column per deck; rows whose deck is the 'null' placeholder are skipped.
for d in pd.unique(df_train['deck'].fillna('null')).tolist():
    if d != 'null':
        temp[d] = df_train[df_train['deck']==d]['target'].value_counts()

# Transpose so each deck is a row.
# NOTE(review): the 0/1 rename looks copied from the VIP cell and presumably
# matches no deck label — confirm.
temp = temp.rename(columns={0:'False',1:'True'}).T
temp['Total sum'] = temp.sum(axis=1)

print(temp)

bar_plot(temp.reset_index(),
         'index',
         'Deck',
         ['Total sum','Transported','Not Transported'],
         title='Transported and Not transported grouped by deck')

<h1>Conclusions</h1><br>
<li>Most passengers are from decks F and G.
<li>We can see that the decks whose passengers have been transported are: B, G and C.
<li>Except for decks B and C, the transported rates are close to each other.

#### Cabin side

In [None]:
# Transported / Not Transported counts for each cabin side.
temp = pd.DataFrame()

# One column per side; rows whose side is the 'null' placeholder are skipped.
for side in pd.unique(df_train['side'].fillna('null')).tolist():
    if side != 'null':
        temp[side] = df_train[df_train['side']==side]['target'].value_counts()
 
# Transpose so each side is a row.
# NOTE(review): the 0/1 rename looks copied from the VIP cell and presumably
# matches no side label — confirm.
temp = temp.rename(columns={0:'False',1:'True'}).T
temp['Total sum'] = temp.sum(axis=1)

print(temp)

bar_plot(temp.reset_index(),
         'index',
         'Side',
         ['Total sum','Transported','Not Transported'],
         title='Transported and Not transported grouped by side')

<h1>Conclusions</h1><br>
<li> The number of passengers on the two sides, P and S, is nearly the same.
<li> Side S has a greater number of transported passengers.

#### Age - Numerical Feature

In [None]:
# Bucket the passengers by age. Missing ages are imputed with the mean age
# (truncated to an integer) before bucketing.
age_bins = [-1,11,18,22,27,33,40,66,100]
age_labels = ["<=11","11-18","19-22","23-27","28-33","34-40","41-66",">=67"]
df_train['age_category'] = pd.cut(df_train['Age'].fillna(df_train['Age'].mean()).astype(int),
                                  bins=age_bins, labels=age_labels)

# Transported / Not Transported counts per age bucket. crosstab keeps the rows
# in bucket order (youngest to oldest) and fills empty combinations with 0.
temp = (pd.crosstab(df_train['age_category'], df_train['target'])
          .reindex(columns=['Transported', 'Not Transported'], fill_value=0)
          .rename_axis(index=None, columns=None))
# Only the two count columns exist here, so the row sum is purely numeric.
temp['Total sum'] = temp.sum(axis=1)

# reset_index() exposes the age bucket as the 'index' column expected by bar_plot.
bar_plot(temp.reset_index(),
         'index',
         'Age Category',
         ['Total sum','Transported','Not Transported'],
         title='Transported and Not-transported grouped by Age column')


fig = make_subplots(rows=2, cols=2,
                    specs=[[{"colspan": 2}, None],
                           [{}, {}]],
                    subplot_titles=('Age distribution',
                                    'Transported',
                                    'Not Transported'))

fig.add_trace(go.Histogram(x=df_train['Age']),
              row=1, col=1)

fig.add_trace(go.Histogram(x=df_train[df_train['target']=='Transported']['Age']),
              row=2, col=1)

fig.add_trace(go.Histogram(x=df_train[df_train['target']=='Not Transported']['Age']),
              row=2, col=2)

fig.update_layout(showlegend=False, title_text='Distribution for Age')
fig.show()

<h1>Conclusions</h1><br>
<li>Most passengers were between 41 and 66.
<li>The Age column is positively skewed, meaning that few passengers were older than 50.
<li>The graph shows the relationship between Age and the transported rate. It becomes apparent that the age group between 15 and 30 has the worst transported rate, but the non-transported passengers are concentrated in that range too.
<li>We can conclude that the age category could be a good feature when combined with total_bill (as we did in the scatter plot).

In [None]:
df_train.info()

### ScatterMatrix

In [None]:
#correlation heatmap
# Only numeric and boolean columns can be correlated; object and categorical
# columns are excluded explicitly because recent pandas versions no longer
# drop them silently inside `.corr()`.
corr = df_train.select_dtypes(include=['number', 'bool']).corr()

# Boolean mask hiding the upper triangle (diagonal included), since the
# correlation matrix is symmetric.
mask = np.triu(np.ones_like(corr, dtype=bool))

cmap = sns.diverging_palette(180, 30, as_cmap=True)

with sns.axes_style('white'):
    fig, ax = plt.subplots(figsize=(25, 25))
    sns.heatmap(corr,  mask=mask, cmap=cmap, annot=True, center=0, vmin=-1, vmax=0.8,
                square=True, cbar_kws={'shrink':.5, 'orientation': 'vertical'}, linewidth=.02);

In [None]:
#Correlation Map
# Excluding only 'object' would keep the categorical age_category column,
# which cannot be correlated; select numeric and boolean columns instead.
ax = sns.clustermap(df_train.select_dtypes(include=['number', 'bool']).corr())

## GLOBAL ANALYSIS

In [None]:
#create a function to plot multi box plots easily

def multi_box(df,cat_col,dist_col,color_col):
    """Show one box plot of ``dist_col`` per (``cat_col`` value, ``color_col`` value) pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    cat_col : str
        Categorical column; each of its non-null values yields two boxes.
    dist_col : str
        Numeric column whose distribution is drawn.
    color_col : str
        Column with exactly two distinct values; it selects the fill colour.

    Returns
    -------
    str or None
        An explanatory message (and no plot) when ``color_col`` does not have
        exactly two distinct values, otherwise None after displaying the figure.
    """
    color_values = df[color_col].unique()
    if len(color_values) != 2:
        return 'Maximun number of unique values in the color columns is 2'

    # One translucent fill colour per value of color_col. Pairing colours with
    # values (instead of zipping against a fixed-length colour list) means no
    # box is dropped, however many categories cat_col has.
    palette = ['rgba(251, 43, 43, 0.5)', 'rgba(125, 251, 137, 0.5)']

    traces = []
    # unique() keeps order of first appearance, so box order and colours are
    # the same on every run.
    for c in df[cat_col].dropna().unique():
        in_category = df[cat_col] == c
        for cls, t in zip(palette, color_values):
            yd = df[in_category & (df[color_col] == t)][dist_col].values
            traces.append(go.Box(y=yd,
                                 name=str(c)+' ('+str(t)+')',
                                 boxpoints='all',
                                 jitter=0.5,
                                 whiskerwidth=0.2,
                                 fillcolor=cls,
                                 marker=dict(size=2),
                                 line=dict(width=1)))

    # Axis title fonts use the nested `title.font` form rather than the
    # deprecated `titlefont` attribute.
    layout = go.Layout(title='{} distribution colord by {} grouped by {}'.format(dist_col.title(),
                                                                                 color_col.title(),
                                                                                 cat_col.title()),
        xaxis=dict(title=dict(text=cat_col,
                              font=dict(size=16))),

        yaxis=dict(title=dict(text='Distribution',
                              font=dict(size=16)),
                   autorange=True,
                   showgrid=True,
                   zeroline=True,
                   dtick=5,
                   gridcolor='rgb(255, 255, 255)',
                   gridwidth=1,
                   zerolinecolor='rgb(255, 255, 255)',
                   zerolinewidth=2),

        margin=dict(l=40,
                    r=30,
                    b=80,
                    t=100),

        paper_bgcolor='rgb(255, 255, 255)',
        plot_bgcolor='rgb(255, 243, 192)',
        showlegend=False)

    fig = go.Figure(data=traces, layout=layout)
    iplot(fig)

In [None]:
#create a function to plot a multi-violin easily

def multi_violin(df,iter_col,dist_col,color_col='target'):
    """Show split violins of ``dist_col`` for every value of ``iter_col``.

    For each distinct non-null value of ``iter_col`` two half-violins are
    drawn: the first value of ``color_col`` on the negative (left) side and
    the second on the positive (right) side.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    iter_col : str
        Categorical column; one split violin is drawn per distinct value.
    dist_col : str
        Numeric column whose distribution is drawn.
    color_col : str, default 'target'
        Column with exactly two distinct values.

    Returns
    -------
    str or None
        An explanatory message (and no plot) when ``color_col`` does not have
        exactly two distinct values, otherwise None after displaying the figure.
    """
    color_values = df[color_col].unique()
    if len(color_values) != 2:
        return 'Maximun number of unique values in the color columns is 2'

    # (value of color_col, violin side, outline colour)
    halves = [(color_values[0], 'negative', '#F78181'),
              (color_values[1], 'positive', '#00FF40')]

    data = []
    # Loop over the distinct categories, not over every row of the column.
    for i, ite in enumerate(df[iter_col].dropna().unique()):
        in_group = df[iter_col] == ite
        for value, side, line_color in halves:
            # Filter by category AND colour value so x and y have the same
            # length and each half only shows its own passengers.
            subset = df[in_group & (df[color_col] == value)]
            data.append(go.Violin(x=subset[iter_col],
                                  y=subset[dist_col],
                                  name=str(value),
                                  jitter=0,
                                  meanline={'visible':True},
                                  line={"color": line_color},
                                  side=side,
                                  marker=dict(color= '#81F781'),
                                  # Only the first category contributes legend entries.
                                  showlegend=(i==0)))

    # The axis title font uses the nested `title.font` form rather than the
    # deprecated `titlefont` attribute.
    layout = dict(title='Distribution of {} column for each {} colored by {}'.format(dist_col.replace('_',' ').title(),
                                                                                     iter_col.replace('_',' ').title(),
                                                                                     color_col.replace('_',' ').title()),
                  width=1000,height=600,
                  yaxis=dict(title=dict(text='Distribution',font=dict(size=20))))

    fig = go.Figure(data=data, layout=layout)
    iplot(fig)

In [None]:
multi_box(df_train.dropna(),'HomePlanet','Age','target')

In [None]:
multi_box(df_train.dropna(),'CryoSleep','Age','target')

In [None]:
multi_box(df_train.dropna(),'Destination','Age','target')

In [None]:
multi_box(df_train.dropna(),'deck','Age','target')

In [None]:
multi_box(df_train.dropna(),'side','Age','target')

In [None]:
multi_box(df_train.dropna(),'VIP','Age','target')

## Name Analysis ?

--> Try to get the lastname to detect family group

In [None]:
df_train['last_Name'] = df_train.Name.dropna().apply(lambda x: x.split(' ')[-1])

In [None]:
# Last names ordered from the most to the least frequent.
names_passenger = df_train['last_Name'].value_counts().index.tolist()

In [None]:
fig = px.bar(df_train, x=names_passenger[:30], y=df_train.last_Name.value_counts()[:30], title="Zoom on Last Name")
fig.show()

In [None]:
fig = px.bar(df_train.dropna(), x="last_Name", y="Age", color="Destination",
             facet_row="target", facet_col="HomePlanet", hover_data = ['CryoSleep','VIP', 'deck','side'])
fig.show()

In [None]:
fig = px.bar(df_train.dropna(), x="last_Name", y="Age", color="deck",
             facet_row="target", facet_col="side", hover_data = ['HomePlanet','Destination','CryoSleep','VIP','Cabin'])
fig.show()

<h1>Conclusions</h1><br>
<li> It is interesting to take a deeper look at the information in the last_name category.
<li> When we compare people with the same last_name, we can identify two things:
        - CryoSleep is a good feature for analysing why some people are transported or not
        - The cabin information (deck and side) gives us some information too


<h1>Phase II : Data Processing</h1>
<a id="Phase II"></a>

In [None]:
# Data preparation for machine learning modeling

train = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
test = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
test_PassengerId = test['PassengerId'].values #for submission

In [None]:
def extract_cabin_and_name(frame):
    """Return a copy of ``frame`` with Cabin/Name-derived features and the raw columns removed.

    Added columns (NaN wherever the source value is missing):

    - 'deck', 'num_cabin' (float64), 'side': the three parts of Cabin,
      which has the form "deck/num/side";
    - 'last_Name': the last whitespace-separated token of Name.

    Dropped columns: 'Name', 'PassengerId', 'Cabin'.
    """
    out = frame.copy()

    # Split on '/' rather than slicing by position, so deck or side labels
    # longer than one character are still parsed correctly.
    cabin_parts = out['Cabin'].str.split('/', expand=True)
    out['deck'] = cabin_parts[0]
    # to_numeric copes with the missing entries left by missing cabins.
    out['num_cabin'] = pd.to_numeric(cabin_parts[1]).astype('float64')
    out['side'] = cabin_parts[2]

    # Extract last_name from Name
    out['last_Name'] = out['Name'].str.split(' ').str[-1]

    # Identifier and raw text columns are not used as model features.
    return out.drop(columns=['Name', 'PassengerId', 'Cabin'])


# Apply exactly the same feature extraction to the train and test sets.
train = extract_cabin_and_name(train)
test = extract_cabin_and_name(test)

In [None]:
train.head()

In [None]:
test.head()

In [None]:
train.isnull().sum().plot(figsize=(12,8), title='Null values by feature on Train')

### Imputed missing value

In [None]:
df_train.info()

In [None]:
for col in train.select_dtypes(include='object'):
    print(col)

In [None]:
for col in train.select_dtypes(exclude='object'):
    print(col)