In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Notebook-wide plotting defaults: wide figures drawn on a white grid.
plt.rcParams['figure.figsize'] = (20, 7)
sns.set_style('whitegrid')

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas / seaborn.
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Load the Spaceship Titanic training set from the Kaggle input directory.
data = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')

## Check for Null values and Data Types

In [None]:
def data_info(*, data):
    """Print a shape and missing-value summary for a DataFrame.

    Prints, separated by rule lines: the frame's shape, the total number of
    null cells, the per-column null counts (only columns that have nulls),
    and the percentage of null cells rounded to a whole number.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise (keyword-only).

    Returns
    -------
    None
        The summary is written to stdout.
    """
    separator = "====" * 10

    # Compute the per-column null counts once and reuse them below.
    nulls_per_column = data.isnull().sum()
    total_nulls = nulls_per_column.sum()

    # data.size is rows * columns. It replaces np.product(data.shape), which
    # was removed in NumPy 2.0. An empty frame reports 0% instead of dividing
    # by zero.
    cell_count = data.size
    null_percent = round(float(total_nulls) / cell_count * 100) if cell_count else 0

    print(f'Shape of the data: {data.shape}')
    print(separator)
    print(f'Total Number of Null values: {total_nulls}')
    print(separator)
    print("Columns With Null values")
    print(nulls_per_column[nulls_per_column > 0])
    print(separator)
    print(f"% of Null Values in dataset {null_percent}%")

In [None]:
# Print the shape and null-value summary for the training data.
data_info(data=data)

## Feature Engineering

In [None]:
# Group id: the part of PassengerId before the underscore.
# The vectorised .str accessor replaces a per-row lambda and propagates a
# missing PassengerId as NaN instead of raising AttributeError.
data['gggg_group'] = data['PassengerId'].str.split('_').str[0]

In [None]:
# Split Cabin into its deck / num / side parts (separated by '/').
# Null cabins are back-filled, i.e. each missing value takes the next
# non-null value below it. Series.bfill() replaces the deprecated
# fillna(method='bfill') spelling.
data['Cabin'] = data['Cabin'].bfill()

# One vectorised split instead of three per-row lambdas. A trailing NaN that
# back-fill cannot reach stays NaN in every derived column rather than raising
# on float.split.
cabin_parts = data['Cabin'].str.split('/', expand=True)
data['Cabin_deck'] = cabin_parts[0]
data['Cabin_num'] = cabin_parts[1]
data['Cabin_sides'] = cabin_parts[2]

## Descriptive Analysis

In [None]:
def unilatral_info(*, data):
    """Print univariate summaries of the spaceship passenger data.

    Each summary is preceded by a rule line. The frame must contain the
    columns gggg_group, HomePlanet, CryoSleep, Destination, Cabin_deck,
    Cabin_num, Cabin_sides and VIP.

    Parameters
    ----------
    data : pandas.DataFrame
        Passenger frame to summarise (keyword-only).
    """
    rule = "====" * 10

    # (label, thunk) pairs. Each value is computed lazily, just before it is
    # printed, so output order matches a straight sequence of print calls.
    summaries = (
        ("Passengers groups has boarded the ship ",
         lambda: len(data['gggg_group'].value_counts())),
        ("Number of Planet's passenger boarded the ship ",
         lambda: len(data['HomePlanet'].value_counts())),
        ("Plannet Wise passenger distribution \n",
         lambda: data['HomePlanet'].value_counts()),
        ("Passengers have been put to CyroSleep \n",
         lambda: data['CryoSleep'].value_counts()),
        ("Passengers Destination \n",
         lambda: data['Destination'].value_counts()),
        ("Total Cabin Decks \n",
         lambda: data['Cabin_deck'].value_counts()),
        ("Total Cabin Numbers \n",
         lambda: len(data['Cabin_num'].unique())),
        ("Total Cabin Sides \n",
         lambda: data['Cabin_sides'].value_counts()),
        ("VIP Customer Boarded the Ship \n",
         lambda: data['VIP'].value_counts()),
    )

    for label, compute in summaries:
        print(rule)
        print(f"{label}{compute()}")

In [None]:
# Print the per-column summaries for the training data (needs the gggg_group and Cabin_* columns added above).
unilatral_info(data=data)

## Numerical Distribution

In [None]:

# Distribution of Age and of three spending columns on a 2x2 grid.
# Each title names the column that is actually plotted.
numeric_panels = {
    'Age': "Distribution of Age in the Passenger List",
    'RoomService': "Distribution of Room Service Spending",
    'FoodCourt': "Distribution of Food Court Spending",
    'ShoppingMall': "Distribution of Shopping Mall Spending",
}

fig, axes = plt.subplots(2, 2, figsize=(14, 7))
for ax, (column, title) in zip(axes.flat, numeric_panels.items()):
    # histplot(kde=True, stat='density') is seaborn's documented replacement
    # for the deprecated distplot (histogram plus KDE on a density scale).
    sns.histplot(x=data[column], kde=True, stat='density', ax=ax)
    ax.set_title(title)

fig.tight_layout()
plt.show()

## Bivariate Analysis with Target Variable: Transported

In [None]:
# add data annotation to the graph columns
# for this function pass, the graph reference and data frame as parameters
def add_annotate(ax, data, y_offset=100):
    """Label every bar of a count plot with its count and share of all rows.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes whose ``patches`` are the bars to label.
    data : pandas.DataFrame
        Frame the plot was drawn from; ``data.shape[0]`` is the denominator
        for the percentage.
    y_offset : float, default 100
        Vertical distance (in data units) between the top of the bar and the
        label.

    Notes
    -----
    The percentage is always relative to the total row count, including on
    hue-split plots.
    """
    total_rows = data.shape[0]
    if total_rows == 0:
        # Nothing to express a percentage of.
        return

    for patch in ax.patches:
        height = patch.get_height()

        # Skip placeholder patches with no height (NaN or 0). seaborn can add
        # these for empty category/hue combinations, and labelling them would
        # print "nan, nan%" or a stray "0" on the axis.
        if np.isnan(height) or height == 0:
            continue

        text_x = patch.get_x() + patch.get_width() / 2
        text_y = height + y_offset

        percent = round((height / total_rows) * 100, 1)
        # Bar heights arrive as floats; show whole counts without ".0".
        count = int(height) if float(height).is_integer() else height
        label_text = f"{count}, {percent}%"

        # Centre the label over the bar (the alignment used to be computed
        # but was never passed to ax.text).
        ax.text(text_x, text_y, label_text, ha='center', color='black')

In [None]:
# Preview the first rows, including the engineered gggg_group and Cabin_* columns.
data.head()

### Categorical Analysis

In [None]:
# Count plots: the target on its own, then each categorical feature split by
# the target. Each entry is (x column, title, hue column or None).
categorical_panels = [
    ('Transported', "Number of Passengers Transported", None),
    ('HomePlanet', "Home Planet vs Transported", 'Transported'),
    ('CryoSleep', "CryoSleep vs Transported", 'Transported'),
    ('Destination', "Destination vs Transported", 'Transported'),
    ('Cabin_deck', "Cabin_deck vs Transported", 'Transported'),
    ('Cabin_sides', "Cabin_sides vs Transported", 'Transported'),
]

# Six panels fit a 3x2 grid exactly; the previous 4x2 grid left an empty row.
fig, axes = plt.subplots(3, 2, figsize=(16, 15))
for ax, (column, title, hue) in zip(axes.flat, categorical_panels):
    sns.countplot(x=column, hue=hue, data=data, ax=ax)
    ax.set_title(title, fontsize=12)
    add_annotate(ax=ax, data=data)

fig.tight_layout()
plt.show()

### Hybrid Analysis - Cabin Analysis

In [None]:
# Columns iterated over by the bar-plot loop (one subplot per column).
cont_vars = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']

In [None]:
# One bar chart per column in cont_vars: the column's value per Cabin_deck,
# split by Transported (seaborn's barplot aggregates with its default estimator).
plt.figure(figsize=(14, 20))
nrows, ncols = 5, 1
for position, var in enumerate(cont_vars, start=1):
    plt.subplot(nrows, ncols, position)
    sns.barplot(data=data, x=var, y='Cabin_deck', hue='Transported')
    plt.title(f"Passengers Cabin deck expenditure in {var} with Transported")

plt.tight_layout()
plt.show()

### Spending of Transported Passengers

In [None]:
# Scatter each remaining spending column against RoomService, coloured by
# Transported.
# NOTE(review): this rebinds cont_vars, which was defined earlier with
# RoomService included. Re-run that definition before re-running cells that
# expect the five-column list.
cont_vars = ['FoodCourt','ShoppingMall','Spa','VRDeck']

# Derive the grid from the list; a hard-coded 5 rows left an empty fifth slot.
nrows = len(cont_vars)
ncols = 1
plt.figure(figsize=(14, 4 * nrows))
for c, var in enumerate(cont_vars, start=1):
    plt.subplot(nrows, ncols, c)
    plt.title(f"Transported passengers in terms of sepending in {var} and Room Service")
    sns.scatterplot(x=var, y='RoomService', hue='Transported', data=data)
plt.tight_layout()
plt.show()

#### Report
- 50% of people were Transported.
- People who spend more in the food court and are from deck T have a higher Transported rate.
- People who spend more in the shopping mall have a higher Transported rate.
- People whose home planet is Mars or Europa have a higher % of Transported people.
- People in CryoSleep have a higher % of being Transported.
- People with destination 55 Cancri e or PSO J318.5-22 have a higher % of Transported people.
- People in cabin decks B and C have higher chances of being Transported.