In [None]:

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.impute import SimpleImputer



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install dataprep

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy import stats as st

from dataprep.eda import plot, plot_correlation, create_report, plot_missing

# Suppress all warnings for the rest of the session (this also hides library
# deprecation notices).
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Load the competition's training and test splits from the Kaggle input directory.
train, test = (
    pd.read_csv('/kaggle/input/spaceship-titanic/train.csv'),
    pd.read_csv('/kaggle/input/spaceship-titanic/test.csv'),
)

In [None]:
train.shape

In [None]:
train.describe()

In [None]:
train.info()

In [None]:
plot(train)

### Analysis

- PassengerId and Name are identifying details of each passenger
- HomePlanet, Destination, CryoSleep and VIP are categorical; of these, CryoSleep and VIP are Boolean variables
- Age, RoomService, FoodCourt, ShoppingMall, Spa and VRDeck are numerical

In [None]:
train.isnull().sum()

In [None]:
test.isnull().sum()

### Data Cleaning

- Analysing each column with null values, starting with the object-dtype columns

In [None]:
train.shape

In [None]:
test.shape

# Analyzing Boolean Variables

### Analyzing CryoSleep

In [None]:
train['CryoSleep'].value_counts()

In [None]:
test['CryoSleep'].value_counts()

In [None]:
# Round-trip CryoSleep through its text form so every entry ends up as
# True, False or NaN, then report what is left.
cryo_text = train['CryoSleep'].astype(str).str.strip()
text_to_flag = {'True': True, 'False': False, '<NA>': np.nan, 'nan': np.nan}
train['CryoSleep'] = cryo_text.replace(text_to_flag)

print('Unique Value ======================', train['CryoSleep'].unique())
print('Null Instances of CryoSleep ======================', train['CryoSleep'].isnull().sum())

In [None]:
# Count of passengers per CryoSleep value, split by the Transported outcome.
fig, ax = plt.subplots(figsize=(14, 8))
ax.set_title("CryoSleep vs Transported")
sns.countplot(data=train, x='CryoSleep', hue='Transported', ax=ax)

### Analysis
- It can be seen that when CryoSleep is True, many passengers were transported

### Lets use SimpleImputer to replace missing values

In [None]:
# Shared imputer: replaces NaN entries with the most frequent value of the
# column it is fitted on.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')


In [None]:
train['CryoSleep'].unique()

In [None]:
# Fit the most-frequent imputer on the training column only, then apply that
# same fitted value to the test column. Re-fitting on the test set would fill
# the two splits from different statistics and learn from test data.
train['CryoSleep'] = imputer.fit_transform(train['CryoSleep'].values.reshape(-1,1))[:,0]
test['CryoSleep'] = imputer.transform(test['CryoSleep'].values.reshape(-1,1))[:,0]

### Analyzing VIP

In [None]:
train['VIP'].value_counts()

In [None]:
test['VIP'].isnull().sum()

In [None]:
# Round-trip the training VIP column through its text form so every entry ends
# up as True, False or NaN.
vip_text = train['VIP'].astype(str).str.strip()
train['VIP'] = vip_text.replace({'True': True, 'False': False, '<NA>': np.nan, 'nan': np.nan})

# Report distinct values and remaining nulls, first for train and then for test.
for frame in (train, test):
    print('Unique Value ======================', frame['VIP'].unique())
    print('Null Instances of VIP ======================', frame['VIP'].isnull().sum())

Similar to CryoSleep, VIP is also a Boolean column. Since it cannot be filled with an arbitrary value, its null values are imputed with the most frequent value (no rows are dropped)

In [None]:
# Fit the most-frequent imputer on the training column only, then apply that
# same fitted value to the test column. Re-fitting on the test set would fill
# the two splits from different statistics and learn from test data.
train['VIP'] = imputer.fit_transform(train['VIP'].values.reshape(-1,1))[:,0]
test['VIP'] = imputer.transform(test['VIP'].values.reshape(-1,1))[:,0]

In [None]:
train.describe()

- For the following numerical features, let's replace the null values with the column mean

 - Age
 - RoomService
 - FoodCourt
 - ShoppingMall
 - Spa
 - VRDeck

In [None]:
# Numeric columns whose missing values are replaced by the column mean.
numeric_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Assign the filled columns back to the frame. Calling fillna(inplace=True) on
# train[col] works on an intermediate object; pandas warns about this pattern
# and, with Copy-on-Write enabled, it leaves `train` unchanged.
# DataFrame.mean() skips NaN, and fillna with a Series fills each column with
# the value stored under that column's label.
train[numeric_cols] = train[numeric_cols].fillna(train[numeric_cols].mean())

- Performing the same on the test dataset

In [None]:
# Numeric columns whose missing values are replaced by a column mean.
numeric_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Fill the test set with the *training* means so both splits are imputed with
# the same values and no statistic is learned from the test data.
# The result is assigned back rather than using fillna(inplace=True) on
# test[col], which pandas warns about and which leaves `test` unchanged
# when Copy-on-Write is enabled.
test[numeric_cols] = test[numeric_cols].fillna(train[numeric_cols].mean())

## Analyzing Cabin

In [None]:
len(train['Cabin'].unique())

- Cabin contains 6312 unique values. Rows with a null Cabin are not dropped; instead Cabin is split into its Deck, CabinNum and Side parts

In [None]:
# Break the "Deck/CabinNum/Side" Cabin string into three separate columns
# on both frames.
cabin_parts = ["Deck", "CabinNum", "Side"]
for frame in (train, test):
    frame[cabin_parts] = frame['Cabin'].str.split("/", expand = True)


In [None]:
# Column name -> label used in the printed report.
cabin_labels = {'CabinNum': 'cabinumber', 'Side': 'side', 'Deck': 'Deck'}

# Number of distinct values (NaN included) of each Cabin part in train.
for column, label in cabin_labels.items():
    print(f'Unique len of {label}', len(train[column].unique()))

print('===============TEST Values =================')
# Same for test, followed by the number of nulls in each part.
for column, label in cabin_labels.items():
    print(f'Unique len of {label}', len(test[column].unique()), test[column].isnull().sum())

- Dropping CabinNum and Cabin from both the train and test datasets


In [None]:
# Remove the raw Cabin string and the cabin number from both frames.
cabin_columns_to_drop = ['Cabin', 'CabinNum']
train = train.drop(columns=cabin_columns_to_drop)
test = test.drop(columns=cabin_columns_to_drop)

- Imputing the Side and Deck with most frequent values

In [None]:
# Fit the most-frequent imputer on the training column only, then apply that
# same fitted value to the test column. Re-fitting on the test set would fill
# the two splits from different statistics and learn from test data.
train['Side'] = imputer.fit_transform(train['Side'].values.reshape(-1,1))[:,0]
test['Side'] = imputer.transform(test['Side'].values.reshape(-1,1))[:,0]

In [None]:
# Convert Deck to text, map the textual null markers back to NaN, then show
# the distinct values that remain.
deck_text = train['Deck'].astype(str)
train['Deck'] = deck_text.replace({'<NA>':np.nan,'nan':np.nan})
train['Deck'].unique()


In [None]:
# Fit the most-frequent imputer on the training column only, then apply that
# same fitted value to the test column. Re-fitting on the test set would fill
# the two splits from different statistics and learn from test data.
train['Deck'] = imputer.fit_transform(train['Deck'].values.reshape(-1,1))[:,0]
test['Deck'] = imputer.transform(test['Deck'].values.reshape(-1,1))[:,0]

### Analyzing HomePlanet

In [None]:
print(train['HomePlanet'].value_counts())

print(train['HomePlanet'].unique())


- Before replacing the null values for HomePlanet, let's look at how passengers are distributed across home planets and how HomePlanet relates to the Transported variable

In [None]:
# Number of passengers per home planet.
fig, ax = plt.subplots(figsize=(14, 8))
ax.set_title("Travel from Home planet")
sns.countplot(data=train, x='HomePlanet', ax=ax)

### Analysis
- The largest number of passengers travelled from planet Earth, followed by Europa and Mars
- Hence let's replace the null instances with the mode of HomePlanet



In [None]:
# Convert HomePlanet to text and map the textual null markers back to NaN.
null_markers = {'<NA>': np.nan, 'nan': np.nan}
train['HomePlanet'] = train['HomePlanet'].astype(str).replace(null_markers)

In [None]:
# Fill missing HomePlanet values with the most frequent planet in the training
# data, and use that same value for the test set.
# Series.mode() ignores NaN and works on string columns. scipy.stats.mode is
# intended for numeric arrays, and indexing its result with [0] fails on recent
# SciPy releases, where a 1-D input yields a scalar mode.
# The filled column is assigned back instead of using fillna(inplace=True) on
# a column selection, which does not update the frame under Copy-on-Write.
home_planet_mode = train['HomePlanet'].mode()[0]
train['HomePlanet'] = train['HomePlanet'].fillna(home_planet_mode)
test['HomePlanet'] = test['HomePlanet'].fillna(home_planet_mode)

In [None]:
# Number of passengers per home planet, split by the Transported outcome.
fig, ax = plt.subplots(figsize=(14, 8))
ax.set_title("Travel vs Transported from Home planet")
sns.countplot(data=train, x='HomePlanet', hue='Transported', ax=ax)

In [None]:
# Share of passengers from each home planet that were transported.
for planet in ('Europa', 'Mars', 'Earth'):
    from_planet = train[train['HomePlanet'] == planet]
    transported = from_planet[from_planet['Transported'] == True]
    print(f'{planet} vs Transported ', len(transported) / len(from_planet))

- It can be observed that more than 65% of the passengers from Europa were transported
- From Earth, 42% of the passengers were transported successfully while the rest were not
- From Mars, 52% of the passengers were transported successfully while the rest were not

### Analyzing Destination


In [None]:
print(train['Destination'].value_counts())

print(train['Destination'].unique())

Further analyzing the Destination feature

In [None]:
# Convert Destination to text and map the textual null markers back to NaN.
null_markers = {'<NA>': np.nan, 'nan': np.nan}
train['Destination'] = train['Destination'].astype(str).replace(null_markers)

In [None]:
print(train['Destination'].unique())

In [None]:
# Number of passengers per destination.
fig, ax = plt.subplots(figsize=(14, 8))
ax.set_title("Travel to Destination")
sns.countplot(data=train, x='Destination', ax=ax)

### Analysis
- TRAPPIST-1e seems to be the most preferred destination, followed by 55 Cancri e and PSO J318.5-22

### Using mode to replace the null values for destination

In [None]:
# Fill missing Destination values with the most frequent destination in the
# training data, and use that same value for the test set.
# Series.mode() ignores NaN and works on string columns. scipy.stats.mode is
# intended for numeric arrays, and indexing its result with [0] fails on recent
# SciPy releases, where a 1-D input yields a scalar mode.
# The filled column is assigned back instead of using fillna(inplace=True) on
# a column selection, which does not update the frame under Copy-on-Write.
destination_mode = train['Destination'].mode()[0]
train['Destination'] = train['Destination'].fillna(destination_mode)
test['Destination'] = test['Destination'].fillna(destination_mode)

In [None]:
# Number of passengers per destination, split by the Transported outcome.
fig, ax = plt.subplots(figsize=(14, 8))
ax.set_title("Travel vs Transported to Destination planet")
sns.countplot(data=train, x='Destination', hue='Transported', ax=ax)

In [None]:
# Share of passengers bound for each destination that were transported.
for destination in ('TRAPPIST-1e', 'PSO J318.5-22', '55 Cancri e'):
    to_destination = train[train['Destination'] == destination]
    transported = to_destination[to_destination['Transported'] == True]
    print(f'{destination} vs Transported ', len(transported) / len(to_destination))

In [None]:
train.isnull().sum()

In [None]:
test.isnull().sum()

- Some rows still have a null passenger Name. Let's drop the Name column

In [None]:
test_id = test['PassengerId']
test_id

In [None]:
# Remove the passenger Name column from both frames.
train = train.drop(columns='Name')
test = test.drop(columns='Name')

### Analysis
- Now the train and test data have been cleaned. Let's do further analysis and start with model building

In [None]:
# Number of passengers in each Transported class.
fig, ax = plt.subplots(figsize=(14, 8))
ax.set_title("Transported vs Failed to Transported")
sns.countplot(data=train, x='Transported', ax=ax)

### Analysis
- It can be seen that there is no class imbalance between transported and non-transported passengers

## Data Preparation and Initializing H2O

In [None]:
%%capture

!pip install h2o

In [None]:
import h2o
h2o.init()

### Analysis
- Removing passenger id

In [None]:
train.info()

In [None]:
df_train = train.copy()
df_test = test.copy()

In [None]:
# Remove the PassengerId column from both modelling frames.
df_train = df_train.drop(columns='PassengerId')
df_test = df_test.drop(columns='PassengerId')


In [None]:
# Build an H2OFrame from the prepared training DataFrame.

h2o_df = h2o.H2OFrame(df_train)

In [None]:
h2o_df_test = h2o.H2OFrame(df_test)

In [None]:
df_train.columns

In [None]:
# Hold out roughly 20% of the rows for validation. The seed is fixed so the
# same rows land in each split on every run; without it the hold-out, and every
# metric computed on it, changes between runs.
h2o_train, h2o_val = h2o_df.split_frame(ratios=[.80], seed=1)


# Identify predictors and response: every column except the target is a predictor.
x = h2o_train.columns
y = "Transported"
x.remove(y)

In [None]:
from h2o.automl import H2OAutoML

In [None]:
# Configure an AutoML run capped at 600 seconds with a fixed seed, no class
# balancing, and the project name 'SpaceShip'; then train it on the training
# split and time the call.
# NOTE(review): the H2O AutoML documentation states that validation_frame is
# ignored unless nfolds == 0 - confirm whether leaderboard_frame was intended
# for the hold-out split.
aml = H2OAutoML(max_runtime_secs=600,
                #exclude_algos=['DeepLearning'],
                seed=1,
                #stopping_metric='logloss',
                #sort_metric='logloss',
                balance_classes=False,
                project_name='SpaceShip'
)
%time aml.train(x=x, y=y, training_frame=h2o_train, validation_frame=h2o_val)

## AutoML leaderboard

In [None]:
# View the AutoML Leaderboard
lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)

In [None]:
# Score the leader on the held-out validation split. Called with no arguments,
# model_performance() returns the metrics computed on the training data, which
# overstate how well the model generalises.
perf = aml.leader.model_performance(test_data=h2o_val)

In [None]:
perf.plot()

### Analysis 
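- AUC of the model was 93.71% in the recorded run; the exact figure varies between runs and depends on which data the performance is computed on
- StackedEnsemble_AllModels_3_AutoML_1_20220521_161843 was selected as the top performing model in that run

- Let's try to predict using the best performing model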


In [None]:
len(h2o_df_test)

In [None]:
predicted_values = aml.leader.predict(h2o_df_test)

In [None]:
df_predict = h2o.as_list(predicted_values)
len(df_predict)

## Finally saving the predicted values

In [None]:
len(test_id)

In [None]:
# Pair each test PassengerId with its predicted label and write the
# submission file without the index column.
df = test_id.to_frame(name='PassengerId').assign(Transported=df_predict['predict'].values)
df.to_csv('submission.csv', index = False)