In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier

# Global plotting look: white canvas, black text, no grid lines.
bg_color = 'white'
# Ten-colour palette used for bar colours further down in the notebook.
ktcolors = [
    '#d0384e', '#ee6445', '#fa9b58', '#fece7c', '#fff1a8',
    '#f4faad', '#d1ed9c', '#97d5a4', '#5cb7aa', '#3682ba',
]

rc_overrides = {
    "font.style": "normal",
    "font.size": 10,
    "figure.figsize": (5.0, 5.0),
    "figure.facecolor": bg_color,
    "axes.facecolor": bg_color,
    "axes.grid": False,
    "axes.labelsize": 20,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
}
# Every text element (labels, ticks, generic text) is drawn in black.
rc_overrides.update(
    {key: "black" for key in ("text.color", "xtick.color", "ytick.color", "axes.labelcolor")}
)
sns.set(rc=rc_overrides)

import os
# Print the full path of every file found under /kaggle/input.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

## Import data from files

In [None]:
# Read the training split and preview its first rows.
train_csv_path = '/kaggle/input/spaceship-titanic/train.csv'
df_train = pd.read_csv(train_csv_path)
df_train.head()

In [None]:
# Read the test split and preview its first rows.
test_csv_path = '/kaggle/input/spaceship-titanic/test.csv'
df_test = pd.read_csv(test_csv_path)
df_test.head()

## Basic EDA, nulls handling and pre-processing

In [None]:
# Summarise the training frame: column dtypes and non-null counts.
df_train.info()

### Categories


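Some features (`CryoSleep`, `VIP`) are boolean but are stored with dtype `object`.
Casting them to `bool` gives a proper dtype, but note that `astype(bool)` maps missing values (NaN) to `True`, so the empty values have to be filled deliberately rather than left to the cast.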

In [None]:
# Convert the boolean-like object columns to a real bool dtype.
# A bare `astype(bool)` would turn every missing value into True (NaN is truthy),
# so missing entries are set explicitly to the column's most frequent value.
for col in ['CryoSleep', 'VIP']:
    missing = df_train[col].isna()  # remember the gaps before the cast hides them
    most_common = bool(df_train[col].mode(dropna=True).iloc[0])
    df_train[col] = df_train[col].astype(bool).mask(missing, most_common)

df_train.info()

In [None]:
# Same bool conversion for the test frame.
# `astype(bool)` alone maps NaN to True, so the missing entries are overwritten
# with the most frequent observed value of the respective test column.
for col in ['CryoSleep', 'VIP']:
    missing = df_test[col].isna()  # remember the gaps before the cast hides them
    most_common = bool(df_test[col].mode(dropna=True).iloc[0])
    df_test[col] = df_test[col].astype(bool).mask(missing, most_common)


What is in Name?

In [None]:
# Show 15 randomly drawn training rows (no seed is passed, so the rows differ between runs).
df_train.sample(15)

I don't think Name is relevant here.

In [None]:
# Drop the unnecessary `Name` column from both frames (in place).
for frame in (df_train, df_test):
    frame.drop(columns=['Name'], inplace=True)

#### HomePlanet

In [None]:
# List the distinct HomePlanet values found in the training data.
df_train.HomePlanet.unique()

In [None]:
# Store HomePlanet as a pandas categorical in both the training and test frames.
for frame in (df_train, df_test):
    frame['HomePlanet'] = frame['HomePlanet'].astype('category')

df_train.info()

#### Destination

In [None]:
# List the distinct Destination values found in the training data.
df_train.Destination.unique()

In [None]:
# Store Destination as a pandas categorical in both the training and test frames.
for frame in (df_train, df_test):
    frame['Destination'] = frame['Destination'].astype('category')

df_train.info()

### What about nulls?

In [None]:
# Count the missing values in each column of the training frame.
df_train.isnull().sum()

... so many nulls...

In [None]:
# Count the missing values in each column of the test frame.
df_test.isnull().sum()

Missing numerical values will be filled with the mean for `Age` and with 0 for the spending columns (RoomService, FoodCourt, ShoppingMall, Spa, VRDeck).

In [None]:
# Impute missing ages with the mean age.
# The mean is computed on the training data only and reused for the test data,
# so both frames receive the same fill value and no test statistic enters
# the preprocessing.
# Assigning the result back (instead of `df[col].fillna(..., inplace=True)`)
# avoids chained in-place modification, which pandas warns about and which
# has no effect once Copy-on-Write is active.
age_mean = df_train['Age'].mean()
df_train['Age'] = df_train['Age'].fillna(age_mean)
df_test['Age'] = df_test['Age'].fillna(age_mean)
df_train.isnull().sum()

In [None]:
# Treat a missing amount in any spending column as "nothing spent" (0).
pricing_cols = ['RoomService', 'FoodCourt', 'ShoppingMall','Spa', 'VRDeck']
# Fill all spending columns at once and assign the result back; calling
# `df[col].fillna(0, inplace=True)` on a selected column is chained in-place
# modification, which pandas warns about and ignores under Copy-on-Write.
df_train[pricing_cols] = df_train[pricing_cols].fillna(0)
df_test[pricing_cols] = df_test[pricing_cols].fillna(0)

df_train.isnull().sum()

Categorical features will be filled with their most common value:

In [None]:
# Fill the remaining gaps with the most frequent training value of each column.
# Only columns present in BOTH frames are considered; this skips the target
# (which the test frame lacks) by name rather than by assuming it is the last column.
shared_cols = [col for col in df_train.columns if col in df_test.columns]
for col in shared_cols:
    # Nothing to do when neither frame has a gap in this column.
    if not (df_train[col].isnull().any() or df_test[col].isnull().any()):
        continue

    counts = df_train[col].value_counts()
    if counts.empty:
        # Column holds no observed training value at all, so there is nothing to fill with.
        continue
    most_common = counts.index[0]  # value_counts() is sorted by descending frequency

    df_train[col] = df_train[col].fillna(most_common)
    df_test[col] = df_test[col].fillna(most_common)

In [None]:
# Report the remaining missing values per column and in total.
train_nans = df_train.isnull().sum()
test_nans = df_test.isnull().sum()
print(f'Training NaNs:\n{train_nans}\n\nTesting NaNs:\n{test_nans}')
print(f'\nThe data contains {train_nans.sum() + test_nans.sum()} NaNs')

### Encoding

In [None]:
# One-hot encode the two categorical columns of the training frame.
onehot_cols = ['HomePlanet', 'Destination']
df_train = pd.get_dummies(df_train, columns=onehot_cols)
df_train.columns

In [None]:
# One-hot encode the same two categorical columns of the test frame.
onehot_cols = ['HomePlanet', 'Destination']
df_test = pd.get_dummies(df_test, columns=onehot_cols)
df_test.columns

## Model

In [None]:
# train_feature_names = ['HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars',
#                        'CryoSleep', 
#                        'Destination_55 Cancri e', 'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e',
#                        'Age', 'VIP', 'RoomService', 
#                        'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
# training_target = df_train['Transported'].values

# train_features=df_train[train_feature_names].values

# train_features

In [None]:
# from sklearn.ensemble import RandomForestClassifier

# mlmodel = RandomForestClassifier()
# mlmodel.fit(train_features, training_target)

# print('Linear model score: ', mlmodel.score(train_features, training_target))

In [None]:
# Columns fed to the model: one-hot planet/destination flags, the two boolean
# flags, age and the five spending amounts.
features = ['HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars',
                       'CryoSleep', 
                       'Destination_55 Cancri e', 'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e',
                       'Age', 'VIP', 'RoomService', 
                       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

y = df_train.Transported
X = df_train[features]
X_test = df_test[features]

# A random forest is stochastic (bootstrap samples, random feature subsets);
# fixing the seed makes the fitted model and the submission reproducible
# across "Restart & Run All".
model = RandomForestClassifier(random_state=42)
model.fit(X, y)
pred = model.predict(X_test)

In [None]:
# Pair each test passenger id with its predicted label and write the submission file.
output = df_test[['PassengerId']].assign(Transported=pred)
output.to_csv('submission.csv', index=False)

print("Your submission was successfully saved!")

In [None]:
# df_train['Age'].plot.hist()

In [None]:
# print("How many have survived?")
# print(df_train.Transported.value_counts(normalize=True))
# sns.countplot(df_train['Transported'])
# plt.show()

Wow, the two `Transported` classes are almost evenly balanced.

In [None]:
# Pairwise plot grid of the numeric features, coloured by the target column.
numeric_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
plt.rcParams.update({"figure.figsize": (15, 15)})
sns.pairplot(df_train, vars=numeric_cols, hue='Transported')
plt.show()

### HomePlanet vs Transported

In [None]:
# df_train.HomePlanet.unique()

In [None]:
# df_train.groupby(['HomePlanet']).Transported.mean()

In [None]:
# df_train.HomePlanet[df_train.Transported == 1].value_counts()

In [None]:
# df_train.HomePlanet[df_train.Transported == 1].value_counts(normalize=True).plot(kind='bar', 
#                                                                                  color=ktcolors)
# plt.xticks(
#     rotation=0, 
#     horizontalalignment='center',
#     fontweight='light',
#     fontsize='x-large'
# )
# plt.xlabel('Home Planet')
# plt.ylabel('Frequency')

### VIP vs Transported

In [None]:
# Bar chart of VIP status counts among passengers whose Transported value equals 0.
not_transported = df_train.Transported == 0
vip_counts = df_train.VIP[not_transported].value_counts()
ax = vip_counts.plot(kind='bar', color=ktcolors)
# Keep the tick labels horizontal and enlarge them for readability.
plt.xticks(rotation=0, horizontalalignment='center', fontweight='light', fontsize='x-large')
ax.set_xlabel('VIP')
ax.set_ylabel('Frequency')