In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print the paths one per line in os.walk order.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load the data
First things first: I need to load the data.

In [3]:
# Read the three competition CSVs in one pass: the training set, the test set
# and the sample submission file.
df_train, df_test, df_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/spaceship-titanic/train.csv',
        '/kaggle/input/spaceship-titanic/test.csv',
        '/kaggle/input/spaceship-titanic/sample_submission.csv',
    )
)

In [4]:
# Summarise the test set: column names, non-null counts and dtypes.
df_test.info()

# Data Exploration
Now we need to explore the data. Check for missing values, view descriptive statistics, check correlations and skew. 

In [5]:
# Preview the first 5 rows of the training data.
df_train.head(5)

In [6]:
# Shape of the training data as a (rows, columns) tuple.
df_train.shape

In [None]:
# Check for empty values and datatypes: info() lists the non-null count and
# dtype of every column.
df_train.info()

In [7]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the training data.
df_train.describe()
# Author's observation from this table: the data is very skewed!

In [None]:
# Pairwise Pearson correlations between the numeric columns.
# numeric_only=True is required: the frame still contains string columns
# (e.g. HomePlanet, Cabin, Name) and pandas >= 2.0 raises instead of silently
# dropping them when numeric_only is left at its default.
df_train.corr(method='pearson', numeric_only=True)

In [None]:
# Skewness of each numeric column. numeric_only=True skips the string columns,
# which would otherwise make pandas >= 2.0 raise a TypeError.
df_train.skew(numeric_only=True)

# Data Visualization
Let's create some basic graphs to visualize our data.

In [8]:
# Histogram overview: one subplot per numeric column of the training data.
df_train.hist(figsize = (20, 10)) 

In [9]:
# Zoom in on low RoomService spending using two bins, 0-20 and 20-40;
# values above 40 fall outside the bins and are not drawn.
df_train['RoomService'].hist(bins=[0, 20, 40])

In [None]:
# Density plot: each plotted column gets its own subplot on a 3x3 grid,
# with independent x-axes (sharex=False).
df_train.plot(kind='density', figsize = (20, 10), subplots=True, layout=(3,3), sharex=False)

In [None]:
# Box-and-whisker plot: each plotted column gets its own subplot on a 3x3 grid,
# with independent x- and y-axes so differently scaled columns stay readable.
df_train.plot(kind='box', subplots=True, figsize = (20, 10), layout=(3,3), sharex=False, sharey=False) 

In [None]:
import seaborn as sns

In [None]:
# Passenger counts per home planet, split by the Transported target.
sns.countplot(data=df_train, x='HomePlanet', hue='Transported')

In [None]:
# Passenger counts per CryoSleep value, split by the Transported target.
sns.countplot(data=df_train, x='CryoSleep', hue='Transported')

In [None]:
# Passenger counts per destination, split by the Transported target.
sns.countplot(data=df_train, x='Destination', hue='Transported')

In [None]:
# Passenger counts per VIP value, split by the Transported target.
sns.countplot(data=df_train, x='VIP', hue='Transported')

# Data Munging
I can go about this in a number of different ways. The fastest way is to drop every row that has an NA value and then run the ML algorithms. Doing this takes you from 8700 to 6600 entries, so 20-25% of the data is lost. Yikes, that's a lot! So let's go through the columns one by one and try filling in the missing values. There are a lot of "NaN" values in both the object and float datatype columns. There are also a lot of "0" values in spending for the various services; I'll have to decide whether to treat those as NA values or whether 0 is the correct value.

The easiest way to go about filling NA values is to just use the mode for categoricals and median for numeric values. 

In [10]:
# Re-check non-null counts and dtypes before filling in missing values.
df_train.info()

In [11]:
# Split Cabin into its three '/'-separated parts. n=2 caps the split at two
# separators so the result always has at most the three columns assigned here.
df_train[['Cabin1', 'Cabin2', 'Cabin3']] = df_train['Cabin'].str.split('/', n=2, expand=True)


# Column groups used by the cleaning steps that follow.
categoricals = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Cabin1', 'Cabin3']
numerics = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
# 'PassengerId' (lower-case d) is the actual column name, as used when the
# column is dropped later in the notebook.
identifiers = ['PassengerId', 'Name', 'Cabin2']


In [12]:
# Fill missing values column by column. The result is assigned back to
# df_train[column] rather than calling fillna(inplace=True) on the selected
# column: the in-place call on df_train[column] is chained assignment, which
# emits a FutureWarning in pandas 2.x and leaves df_train untouched once
# Copy-on-Write is enabled.

# Categoricals: replace NaN with the most common value (the mode).
for column in categoricals:
    df_train[column] = df_train[column].fillna(df_train[column].mode()[0])

# Numerics: replace NaN with the median.
for column in numerics:
    df_train[column] = df_train[column].fillna(df_train[column].median())

In [13]:
# Combine the five spending columns into a single 'Services' total,
# adding them left to right in the listed order.
service_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
services_total = df_train[service_columns[0]]
for service_column in service_columns[1:]:
    services_total = services_total + df_train[service_column]
df_train['Services'] = services_total

In [14]:
# Flag whether a passenger spent anything at all: 0 when the Services total is
# exactly zero, 1 when it is positive.
spent_nothing = df_train['Services'] == 0
spent_something = df_train['Services'] > 0
df_train.loc[spent_nothing, 'BoughtServices'] = 0
df_train.loc[spent_something, 'BoughtServices'] = 1

## Data Munging -  Replacing categoricals with numbers

In [15]:
# Integer codes for every categorical column (same codes as before).
# CryoSleep and VIP list both boolean and string keys: pd.read_csv parses
# True/False cells as booleans, so matching only the strings 'False'/'True'
# (as the previous version did) never encoded these two columns.
category_codes = {
    'HomePlanet': {'Earth': 0, 'Europa': 1, 'Mars': 2},
    'CryoSleep': {False: 0, True: 1, 'False': 0, 'True': 1},
    'Destination': {'TRAPPIST-1e': 0, '55 Cancri e': 1, 'PSO J318.5-22': 2},
    'Cabin1': {'F': 0, 'G': 1, 'E': 2, 'B': 3, 'C': 4, 'D': 5, 'A': 6, 'T': 7},
    'Cabin3': {'S': 0, 'P': 1},
    'VIP': {False: 0, True: 1, 'False': 0, 'True': 1},
}

# Assign the encoded column back instead of calling replace(inplace=True) on
# df_train[column], which is chained assignment and does not update df_train
# under pandas Copy-on-Write. Values without a code are passed through
# unchanged, so re-running this cell on already-encoded data is harmless.
for column, codes in category_codes.items():
    df_train[column] = df_train[column].map(lambda value: codes.get(value, value))

In [16]:
# Inspect dtypes and non-null counts after the categorical encoding step.
df_train.info()

In [17]:
# Remove the columns that will not be used as model features.
unused_columns = [
    'PassengerId', 'Cabin', 'RoomService', 'FoodCourt', 'ShoppingMall',
    'Spa', 'VRDeck', 'Name', 'Cabin2',
]
df_train.drop(columns=unused_columns, inplace=True)

In [None]:
# Confirm which columns remain after dropping the unused ones.
df_train.info()

## Visualization Redux

How have things changed now that we have cleaned up the data?

In [None]:
# Histogram overview of the cleaned training data, one subplot per numeric column.
df_train.hist(figsize = (20, 10)) 

## Data Munging Conclusion
I have eliminated all NaN values (in a very rough way), split Cabin into two categorical variables, created a Services column to represent the 5 things people spend money on, and dropped several columns. I transformed the categoricals into numerical placeholders. I have a total of 8 pieces of data plus our attribute of interest (Transported).

# Data Transformation
The skew on our 'Services' column is terrible, I'll try a logarithmic transform to increase data normality.

In [18]:
# Log-transform total spending; the +1 offset keeps zero spend finite,
# since log(1) = 0.
services_plus_one = df_train['Services'] + 1
df_train['Services'] = np.log(services_plus_one)

In [19]:
# Re-plot the histograms to see the effect of the log transform on 'Services'.
df_train.hist(figsize = (20, 10)) 

# Machine Learning
I will be running the data through several algorithms, using cross validation to score the data, then choosing a couple of the higher performing algorithms to run hyperparameterization on. 


In [22]:
# Separate the target column (y) from the remaining feature columns (X).
y = df_train['Transported']
X = df_train.drop('Transported', axis=1)

In [23]:
# Preview the first rows of the feature matrix.
X.head()

In [28]:
# Accumulators used to compare models: each model cell appends the estimator,
# its mean cross-validation score and the standard deviation of that score.
model = []
model_mean = []
model_std = []

#k-fold cross validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Number of folds; adjust based on the number of data points.
folds = 5
# Shuffle with a fixed seed so that every model is scored on the same splits.
kfold = KFold(n_splits=folds, shuffle=True, random_state=1)

In [29]:
# Decision tree baseline: score it with the shared k-fold splitter and record
# the estimator together with its rounded mean / std accuracy.
from sklearn.tree import DecisionTreeClassifier
method = DecisionTreeClassifier(random_state=1)
results = cross_val_score(estimator=method, X=X, y=y, cv=kfold)
model += [method]
model_mean += [round(results.mean(), 4)]
model_std += [round(results.std(), 4)]
print(results.mean())

In [30]:
# Random forest classifier: score it with the shared k-fold splitter and record
# the estimator together with its rounded mean / std score.
from sklearn.ensemble import RandomForestClassifier
method = RandomForestClassifier(random_state=1) 
results = cross_val_score(method, X, y, cv=kfold)
model.append(method)
model_mean.append(round(results.mean(), 4))
model_std.append(round(results.std(), 4))
# Print the mean score, as the decision tree and gradient boosting cells do.
print(results.mean())


In [31]:
# Gradient boosting baseline: score it with the shared k-fold splitter and
# record the estimator together with its rounded mean / std accuracy.
from sklearn.ensemble import GradientBoostingClassifier
method = GradientBoostingClassifier(random_state=1)
results = cross_val_score(estimator=method, X=X, y=y, cv=kfold)
model += [method]
model_mean += [round(results.mean(), 4)]
model_std += [round(results.std(), 4)]
print(results.mean())

In [35]:
# Model table: one row per evaluated model with its mean cross-validation
# score and the standard deviation of that score.
# (The original first line, a bare `model table`, was a SyntaxError.)
models = pd.DataFrame({
    'Model' : model,
    'Score' : model_mean,
    'std' : model_std
})


# Show the best-scoring model first.
models.sort_values(by = 'Score', ascending = False)

## Machine Learning - Model Choice
We tried out three models, and the gradient boosting classifier worked the best. Now I want to iterate through the important hyperparameters of the model of choice to see if I can improve on it. The parameters to iterate through are particular to each type of model; I chose learning rate, n estimators and max leaf nodes based on the documentation. These should get me most of the way there.

In [34]:
from sklearn.ensemble import GradientBoostingClassifier

# Exhaustive grid search over three GradientBoostingClassifier hyperparameters,
# keeping the combination with the highest mean cross-validation score.
# All three trackers are initialised up front so the prints at the end cannot
# raise NameError if no combination scores above 0.
best_score = 0
best_std = None
best_parameters = None

# NOTE(review): learning rates of 10 and 100 are far above the usual range;
# they are kept so the search space matches the original notebook.
for learning_rate in [0.1, 1, 10, 100]:
    for n_estimators in [50, 100, 500, 1000]:
        for max_leaf_nodes in [2, 4, 5]:
            gbc = GradientBoostingClassifier(
                random_state=1,
                learning_rate=learning_rate,
                n_estimators=n_estimators,
                max_leaf_nodes=max_leaf_nodes,
            )
            # Reuse the shuffled, seeded `kfold` splitter from the model
            # comparison cells (instead of cv=5) so these scores are directly
            # comparable with the baseline scores in the model table.
            scores = cross_val_score(gbc, X, y, cv=kfold)
            score = np.mean(scores)
            if score > best_score:
                best_score = score
                best_std = np.std(scores)
                best_parameters = {'learning rate':learning_rate, 'n estimators':n_estimators, 'max leaf nodes': max_leaf_nodes}


print(best_score)
print(best_std)
print(best_parameters)

## Machine Learning Hyperparameterization
That took a long time to iterate through! Our best parameters are a learning rate of 0.1, n estimators of 100, and max leaf nodes of 5. But it only barely improved the predictive performance of the model. To be honest, the most bang for my buck would be spending more time on data munging, but I want to get an initial model out first; I can improve upon it later.