In [87]:
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style("darkgrid")
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline 

In [28]:
# Load the training data and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


### About 2% of the values in each feature column are NaN. Impute the NaNs with a pipeline

In [73]:
# Fraction of missing values in a single column: the mean of the boolean
# NaN mask equals (NaN count / row count).
print(df['HomePlanet'].isna().mean())  # 2%

# Same fraction for every column at once, computed in one vectorised pass.
# `to_dict()` keeps the {column: fraction} display and yields plain floats.
df.isna().mean().to_dict()

0.023122052225928908


{'PassengerId': 0.0,
 'HomePlanet': 0.023122052225928908,
 'CryoSleep': 0.02496261359714713,
 'Cabin': 0.02289198205452663,
 'Destination': 0.02093638559760727,
 'Age': 0.020591280340503854,
 'VIP': 0.023352122397331185,
 'RoomService': 0.02082135051190613,
 'FoodCourt': 0.021051420683308408,
 'ShoppingMall': 0.02392729782583688,
 'Spa': 0.021051420683308408,
 'VRDeck': 0.021626596111814105,
 'Name': 0.023007017140227768,
 'Transported': 0.0}

In [88]:
# Target selection: `Transported` is the label to predict,
# every remaining column is a candidate feature.
y = df['Transported']
X = df.drop(columns=['Transported'])

In [83]:
# Feature selection
#   * categorical: object-dtype columns with fewer than 10 distinct values
#   * numerical:   every int64 / float64 column
categorical_features = [
    cname
    for cname, dtype in X.dtypes.items()
    if dtype == 'object' and X[cname].nunique() < 10
]
numerical_features = [
    cname
    for cname, dtype in X.dtypes.items()
    if dtype in ('int64', 'float64')
]

# Keep only the selected features (categorical first, then numerical).
my_cols = [*categorical_features, *numerical_features]
X = X.loc[:, my_cols].copy()

X.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,VIP,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,Europa,False,TRAPPIST-1e,False,39.0,0.0,0.0,0.0,0.0,0.0
1,Earth,False,TRAPPIST-1e,False,24.0,109.0,9.0,25.0,549.0,44.0
2,Europa,False,TRAPPIST-1e,True,58.0,43.0,3576.0,0.0,6715.0,49.0
3,Europa,False,TRAPPIST-1e,False,33.0,0.0,1283.0,371.0,3329.0,193.0
4,Earth,False,TRAPPIST-1e,False,16.0,303.0,70.0,151.0,565.0,2.0


In [85]:
# Numerical preprocessor: fill missing numbers with a constant.
# No fill_value is given, so SimpleImputer's documented default for
# numeric data (0) is used.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical preprocessor: fill gaps with the most frequent value, then
# one-hot encode; categories unseen during fit are ignored at predict time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Bundle both preprocessors so each one only sees its own columns.
column_transformers = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

## Start with a Random Forest Classifier

In [91]:
from sklearn.ensemble import RandomForestClassifier

# A random forest is stochastic (bootstrap samples, random feature subsets).
# Pinning random_state makes the scores and the final submission
# reproducible on Restart & Run All.
model = RandomForestClassifier(n_estimators=500, random_state=42)

# Bundle preprocessing and model into one estimator so that imputation and
# encoding are always fitted on the same rows as the classifier.
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

In [99]:
# Cross validation score
from sklearn.model_selection import cross_val_score

# 5-fold accuracy of the whole pipeline (preprocessing + model).
scores = cross_val_score(
    estimator=my_pipeline,
    X=X,
    y=y,
    scoring='accuracy',
    cv=5,
)
print(f'Accuracy scores:\n {scores}')

Accuracy scores:
 [0.78608396 0.78205865 0.78608396 0.79171461 0.79516686]


## Pretty good accuracy for my first attempt!

In [98]:
# Hold-out sanity check: fit on a random training split and score on the
# remaining rows. The cell was previously commented out while still showing
# an output, so the output could not be reproduced from the code.
# random_state pins the split so the reported accuracy is repeatable.
input_X_train, input_X_test, input_y_train, input_y_test = train_test_split(
    X, y, random_state=42
)
my_pipeline.fit(input_X_train, input_y_train)
preds = my_pipeline.predict(input_X_test)
print(f'Accuracy score: {accuracy_score(input_y_test, preds)}')

Accuracy score: 0.7769089236430543


In [100]:
# Load the test file and keep exactly the feature columns used for training.
test_data = pd.read_csv('test.csv')
X_test = test_data.loc[:, my_cols]

# Refit on all of X, y, then predict the test rows.
preds = my_pipeline.fit(X, y).predict(X_test)

# Submission frame: passenger id next to the predicted label.
output = test_data[['PassengerId']].assign(Transported=preds)

In [102]:
# Write the predictions to disk; the DataFrame index is not exported.
output.to_csv(path_or_buf='submission.csv', index=False)