**Spaceship Titanic Kaggle competition**. https://www.kaggle.com/c/spaceship-titanic/data?select=train.csv


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session





Importing libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the rest of the session (keeps cell output clean,
# but also hides deprecation and failed-fit warnings from the libraries used below)
warnings.filterwarnings("ignore")

In [None]:
# Load the competition's train and test CSV files into DataFrames
train_path = "/kaggle/input/spaceship-titanic/train.csv"
test_path = "/kaggle/input/spaceship-titanic/test.csv"
train_data, test_data = pd.read_csv(train_path), pd.read_csv(test_path)

In [None]:
# Preview the first five rows of the training data
train_data.head(n=5)

In [None]:
# (number of rows, number of columns) of the training data
train_data.shape

In [None]:
# Data type of each column in the training data
train_data.dtypes

In [None]:
# Per column: True if the column contains at least one missing value
train_data.isna().any()

Let's store the target variable separately and drop the columns that carry no useful information for the models

In [None]:
# Keep the target aside, then remove it together with the identifier-like
# columns from the feature frames.
y = train_data['Transported']
id_columns = ['PassengerId', 'Name']
train_data = train_data.drop(columns=id_columns + ['Transported'])
test_data = test_data.drop(columns=id_columns)

Now it's time to handle the missing values (NaN) in the data

Filling NaNs using .mode() and in case of Age feature using .mean()

In [None]:
# Impute missing values: mean for Age, most frequent value for every other column.
# The fill value is computed on the training set only and reused for the test
# set, so both frames receive identical fill values and no statistic is
# estimated from the test data.
for col in train_data.columns:
    if col == "Age":
        fill_value = train_data[col].mean()
    else:
        # mode() returns a Series (there may be ties); [0] takes its first entry
        fill_value = train_data[col].mode()[0]
    train_data[col] = train_data[col].fillna(fill_value)
    test_data[col] = test_data[col].fillna(fill_value)

In [None]:
# Confirm that no column of the training data still has missing values
train_data.isna().any()

In [None]:
# Confirm that no column of the test data still has missing values
test_data.isna().any()

Since everything works great, let's do some visualizations

In [None]:
# One shared colour palette for all plots, followed by a preview of its colours
palette = sns.color_palette(palette='flare')
sns.palplot(pal=palette)

In [None]:
# One count plot per feature. Every feature except Age is split by the target;
# Age is drawn without a hue and with rotated tick labels.
v_features = ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP']
for feature in v_features:
    plt.figure(figsize=(15, 7))
    if feature != 'Age':
        sns.countplot(x=train_data[feature], hue=y, palette=palette)
    else:
        sns.countplot(x=train_data[feature])
        plt.xticks(rotation=90)
    plt.show()

Now let's encode our categorical features

In [None]:
# Number of distinct values in the Cabin column
train_data['Cabin'].nunique()

In [None]:
# Cabin has too many unique values to encode usefully, so remove it from both frames
high_cardinality = ['Cabin']
train_data = train_data.drop(columns=high_cardinality)
test_data = test_data.drop(columns=high_cardinality)

In [None]:
# Encode the categorical string features as integer codes.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
cat_features = ['HomePlanet', 'Destination']
for i in cat_features:
    # Fit the mapping on the training column only and apply that same mapping
    # to the test column, so a category always gets the same integer code in
    # both frames. (Fitting separately on each frame can assign different
    # codes whenever the two frames do not contain the same set of values.)
    # transform() raises ValueError if the test column has an unseen category.
    le.fit(train_data[i])
    train_data[i] = le.transform(train_data[i])
    test_data[i] = le.transform(test_data[i])

In [None]:
# Re-fit the encoder on the target and replace y with its integer codes
y = le.fit(y).transform(y)

After transforming features we are ready to train models

In [None]:
#Now let's split the train dataset and try different models on it
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier, plot_importance
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Hold out 20% of the labelled data for evaluating the models.
# random_state is fixed so the split (and every score computed from it)
# is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    train_data, y, test_size=0.2, random_state=42
)

**XGBoost**

In [None]:
%%time
xgb = XGBClassifier(booster = 'gbtree', verbosity = 0)
params = {
    'n_estimators' : [100, 200, 300],
    'learning_rate' : [0.05, 0.1, 0.5],
    'max_depth' : [1, 3, 5, 7],
    'colsample_bytree': [0.5, 1],
}
searcher = GridSearchCV(xgb, params, cv = 3)
searcher.fit(X_train, y_train)

In [None]:
# Parameter combination that achieved the best mean cross-validated score
# in the most recently fitted grid search
searcher.best_params_

In [None]:
# Manually chosen parameters; per the author's experiments they improved the
# result slightly (about 0.3%) compared with the grid-search choice.
xgb = XGBClassifier(booster = 'gbtree', verbosity = 0, max_depth = 3, n_estimators = 400,
                    learning_rate = 0.03, colsample_bytree = 0.5)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)
# accuracy_score expects (y_true, y_pred); the value is the same either way for
# accuracy, but the documented order keeps this safe to adapt to other metrics.
xgb_score = accuracy_score(y_test, xgb_pred)
xgb_score

In [None]:
# Bar chart of the feature importances of the fitted XGBoost model
plot_importance(xgb, grid=False, height=0.4)

**CatBoost**

In [None]:
# Exhaustive 3-fold cross-validated grid search over CatBoost hyper-parameters
cat = CatBoostClassifier(verbose=0)
params = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.05, 0.1, 0.5],
    l2_leaf_reg=[0.1, 0.5, 1],
    max_depth=[1, 3, 5, 7],
)
searcher = GridSearchCV(estimator=cat, param_grid=params, cv=3)
searcher.fit(X_train, y_train)

In [None]:
# Parameter combination that achieved the best mean cross-validated score
# in the most recently fitted grid search
searcher.best_params_

In [None]:
# Manually chosen parameters; per the author's experiments they improved the
# result slightly (about 0.8%) compared with the grid-search choice.
cat = CatBoostClassifier(n_estimators=100, learning_rate= 0.05, max_depth=7, l2_leaf_reg= 1, verbose = 0)
cat.fit(X_train, y_train)
cat_pred = cat.predict(X_test)
# accuracy_score expects (y_true, y_pred); the value is the same either way for
# accuracy, but the documented order keeps this safe to adapt to other metrics.
cat_score = accuracy_score(y_test, cat_pred)
cat_score

**Random Forest**

In [None]:
# 3-fold cross-validated grid search over RandomForest hyper-parameters.
# The forest is seeded so the search result is reproducible across runs.
rf = RandomForestClassifier(random_state=42)
params = {
    'n_estimators' : [100, 200, 300],
    'criterion' : ['entropy', 'gini'],
    'max_depth' : [1, 3, 7, 9],
}
searcher = GridSearchCV(rf, params, cv = 3)
searcher.fit(X_train, y_train)

In [None]:
# Parameter combination that achieved the best mean cross-validated score
# in the most recently fitted grid search
searcher.best_params_

In [None]:
# Fit a RandomForest with manually chosen parameters and score it on the
# hold-out split. The forest is seeded so the score is reproducible.
rf = RandomForestClassifier(criterion='entropy', n_estimators = 1000, max_depth = 10,
                            random_state = 42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
# accuracy_score expects (y_true, y_pred)
rf_score = accuracy_score(y_test, rf_pred)
rf_score

**Decision Tree**

In [None]:
# 5-fold cross-validated grid search over DecisionTree hyper-parameters.
# The tree is seeded so the search result is reproducible across runs.
dt = DecisionTreeClassifier(random_state=42)
params = {
    'criterion' : ['entropy', 'gini'],
    # Depths 1..10. max_depth=0 is not a valid value, so it is excluded:
    # those candidates could only ever produce failed fits.
    'max_depth' : list(range(1, 11)),
    'splitter' : ['best', 'random'],
}
searcher = GridSearchCV(dt, params, cv = 5)
searcher.fit(X_train, y_train)

In [None]:
# Parameter combination that achieved the best mean cross-validated score
# in the most recently fitted grid search
searcher.best_params_

In [None]:
# Fit a DecisionTree with manually chosen parameters and score it on the
# hold-out split. The tree is seeded so the score is reproducible.
dt = DecisionTreeClassifier(splitter='best', criterion='entropy', max_depth=9,
                            random_state=42)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
# accuracy_score expects (y_true, y_pred)
dt_score = accuracy_score(y_test, dt_pred)
dt_score

**Scores comparison**

In [None]:
# Collect the hold-out accuracy of every model in a single-column table
ind = ['XGBoost', 'CatBoost', 'RandomForest', 'DecisionTree']
score_values = [xgb_score, cat_score, rf_score, dt_score]
scores = pd.Series(score_values, index=ind, name='Scores').to_frame()

In [None]:
# Display the accuracy table, one row per model
scores

In my case CatBoost shows the best result on the hold-out split (the 20% of the training data set aside for evaluation)

**Submission**

In [None]:
# Re-read the raw test file: PassengerId was dropped from test_data earlier
# and is needed again for the submission.
raw_test_path = '/kaggle/input/spaceship-titanic/test.csv'
test_data_sub = pd.read_csv(raw_test_path)
test_data_sub.head(n=5)

In [None]:
# Predict with the CatBoost model and turn the predictions into booleans
# (any value greater than 0 becomes True)
raw_pred = cat.predict(test_data)
pred_sub = raw_pred > 0

In [None]:
# Two-column submission frame: passenger id plus the predicted Transported flag
submission = pd.DataFrame(
    {
        'PassengerId': test_data_sub['PassengerId'],
        'Transported': pred_sub,
    }
)

In [None]:
# Write the submission without the index column. The file gets a .csv
# extension so it is recognised as a CSV file when submitting.
submission.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame for a final visual check
submission