# Binary Classification with the Spaceship Titanic Dataset
## Import libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from catboost import CatBoostClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder

# Fixed seed, passed as `random_state` to the train/validation split and to
# the CatBoost model further down.
SEED = 31415

## Data download and light analysis

In [None]:
# Load the train and test CSVs, using PassengerId as the row index in both.
train_data = pd.read_csv('../input/spaceship-titanic/train.csv', index_col='PassengerId')
test = pd.read_csv('../input/spaceship-titanic/test.csv', index_col='PassengerId')
# Preview the first rows of the training set.
train_data.head()

In [None]:
# Print the column dtypes and non-null counts of the training set.
train_data.info()

In [None]:
# Encode the target as integers (True -> 1, False -> 0).
# An explicit cast is used instead of `replace` so the resulting dtype does
# not depend on pandas' replace-time dtype inference, and the column is
# assigned with bracket notation rather than attribute access.
train_data['Transported'] = train_data['Transported'].astype('int64')
# Summary statistics of the numeric columns, now including the target.
train_data.describe()

In [None]:
# Count the missing values per column in the training set.
train_data.isna().sum()

In [None]:
# Count the missing values per column in the test set.
test.isna().sum()

## Preprocess data and feature engineering

In [None]:
# Tag every row with its origin so the combined frame can be split back into
# train and test after preprocessing.
test['isTrain'] = 'No'
train_data['isTrain'] = 'Yes'
# The target column is dropped from the training rows before they are stacked
# on top of the test rows.
data = pd.concat([train_data.drop(columns='Transported'), test])
data.head()

In [None]:
# Replace missing values in these five columns with 0; every other column is
# left untouched.
data = data.fillna(dict.fromkeys(
    ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], 0))
# Display the median age of the combined data.
data['Age'].median()

In [None]:
# Fill missing ages with the median of the combined data. The result is
# assigned back explicitly: an in-place fillna on `data.Age` is a chained
# operation that does not update `data` under pandas copy-on-write, and the
# median is computed rather than hard-coded.
data['Age'] = data['Age'].fillna(data['Age'].median())
# Split 'Cabin' on '/' into its three components.
data[['CabinDeck', 'CabinNum', 'CabinSide']] = data['Cabin'].str.split('/', expand=True)
# Total across all five service columns, each counted exactly once
# (previously FoodCourt was added twice and VRDeck was left out).
data['Services'] = data[
    ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
].sum(axis=1)
data.head()

In [None]:
# Show the dtype of every column in the combined frame.
data.dtypes

In [None]:
# Names of all columns currently stored with the `object` dtype.
categorical = list(data.select_dtypes(include='object').columns)
categorical

In [None]:
# Every remaining column, i.e. those not listed in `categorical`, kept in the
# frame's column order.
numerical = data.columns.drop(categorical).tolist()
numerical

In [None]:
# Fill the gaps in every object-typed column with that column's most frequent
# value, then confirm that no missing values remain.
frequent_imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
data[categorical] = frequent_imputer.fit_transform(data[categorical])
data.isna().sum()

In [None]:
# Give the imputed columns concrete dtypes.
data = data.astype({
    'CryoSleep': 'bool',
    'VIP': 'bool',
    'CabinNum': 'int64',
})
# Recompute both column lists, since the casts above changed which columns
# are object-typed.
categorical = list(data.select_dtypes(include='object').columns)
numerical = data.columns.drop(categorical).tolist()
data.dtypes

In [None]:
# Replace each category value with an integer code; a fresh encoder is fitted
# for every column.
for column in ('HomePlanet', 'Destination', 'CabinDeck', 'CabinSide'):
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])

In [None]:
# Correlation heatmap of the numeric features.
# `data` still holds string columns at this point ('Name', 'Cabin',
# 'isTrain'); `DataFrame.corr` raises on those in pandas >= 2.0 unless it is
# restricted to numeric columns.
plt.figure(figsize=(12, 10))
sns.heatmap(data.corr(numeric_only=True), annot=True)
plt.show()

In [None]:
# Remove the 'Name' and 'Cabin' columns from the feature frame.
data = data.drop(columns=['Name', 'Cabin'])
data.head()

## Split data into samples

In [None]:
# Rebuild the training frame: processed training rows (origin flag removed)
# with the target column re-attached by index.
train_data = pd.concat(
    [
        data.loc[data['isTrain'] == 'Yes'].drop(columns='isTrain'),
        train_data['Transported'],
    ],
    axis=1,
)
# Processed test rows, also without the origin flag.
test = data.loc[data['isTrain'] == 'No'].drop(columns='isTrain')
train_data.head()

In [None]:
# NOTE(review): `features` is not referenced by any later cell visible here;
# the model is trained on all columns of `train_X`.
features = ['CryoSleep', 'Age', 'VIP', 'Services', 'CabinNum']
# Target as a single-column DataFrame; features are everything else.
train_y = train_data[['Transported']]
train_X = train_data.drop(columns='Transported')
train_X.head()

In [None]:
# Bar chart of how many training rows fall into each target class.
sns.countplot(data=train_data, x='Transported')
plt.show()

In [None]:
# Hold out 30% of the labelled rows for validation; SEED is passed as the
# split's random_state.
# Note that `train_data` is rebound here to the training features only.
train_data, val_data, train_labels, val_labels = train_test_split(
    train_X, train_y, test_size=0.3, random_state=SEED)
val_labels.info()

## Prepare model

In [None]:
# Baseline CatBoost classifier with accuracy as its evaluation metric and a
# fixed seed.
model=CatBoostClassifier(eval_metric='Accuracy', random_state=SEED)
# NOTE(review): `silent=True` presumably suppresses the training log — confirm.
model.fit(train_data, train_labels, silent=True)

In [None]:
# Wrap the validation predictions in a single-column DataFrame that shares
# val_data's index and uses the same column name as the labels.
val_predicted = pd.DataFrame(model.predict(val_data),
                             columns=['Transported'], index=val_data.index)
val_predicted.head()

In [None]:
# accuracy_score takes (y_true, y_pred), so the ground-truth labels go first.
# Accuracy itself is symmetric, so the printed value is unchanged.
print('Base model accuracy: ',
      accuracy_score(val_labels, val_predicted))

## Optimize model hyperparameters

In [None]:
# Exhaustive search over the number of boosting iterations and the tree depth
# (4 x 4 = 16 combinations), using the base `model` as the estimator.
grid = {'iterations': [200, 400, 600, 1000],
        'depth': [3, 4, 5, 6]}
best_model = GridSearchCV(estimator=model, param_grid=grid)
# NOTE(review): `silent=True` is presumably forwarded to the CatBoost fit call
# to suppress training logs — confirm.
best_model.fit(train_data, train_labels, silent=True)

In [None]:
# Report the estimator selected by the search, its best score and the
# parameter combination that produced it.
print('Optimized classificator: ', best_model.best_estimator_)
print('Best accuracy value: ', best_model.best_score_)
print('Optimal parameters: ', best_model.best_params_)

In [None]:
# Validation predictions of the tuned model. The column is named at
# construction time (same form as the base-model cell) instead of being
# renamed from the default 0 afterwards.
best_predict = pd.DataFrame(best_model.predict(val_data),
                            columns=['Transported'], index=val_data.index)
# accuracy_score takes (y_true, y_pred), so the ground-truth labels go first.
print('Optimized model accuracy: ', accuracy_score(val_labels, best_predict))

## Record final result

In [None]:
# Predict on the test set with the tuned model and cast the predictions to
# bool (the target was encoded as 1/0 before training).
predict = np.array(best_model.predict(test)).astype('bool')

In [None]:
# Assemble the submission table in one step: the test ids next to their
# predicted labels, on a fresh default index.
output = pd.DataFrame({
    'PassengerId': test.index,
    'Transported': predict,
})
# Write it without the positional index column.
output.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission table.
output.head()

In [None]:
# Print the row count and column dtypes of the submission table.
output.info()