In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# Read the training split from the competition input directory.
# PassengerId is kept as an ordinary column (not the index) because a later cell splits it on '_'.
df_train = pd.read_csv('../input/spaceship-titanic/train.csv')
df_train

In [None]:
# Read the test split from the same directory; PassengerId is likewise kept as a column.
df_test = pd.read_csv('../input/spaceship-titanic/test.csv')
df_test

In [None]:
df_train.isnull().sum().sum()

In [None]:
df_train.isnull().sum()

In [None]:
df_train.isnull().sum()*100/len(df_train)

In [None]:
df_test.isnull().sum()*100/len(df_test)

In [None]:
df_train['isTrain'] = 'Yes'

In [None]:
df_test['isTrain'] = 'No'

In [None]:
tt = pd.concat([df_train, df_test])

In [None]:
tt

In [None]:
tt.dtypes

In [None]:
tt[["C1", 'C2', 'C3']] = tt["Cabin"].str.split('/', expand=True)
tt

In [None]:
tt[["P1", 'P2']] = tt["PassengerId"].str.split('_', expand=True)
tt

In [None]:
# Frequency of each Destination value in the combined frame.
tt['Destination'].value_counts()

In [None]:
# Value counts of the full PassengerId and of its two split components, shown together as a tuple.
tt['PassengerId'].value_counts(), tt['P1'].value_counts(), tt['P2'].value_counts()

In [None]:
# Combined frame after the Cabin / PassengerId splits.
tt

In [None]:
tt = tt.drop(['Name', 'Transported'], axis = 1)
tt

In [None]:
tt.isnull().sum()

In [None]:
tt.dtypes

In [None]:
cat_cols = list(tt.select_dtypes(include=['category', object]).columns)
cat_cols

In [None]:
num_cols = list(tt.select_dtypes(exclude=['category', object]).columns)
num_cols

## Categorical imputation

In [None]:
tt_cat = tt[cat_cols]
tt_cat

In [None]:
tt_cat.isnull().sum()

In [None]:
tt_cat.isnull().sum()*100/len(tt_cat)

In [None]:
tt_cat.isnull().sum().sum()

In [None]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
icat_tt = pd.DataFrame(imputer.fit_transform(tt_cat), columns = tt_cat.columns)
icat_tt

In [None]:
icat_tt.isnull().sum().sum()

## Numerical imputation

In [None]:
tt_num = tt[num_cols]
tt_num

In [None]:
tt_num.isnull().sum()*100/len(tt_cat)

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
imp = IterativeImputer(verbose=2, max_iter=30, tol=1e-10, imputation_order='roman')

In [None]:
inum_tt = pd.DataFrame(imp.fit_transform(tt_num), columns = tt_num.columns)

In [None]:
inum_tt

In [None]:
imp_tt = pd.concat([icat_tt, inum_tt], axis = 1)

In [None]:
imp_tt

In [None]:
train = imp_tt[imp_tt['isTrain'] == 'Yes']
#train = train.set_index('PassengerId')
train = train.drop('isTrain', axis = 1)
train

In [None]:
df_train['Transported']

In [None]:
train

In [None]:
test = imp_tt[imp_tt['isTrain'] == 'No']
test = test.set_index('PassengerId')
test = test.drop('isTrain', axis = 1)
test

In [None]:
def bool_to_word(bool):
    """Translate a flag into the word "Yes" or "No".

    Returns "Yes" only when the argument compares equal to ``True``;
    every other value (``False``, ``None``, NaN, strings, ...) gives "No".
    The parameter name shadows the builtin ``bool`` inside this function;
    it is kept as-is so the signature stays unchanged.
    """
    return "Yes" if bool == True else "No"

In [None]:
train[['CryoSleep', 'VIP']] = train[['CryoSleep', 'VIP']].applymap(bool_to_word)
test[['CryoSleep', 'VIP']] = test[['CryoSleep', 'VIP']].applymap(bool_to_word)

In [None]:
train.dtypes

In [None]:
y = df_train['Transported']
y

In [None]:
train = pd.concat([train, y], axis = 1)

In [None]:
train

In [None]:
train['Transported'] = train['Transported'].map(bool_to_word)

In [None]:
X = train.drop('Transported', axis = 1)
X

In [None]:
y = train['Transported']
y

In [None]:
from sklearn.model_selection import StratifiedKFold
# Stratified 5-fold split with a fixed seed; each row's validation fold
# number is recorded in a 'kfold' column.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold, (_, valid_indicies) in enumerate(kf.split(X, y)):
    # split() reports row positions, so translate them into index labels
    # before using .loc; this stays correct even if `train` is ever given
    # a non-default index.
    train.loc[train.index[valid_indicies], "kfold"] = fold

In [None]:
# Persist the fold-annotated training frame; index=False keeps the row index out of the file.
train.to_csv("train_folds.csv", index=False)

In [None]:
# Read the file straight back into `df`, which the modelling cells use from here on.
# NOTE(review): read_csv re-infers column dtypes, so `df` may not have the same dtypes as
# `train` (e.g. digit-only string columns); `test` never makes this round trip — confirm
# that both end up with matching feature types.
df = pd.read_csv("train_folds.csv")

In [None]:
df

In [None]:
# Every column of df except the identifier, the target and the fold marker is a model feature.
non_features = ("PassengerId", "Transported", "kfold")
useful_features = [name for name in df.columns if name not in non_features]
# Restrict the test frame to those features, in the same column order.
test = test.loc[:, useful_features]

In [None]:
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import accuracy_score

In [None]:
cat_col = list(df.select_dtypes(include=['category', object]).columns)
cat_col.remove('PassengerId')
cat_col.remove('Transported')
cat_col

In [None]:
df.select_dtypes('number').columns

## CatBoost model

In [None]:
# 5-fold cross-validation with CatBoost.
# For each fold, train on the other four folds, validate on the held-out fold
# and print its accuracy. `final_predictions` collects the held-out (validation)
# predictions of every fold; they are not test-set predictions.
# After the loop, `model` refers to the classifier fitted in the last fold.
final_predictions = []

# The hyper-parameters are the same for every fold, so they are defined once.
params = {'early_stopping_rounds': 30, 'learning_rate': 0.008785670659077992, 'n_estimators': 1155}

for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)

    ytrain = xtrain.Transported
    yvalid = xvalid.Transported

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    train_dataset = Pool(data=xtrain,
                         label=ytrain,
                         cat_features=cat_col)

    eval_dataset = Pool(data=xvalid,
                        label=yvalid,
                        cat_features=cat_col)

    # Initialize CatBoostClassifier
    model = CatBoostClassifier(**params)
    # Fit model. The validation pool is passed as eval_set so that
    # 'early_stopping_rounds' has a metric to monitor; without an eval_set the
    # setting has nothing to act on. verbose=100 logs every 100th iteration
    # instead of all of them.
    model.fit(train_dataset, eval_set=eval_dataset, verbose=100)

    valid_preds = model.predict(eval_dataset)
    final_predictions.append(valid_preds)
    print("=*=" * 10, fold, accuracy_score(yvalid, valid_preds), "=*=" * 10)

In [None]:
subb = pd.read_csv('../input/spaceship-titanic/sample_submission.csv')
subb

In [None]:
preds = model.predict(test)

In [None]:
subb.Transported = preds
subb

In [None]:
subb.Transported = subb.Transported.map(dict(Yes=True, No=False))
subb

In [None]:
subb.to_csv('subb_cat.csv', index = None)