In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Show every column when a DataFrame is displayed. The option is addressed by
# its full name: the bare 'max_columns' pattern also matches
# 'styler.render.max_columns' in pandas >= 1.4 and raises an OptionError.
pd.set_option('display.max_columns', None)
# Use seaborn's "darkgrid" theme for the plots drawn in this notebook.
sns.set_theme(style="darkgrid")

In [None]:
# Read the train and test CSV files of the dataset into two DataFrames.
train_data_raw, test_data_raw = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/spaceship-titanic/train.csv',
        '/kaggle/input/spaceship-titanic/test.csv',
    )
)

# Converting data types

In [None]:
# Print a summary of the training frame: column dtypes and non-null counts.
train_data_raw.info()

## Things to do
- Fill NA values using the most-frequent strategy
- One-hot encoding
- Ordinal encoding
- Cabin data manipulation (split the Cabin column into its parts)
- Make a new feature: age_group
- Deal with outliers
    - How? Clip them to 2.5 standard deviations
    - or take the log of the features
    - or encode them as booleans
    
## Columns that will not be used - 'Name', 'PassengerId'

# No visualization in this notebook. Here is a link to a notebook with good visualizations of this data: (TODO: the link is missing; add it)

In [None]:
# Work on a copy so the raw training frame is never modified.
data1 = train_data_raw.copy()
all_attr = data1.columns.tolist()

# Columns earmarked for label/ordinal encoding.
attr_for_le = ['CryoSleep', 'Transported', 'VIP']

# Columns to one-hot encode. 'CabinDeck', 'CabinSide' and 'age_group' are new
# features that are added later, after the Cabin column has been split.
# 'CabinNum' is deliberately left out: one-hot encoding it would create too
# many features.
attr_for_ohe = [
    'HomePlanet',
    'Destination',
    'CabinDeck',
    'age_group',
    'CabinSide',
]

# Numeric columns handled by the numeric transformation step.
attr_numerical = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
]

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

class CatTransformer(BaseEstimator, TransformerMixin):
    """Impute missing values and build the categorical features.

    ``fit`` learns, from the frame it is given, the most frequent value of
    every column and the categories of the columns listed in the module-level
    ``attr_for_ohe``. ``transform`` then works on a copy of its input and:

    1. fills missing values with the learned most-frequent values,
    2. splits ``Cabin`` (``deck/num/side``) into ``CabinDeck``, ``CabinNum``
       (converted to a number) and ``CabinSide``,
    3. derives ``age_group`` from ``Age``,
    4. replaces ``Cabin`` and the ``attr_for_ohe`` columns with one-hot
       columns named by the encoder.

    Learned attributes
    ------------------
    fill_values_ : dict
        Column name -> most frequent value seen during ``fit``.
    onehot_ : OneHotEncoder
        Encoder fitted on the ``attr_for_ohe`` columns.

    ``transform`` must be called after ``fit`` (or via ``fit_transform``);
    calling it on an unfitted instance raises ``AttributeError``.
    """

    def __init__(self):
        pass

    @staticmethod
    def _age_group(a):
        """Map an age to its group label; returns None when no range matches."""
        if a <= 5:
            return 'infant'
        elif 5 < a < 14:
            return 'child'
        elif 14 <= a < 26:
            return 'youth'
        elif 26 <= a <= 60:
            return 'adult'
        elif a > 60:
            return 'elder'

    def _add_features(self, X):
        """Return an imputed copy of X with the Cabin parts and age_group added."""
        # Only fill columns that are present, so a frame without some of the
        # columns seen during fit can still be transformed.
        fills = {col: val for col, val in self.fill_values_.items()
                 if col in X.columns}
        # fillna returns a new frame, so the caller's data is left untouched;
        # infer_objects turns fully-filled object columns back into a proper
        # dtype (for example bool) where possible.
        X = X.fillna(fills).infer_objects()

        cabin_parts = X['Cabin'].str.split('/', expand=True)
        X['CabinDeck'] = cabin_parts[0]
        # Kept as a numeric feature (it is not one-hot encoded), so convert
        # the text produced by the split into numbers.
        X['CabinNum'] = pd.to_numeric(cabin_parts[1])
        X['CabinSide'] = cabin_parts[2]

        X['age_group'] = X['Age'].apply(self._age_group)
        return X

    def fit(self, X, y=None):
        """Learn the per-column fill values and the one-hot categories from X.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``Cabin``, ``Age`` and the raw ``attr_for_ohe`` columns.
        y : ignored

        Returns
        -------
        self
        """
        # Series.mode() is sorted, so on ties the smallest value is chosen.
        self.fill_values_ = {
            col: X[col].mode().iloc[0]
            for col in X.columns
            if X[col].notna().any()
        }
        # Default (sparse) output is converted with .toarray() in transform,
        # which avoids the `sparse` / `sparse_output` keyword that differs
        # between scikit-learn versions. Categories not seen during fit are
        # encoded as all zeros instead of raising.
        self.onehot_ = OneHotEncoder(handle_unknown='ignore')
        self.onehot_.fit(self._add_features(X)[attr_for_ohe])
        return self

    def transform(self, X, y=None):
        """Return a new frame with imputed values and one-hot encoded columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with the same raw columns that ``fit`` received.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            The remaining original columns plus ``CabinNum`` followed by the
            one-hot columns; the index of ``X`` is preserved.
        """
        X = self._add_features(X)
        encoded = self.onehot_.transform(X[attr_for_ohe]).toarray()
        onehot_df = pd.DataFrame(
            encoded,
            columns=list(self.onehot_.get_feature_names_out()),
            index=X.index,
        )
        X = X.drop(columns=[*attr_for_ohe, 'Cabin'])
        return pd.concat([X, onehot_df], axis=1)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

class ForOE(BaseEstimator, TransformerMixin):
    """Ordinal-encode selected columns, skipping any that are absent.

    Each requested column gets its own ``OrdinalEncoder`` fitted in ``fit``.
    Columns that are missing from the frame (for example the target column
    when transforming a frame without it) are skipped rather than raising.

    Encoded columns hold the encoder's float codes (0.0, 1.0, ...), so a model
    trained on an encoded target predicts those codes, not the original labels.

    Parameters
    ----------
    feature_names : list of str
        Names of the columns to encode.

    Learned attributes
    ------------------
    encoders_ : dict
        Column name -> fitted ``OrdinalEncoder``.
    """

    def __init__(self, feature_names):
        # Stored under the parameter's own name so that BaseEstimator's
        # get_params / clone / repr can find it.
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """Fit one encoder per requested column that exists in X.

        Parameters
        ----------
        X : pandas.DataFrame
        y : ignored

        Returns
        -------
        self
        """
        self.encoders_ = {}
        for col in self.feature_names:
            if col not in X.columns:
                continue
            self.encoders_[col] = OrdinalEncoder().fit(X[[col]])
        return self

    def transform(self, X, y=None):
        """Return a copy of X with the fitted columns replaced by their codes.

        Parameters
        ----------
        X : pandas.DataFrame
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Copy of ``X``; columns without a fitted encoder, or not present in
            ``X``, are left unchanged.
        """
        X = X.copy()
        for col, encoder in self.encoders_.items():
            if col not in X.columns:
                continue
            # The encoder returns an (n, 1) array; take its single column.
            X[col] = encoder.transform(X[[col]])[:, 0]
        return X
    
class NumTransformation(BaseEstimator, TransformerMixin):
    """Outlier handling for the columns listed in the module-level ``attr_numerical``.

    Parameters
    ----------
    method : {'log', 'to_bool'}
        ``'log'`` replaces every value with ``log(1 + value)``.
        ``'to_bool'`` replaces a value with 1.0 when it is greater than zero
        and 0.0 when it equals zero; negative or missing values become NaN.

    Columns of ``attr_numerical`` that are not present in the frame are
    skipped. A Box-Cox option was considered but is not implemented.
    """

    _METHODS = ('to_bool', 'log')

    def __init__(self, method):
        self.method = method

    def fit(self, X, y=None):
        """Nothing is learned; present for the scikit-learn transformer API."""
        return self

    def transform(self, X, y=None):
        """Return a copy of X with the numeric columns transformed.

        Parameters
        ----------
        X : pandas.DataFrame
        y : ignored

        Returns
        -------
        pandas.DataFrame

        Raises
        ------
        ValueError
            If ``method`` is not one of the supported names.
        """
        if self.method not in self._METHODS:
            raise ValueError(
                f"method must be one of {self._METHODS}, got {self.method!r}"
            )

        X = X.copy()
        cols = [col for col in attr_numerical if col in X.columns]
        if not cols:
            return X

        if self.method == 'log':
            X[cols] = np.log1p(X[cols])
        else:
            values = X[cols]
            # 1.0 for positive values, 0.0 for zero; anything else (negative
            # or missing) is masked to NaN.
            X[cols] = (values > 0).astype(float).where(values >= 0)
        return X
            

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Columns handed to the ordinal-encoding step of the pipeline.
ft = ['CryoSleep', 'VIP', 'Transported']

# Preprocessing pipeline; the steps run in the order listed:
# categorical features -> ordinal encoding -> numeric transformation.
ct = Pipeline(steps=[
    ('cat_tr', CatTransformer()),
    ('oe', ForOE(ft)),
    ('num', NumTransformation('log')),
])

In [None]:
# Run the whole preprocessing pipeline on the training copy.
data2 = ct.fit_transform(data1)
# Missing-value count per column of the result, shown as the cell output.
data2.isna().sum()

In [None]:
# Summary statistics of the preprocessed training data.
data2.describe()

In [None]:
# Pairwise correlations of the preprocessed columns, leaving out the
# 'PassengerId' and 'Name' columns.
corr = data2.drop(columns=['PassengerId', 'Name']).corr()
# Correlation of every column with the target, highest value first.
corr.loc[:, 'Transported'].sort_values(ascending=False)

In [None]:
# Preprocess a copy of the test set with the same pipeline object.
test_data_copy = test_data_raw.copy()
# NOTE(review): fit_transform (not transform) is called here, so the pipeline
# steps are fitted again on the test frame instead of reusing what was learned
# from the training frame — confirm this is intended.
test_prepared = ct.fit_transform(test_data_copy)
# Missing-value count per column of the result, shown as the cell output.
test_prepared.isna().sum()

In [None]:
# from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier, BaggingClassifier


# Fixed seed so that repeated runs train the same model and report the same score.
RANDOM_STATE = 42

# best_clf = RandomForestClassifier(min_samples_leaf=4, min_samples_split=6)
best_gbc = GradientBoostingClassifier(
    loss='exponential',
    max_depth=15,
    n_estimators=200,
    subsample=1.0,
    random_state=RANDOM_STATE,
)

# Bagging ensemble built on the gradient-boosting classifier.
bagging = BaggingClassifier(best_gbc, random_state=RANDOM_STATE)

# Features: everything except the target and the two columns that are not used.
X = data2.drop(columns=['Transported', 'PassengerId', 'Name'])
y = data2['Transported']

# best_clf.fit(X, y)
bagging.fit(X, y)
# This is accuracy on the training data itself, so it is an optimistic figure
# and not an estimate of how the model does on unseen data.
print(bagging.score(X, y))

In [None]:
# Build the test feature matrix with exactly the training columns, in the
# training order. One-hot columns can differ between the two frames when a
# category occurs in only one of them: a column missing from the test frame is
# added as all zeros, and a column the model never saw is dropped.
test_prepared_x = (
    test_prepared
    .drop(columns=['PassengerId', 'Name'])
    .reindex(columns=X.columns, fill_value=0)
)
# preds = best_clf.predict(test_prepared_x)
preds = bagging.predict(test_prepared_x)

In [None]:
# Assemble the submission: one row per test passenger with the prediction.
# The predictions are cast to bool so the column holds True/False even when
# the model was trained on a numerically encoded target.
# NOTE(review): assumes the raw target is boolean — confirm against train.csv.
sub = pd.DataFrame({
    'PassengerId': test_prepared['PassengerId'],
    'Transported': np.asarray(preds).astype(bool),
})

# to_csv returns None when given a path, so its result is not kept.
sub.to_csv('submission.csv', index=False)

In [None]:
# Read the written file back and display it to check what was saved.
submission = pd.read_csv('submission.csv')
submission