In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition train/test CSV files.
train_csv = "/kaggle/input/tabular-playground-series-apr-2021/train.csv"
test_csv = "/kaggle/input/tabular-playground-series-apr-2021/test.csv"
train = pd.read_csv(train_csv)
# Give test an all-NaN 'Survived' column and put its columns in train's order,
# so both frames share the same layout.
test = pd.read_csv(test_csv).assign(Survived=np.nan)[train.columns]

## Determine Preprocessing Steps

Try out each step and determine a good pipeline.

Then build that pipeline out of scikit-learn `Pipeline` components.

### Inspect Data

In [None]:
# Per-column count of missing values, followed by the column dtypes.
# isna().sum() yields exact integer counts; a mean-times-row-count formulation
# would give floats that can carry rounding error.
print(train.isna().sum())
print(train.dtypes)

#### Inspect Numerical Features

In [None]:
# Working copy of the columns used for the age / sex / class survival plots.
sex_codes = {"male": 1, "female": 0}
age_df = train.loc[:, ['Age', 'Survived', 'Sex', 'Pclass']].copy()
# Bucket ages into 2-year bins (floor to the nearest even number).
age_df["Age_trunc"] = age_df["Age"].floordiv(2).mul(2)
# Encode Sex as an integer: male -> 1, female -> 0, anything else (incl. NaN) -> 2.
age_df['Sex'] = age_df['Sex'].map(lambda val: sex_codes.get(val, 2))

In [None]:
import matplotlib.pyplot as plt
# Mean survival rate for every (2-year age bin, sex code) pair, as a scatter plot.
survival_by_age_sex = (
    age_df[['Age_trunc', 'Sex', 'Survived']]
    .groupby(['Age_trunc', 'Sex'])
    .mean()
    .reset_index()
)
survival_by_age_sex.plot.scatter(x="Age_trunc", y="Survived", alpha=0.5)


In [None]:
# Mean survival rate for every (Pclass, sex code) combination, as a bar chart.
survival_by_class_sex = (
    age_df[['Pclass', 'Sex', 'Survived']]
    .groupby(['Pclass', 'Sex'])
    .mean()
)
survival_by_class_sex.plot.bar()

Age shows an unexpected trend: teenagers and the very old were the most likely to survive, while people aged 16-40 and children under 10 were the least likely to survive.

#### Inspect Categorical Features

In [None]:
# Frequency of each Embarked value; dropna=False counts NaN as its own category.
train['Embarked'].value_counts(dropna=False)

In [None]:
# Distribution of ticket prefixes: the first whitespace-separated token when the
# ticket has more than one token, otherwise the placeholder 'X' (also used for NaN).
# The length check and the token lookup both use split() (any whitespace), so a
# ticket made only of spaces maps to 'X' instead of raising IndexError, and the
# result agrees with StrFunctionTransformer.transform_ticket.
train['Ticket'].fillna('X').map(
    lambda x: str(x).split()[0] if len(str(x).split()) > 1 else 'X'
).value_counts()

In [None]:
# Frequency of each distinct Parch value (DataFrame.value_counts on a one-column frame).
train[['Parch']].value_counts()

In [None]:
# Frequency of each distinct SibSp value (DataFrame.value_counts on a one-column frame).
train[['SibSp']].value_counts()

### Conclusions

We can see that the preprocessing needed is:

* handling nans for: age, ticket, fare, cabin and embarked
    * numerical columns (age, fare): impute, add nan indicator column
    * string columns (ticket, cabin, embarked): create a new string for the nan indicator
* encode categorical columns:
    * transform and encode string columns: sex, ticket, cabin, embarked
    * encode categories: SibSp, Parch (that have around 10 categ values each)
    * name column will be ignored for now

## Build Sklearn pipeline for feature transformations

In [None]:
# Column groups consumed by the preprocessing pipelines below.
# categ_feats: columns routed through the categorical (string transform + encoder) branch.
categ_feats = ['Sex', 'Pclass', 'SibSp', 'Parch', 'Cabin', 'Embarked', 'Ticket']
# num_feats: columns routed through the numerical (impute + scale) branch.
num_feats = ['Age', 'Fare']

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
from sklearn.base import TransformerMixin, BaseEstimator

# Transformer that collapses the free-form string columns (Cabin, Ticket,
# Embarked) into coarse string categories. Every other column is passed
# through untouched; numeric encoding is left to a downstream encoder.
class StrFunctionTransformer(TransformerMixin, BaseEstimator):
    """Stateless transformer that simplifies selected string columns.

    Columns named in ``func_map`` are replaced by the output of the mapped
    method, cast to ``str``. Columns not listed there are returned unchanged.
    Nothing is learned in ``fit``.
    """

    def __init__(self):
        # Dispatch table: column name -> method producing the replacement column.
        self.func_map = {"Cabin": self.transform_cabin,
                         "Ticket": self.transform_ticket,
                         "Embarked": self.transform_emb}

    def fit(self, X, y=None, **fit_params):
        """No fitting is required; return ``self`` to satisfy the estimator API."""
        return self

    def transform(self, input_df, **transform_params):
        """Return a copy of ``input_df`` with the mapped columns simplified.

        Parameters
        ----------
        input_df : pandas.DataFrame
            Frame whose columns are looked up by name in ``func_map``.

        Returns
        -------
        pandas.DataFrame
            A new frame; the input is not modified.
        """
        df = input_df.copy()
        for coln in df.columns:
            handler = self.func_map.get(coln)
            if handler is not None:
                df[coln] = handler(df).astype(str)
        return df

    def transform_cabin(self, df):
        """Reduce each cabin to its first non-blank character; missing -> 'Z'."""
        def first_char(value):
            # Strip BEFORE indexing so leading whitespace cannot become the
            # category, and blank strings fall back to the missing marker
            # instead of raising IndexError.
            text = str(value).strip()
            return text[0] if text else "Z"

        return df.loc[:, "Cabin"].fillna("Z0").map(first_char)

    def transform_ticket(self, df):
        """Keep the first token of multi-token tickets; everything else -> 'X'."""
        def prefix(value):
            tokens = str(value).split()
            return tokens[0] if len(tokens) > 1 else 'X'

        return df.loc[:, "Ticket"].fillna('X').map(prefix)

    def transform_emb(self, df):
        """Replace missing Embarked values with the literal string 'NA'."""
        return df.loc[:, "Embarked"].fillna('NA')


In [None]:
# Smoke-test the string-column transformer on the training frame.
dft = StrFunctionTransformer()
dft.fit(train).transform(train)

In [None]:
from sklearn.pipeline import make_union, make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.preprocessing import StandardScaler
import category_encoders as ce

# Build the feature-processing pipelines.
# Numerical branch: a missing-value indicator side by side with
# median imputation followed by standard scaling, applied to num_feats.
num_pipe = make_union(MissingIndicator(), 
                       make_pipeline(SimpleImputer(strategy='median'), StandardScaler()))
num_trans = make_column_transformer((num_pipe, num_feats))
# Categorical branch, step 1: simplify the string columns within categ_feats.
categ_trans = make_column_transformer((StrFunctionTransformer(), categ_feats))

# Categorical branch, step 2: target-encode the output of step 1.
encoder = ce.TargetEncoder(return_df=True) # encodes all string columns
categ_pipe = make_pipeline(categ_trans, encoder)

# Final feature matrix: categorical branch output followed by numerical branch output.
features_union = make_union(categ_pipe, num_trans)

In [None]:
# Check the pipeline works: fit the feature union on train and keep the result.
feats = features_union.fit_transform(train, train['Survived'])

# A single step can also be applied on its own: take the first branch of the
# union, then the first step of that branch, and transform the test frame with it.
first_branch = features_union.transformer_list[0][1]
first_step = first_branch.steps[0][1]
first_step.transform(test)

In [None]:
# Check that the fitted feature union can also transform the test frame.
features_union.transform(test)

### Inspect Correlations of New Features

In [None]:
# Correlation of every transformed feature column with the target, as a bar chart.
feature_corr = pd.DataFrame(feats).corrwith(train["Survived"])
feature_corr.plot.bar()

In [None]:
# Transformed features as a frame, with the target appended as a 'Survived' column.
feats_df = pd.DataFrame(feats).assign(Survived=train["Survived"])

In [None]:
# Mean of each transformed feature per target class; transposed so that each
# feature gets one group of bars.
feature_means_by_class = feats_df.groupby('Survived').mean()
feature_means_by_class.T.plot.bar()

### Conclusions

Some features could be removed: 2, 7, 8

## Setup sklearn classifier

In [None]:
from imblearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Normalizer, StandardScaler, OneHotEncoder
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, BaggingClassifier, VotingClassifier
import xgboost as xgb

class Classifier:
    """Preprocessing + LinearSVC pipeline behind a small train/predict API.

    The preprocessing step is built by ``build_prepro_pipe`` and chained with a
    ``LinearSVC`` inside a scikit-learn ``Pipeline``.

    Attributes:
        features_union: the preprocessing transformer (first pipeline step).
        classifier: the pipeline once ``train`` has been called, else ``None``.
    """

    def __init__(self, num_feats, categ_feats):
        """Build the (unfitted) pipeline.

        Args:
            num_feats: names of the numerical columns.
            categ_feats: names of the categorical columns.
        """
        # Other estimators could be swapped in here in place of the linear SVM.
        svc = LinearSVC(max_iter=2000, random_state=0)
        self.features_union = self.build_prepro_pipe(num_feats, categ_feats)

        self._pipeline = Pipeline(steps=[('preprocessor', self.features_union),
                                         ('classifier', svc)])
        # Assigned by train(); kept as None until then so predict() can report
        # a clear error instead of failing on a missing attribute.
        self.classifier = None

    @staticmethod
    def build_prepro_pipe(num_feats, categ_feats):
        """Return the feature union used as the preprocessing step.

        Relies on make_union, make_pipeline, make_column_transformer,
        MissingIndicator, SimpleImputer, StandardScaler, ``ce`` and
        StrFunctionTransformer being defined by earlier notebook cells.

        Args:
            num_feats: names of the numerical columns.
            categ_feats: names of the categorical columns.

        Returns:
            A union of the categorical branch (string simplification + target
            encoding) and the numerical branch (missing indicator alongside
            median imputation + standard scaling).
        """
        num_pipe = make_union(MissingIndicator(),
                              make_pipeline(SimpleImputer(strategy='median'), StandardScaler()))
        num_trans = make_column_transformer((num_pipe, num_feats))
        categ_trans = make_column_transformer((StrFunctionTransformer(), categ_feats))

        # encode categ feats
        encoder = ce.TargetEncoder(return_df=True)
        categ_pipe = make_pipeline(categ_trans, encoder)

        # make union of categ and numerical features
        features_union = make_union(categ_pipe, num_trans)
        return features_union

    def train(self, x, y):
        """Fit the whole pipeline on features ``x`` and target ``y``."""
        self.classifier = self._pipeline
        self.classifier.fit(x, y)

    def transform(self, x):
        """Apply only the preprocessing step of the pipeline to ``x``."""
        return self._pipeline.named_steps['preprocessor'].transform(x)

    def predict(self, x):
        """Predict labels for ``x``.

        Raises:
            AttributeError: if ``train`` has not been called yet.
        """
        if self.classifier is None:
            raise AttributeError("Classifier.train() must be called before predict()")
        return self.classifier.predict(x)

## Train and Predict

In [None]:
# Build the classifier wrapper from the numerical and categorical column lists.
clsi = Classifier(num_feats, categ_feats)

In [None]:
# Fit the full pipeline: the whole training frame is passed as features and
# its 'Survived' column as the target.
clsi.train(train, train["Survived"])

In [None]:
# Predict labels for the test frame with the trained pipeline.
preds = clsi.predict(test)

In [None]:
# The transformed test features can be inspected through the pipeline's
# preprocessing step.
clsi.transform(test)

In [None]:
# Compare the mean predicted label on test with the mean observed label on train.
predicted_rate = preds.mean()
observed_rate = train["Survived"].mean()
print(predicted_rate)
print(observed_rate)

In [None]:
# Overwrite the placeholder 'Survived' column of test with the predictions.
test['Survived'] = preds

In [None]:
# Per-predicted-class means of the numeric test columns.
# numeric_only=True is needed on pandas >= 2.0, where averaging string columns
# (e.g. Sex, Ticket) raises a TypeError instead of silently dropping them.
test.groupby('Survived').mean(numeric_only=True)

In [None]:
# Write the submission file: passenger id plus predicted label, without the index.
submission = test[['PassengerId', 'Survived']]
submission.to_csv("submission.csv", index=False)