EDA and logistic regression notebook:
* https://www.kaggle.com/pavfedotov/tps-april-eda-logistic-regression

Notebooks on features:
* https://www.kaggle.com/hiro5299834/tps-apr-2021-single-decisiontreemodel
* https://www.kaggle.com/sociopath00/random-forest-using-gridsearchcv
* https://www.kaggle.com/dwin183287/tps-april-2021-models-feature-enginering

## Loading data and libraries

In [None]:
import numpy as np
import pandas as pd

from scipy.stats import skew

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder,RobustScaler, PowerTransformer, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression

import warnings
warnings.simplefilter('ignore')

In [None]:
# Read the competition tables: training data, test data and the sample
# submission template (in that order).
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-apr-2021/train.csv',
        '../input/tabular-playground-series-apr-2021/test.csv',
        '../input/tabular-playground-series-apr-2021/sample_submission.csv',
    )
)

# Data pre-processing

In [None]:
# Pull the target out as a NumPy array before removing it from the features.
y = train['Survived'].values

# PassengerId is removed from both frames; Survived only exists in train.
train.drop(columns=['Survived', 'PassengerId'], inplace=True)
test.drop(columns=['PassengerId'], inplace=True)

# Report the resulting feature-matrix shapes.
print(f"train size is : {train.shape}")
print(f"test size is : {test.shape}")

## Imputing

In [None]:
# Impute missing values in both frames.
#
# Every fill statistic is computed once, on the training frame, and then
# applied to train and test alike so the two frames are imputed with the
# same values (previously test Ticket/Fare used test-set statistics while
# Age/Embarked used train-set statistics).
fill_values = {
    'Cabin': '0',                              # sentinel: no cabin recorded
    'Ticket': train['Ticket'].mode()[0],       # most frequent ticket
    'Age': train['Age'].median(),
    'Embarked': train['Embarked'].mode()[0],   # most frequent port
    'Fare': train['Fare'].mean(),
}

for frame in (train, test):
    for column, fill_value in fill_values.items():
        frame[column] = frame[column].fillna(fill_value)

## Feature Engineering

In [None]:
def _add_engineered_features(df):
    """Add the derived columns to *df* in place and drop their raw sources.

    Adds:
      * ``HasCabin``   - 0 when Cabin equals the '0' sentinel, otherwise 1.
      * ``Ticket_``    - the ticket with punctuation, spaces and digits removed.
      * ``FirstName``  - the text between the first comma and the following
                         period of ``Name``, stripped of surrounding whitespace.
      * ``FamilySize`` - ``SibSp + Parch``.

    The raw ``Name``, ``Cabin`` and ``Ticket`` columns are dropped afterwards.
    """
    df['HasCabin'] = (df['Cabin'] != '0').astype(int).astype('category')

    # regex= is passed explicitly: the default of Series.str.replace differs
    # between pandas versions, and a literal match would leave punctuation in.
    # Spaces are removed with .str.replace because Series.replace(' ', '')
    # only matches cells that are exactly ' ', never embedded spaces.
    ticket_prefix = (
        df['Ticket']
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.replace(' ', '', regex=False)
        .fillna('NA')
        .str.replace(r'\d', '', regex=True)
    )
    df['Ticket_'] = ticket_prefix.astype('category')

    after_comma = df['Name'].str.split(',').str[1]
    df['FirstName'] = (
        after_comma.str.split('.').str[0].str.strip().astype('category')
    )

    df['FamilySize'] = df['SibSp'] + df['Parch']

    df.drop(['Name', 'Cabin', 'Ticket'], axis=1, inplace=True)


# Apply the identical transformation to both frames.
_add_engineered_features(train)
_add_engineered_features(test)

In [None]:
# Split the columns into categorical-like (object / category dtype) and
# everything else, which is treated as numeric.
column_dtypes = train.dtypes
is_categorical = (column_dtypes == "object") | (column_dtypes == 'category')
numeric_feats = column_dtypes[~is_categorical].index.tolist()
object_feats = column_dtypes[is_categorical].index.tolist()

# Per-feature skewness of the numeric columns, most skewed first.
skewed_feats = (
    train[numeric_feats]
    .apply(lambda col: skew(col.dropna()))
    .sort_values(ascending=False)
)
skewness = pd.DataFrame({'Skew': skewed_feats})

# NOTE: no skewness threshold is applied - every numeric feature listed in
# `skewness` (i.e. all of numeric_feats) is log1p-transformed in both frames.
log_columns = skewness.index.tolist()
train[log_columns] = np.log1p(train[log_columns])
test[log_columns] = np.log1p(test[log_columns])

# Modeling

In [None]:
# Column-wise preprocessing: one-hot encode the categorical columns (categories
# unseen during fit are ignored at transform time) and robust-scale the
# numeric columns.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), object_feats)
numeric_step = ('num', RobustScaler(), numeric_feats)

preprocessor = ColumnTransformer(transformers=[categorical_step, numeric_step])

## Pipeline with Gradient Boosting
Sklearn Documentation on GradientBoosting: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

Related notebook on gridsearch: https://www.kaggle.com/hatone/gradientboostingclassifier-with-gridsearchcv

In [None]:
# Full model: preprocessing followed by a gradient-boosting classifier.
# The step names ('pre', 'a') are referenced by the parameter grid keys.
gradient_boosting = GradientBoostingClassifier(random_state=42)
clf = Pipeline(steps=[('pre', preprocessor), ('a', gradient_boosting)])

In [None]:
# Search space for the pipeline's classifier step (prefix 'a__').
# Candidates considered earlier but currently not searched:
#   a__max_features in ["log2", "sqrt"], a__criterion in ["friedman_mse", "mae"].
param_grid = {
    'a__n_estimators': [50, 100, 150, 200],
    'a__learning_rate': [0.155, 0.16, 0.165, 0.17],
    'a__max_depth': [3, 4, 5, 6],
}

In [None]:
# Exhaustive 10-fold cross-validated search over param_grid, ranked by
# balanced accuracy.
a = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='balanced_accuracy',
    cv=10,
)
a.fit(train, y)

# Display the winning pipeline.
a.best_estimator_

In [None]:
# The grid search is constructed without `refit=`, so scikit-learn's default
# (refit=True) applies: best_estimator_ has already been refit on the whole
# training set. Fitting it again would only repeat that training, so predict
# directly with the already-fitted estimator.
predictions = a.best_estimator_.predict(test)

In [None]:
# Replace the template's Survived column with the model output and export
# the submission file without the index column.
submission_path = 'submission.csv'
sub['Survived'] = predictions
sub.to_csv(submission_path, index=False)