# Related resources 

Features:
* https://www.kaggle.com/hiro5299834/tps-apr-2021-single-decisiontreemodel
* https://www.kaggle.com/sociopath00/random-forest-using-gridsearchcv
* https://www.kaggle.com/dwin183287/tps-april-2021-models-feature-enginering
* https://www.kaggle.com/hiro5299834/tps-apr-2021-voting-pseudo-labeling

LogisticRegression tuning:
* https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
* https://stackoverflow.com/questions/21816346/fine-tuning-parameters-in-logistic-regression/2181881

# Loading Libraries and Data

In [None]:
import numpy as np
import pandas as pd

from scipy.stats import skew, boxcox

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder,RobustScaler, PowerTransformer, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, f1_score

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this presumably also hides solver convergence warnings raised during the
# grid search over `max_iter` further down — confirm that is intended.
warnings.simplefilter('ignore')

In [None]:
# Read the competition files in a fixed order: training set, test set, and the
# sample submission that is later used as the template for the output file.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-apr-2021/train.csv',
        '../input/tabular-playground-series-apr-2021/test.csv',
        "../input/tabular-playground-series-apr-2021/sample_submission.csv",
    )
)

# EDA

In [None]:
# Class balance of the target: one bar per value of `Survived`.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Survived', data=train, ax=ax)
ax.set_title("Count of Survival")
plt.show()

In [None]:
# Share of passengers with Survived == 1. The `%` format type multiplies the
# fraction by 100 and appends the percent sign, so the printed number really
# is a percentage (the raw fraction was previously printed with a "%" suffix).
survival_rate = (train['Survived'] == 1).mean()
print(f"{survival_rate:.2%} survived")

In [None]:
# Print a summary of the training frame: column names, non-null counts and dtypes.
train.info()

In [None]:
# Spearman rank correlation between the target and the numeric columns.
corr_columns = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
spearman_corr = train[corr_columns].corr(method='spearman')

with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(12, 10))
    # Fixed colour range [-1, 1] so the colour scale is symmetric around zero.
    ax = sns.heatmap(
        spearman_corr,
        annot=True,
        square=True,
        vmin=-1,
        vmax=1,
        ax=ax,
    )

Fare and Age correlate positively with Survived; Pclass correlates negatively.

Pclass and Fare are also notably correlated, which likely means that a higher class number goes with a lower fare.

## Impact of categorical features on survival

In [None]:
# Survival counts split by passenger sex.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='Survived', hue='Sex', data=train, ax=ax)
ax.set_title("Impact of Sex on Survival")
plt.show()

Female passengers in the training data survived in larger numbers than male passengers.

In [None]:
# Survival counts split by port of embarkation.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='Survived', hue='Embarked', data=train, ax=ax)
ax.set_title("Impact of Embarked on Survival")
plt.show()

Passengers who embarked at Q and C survived in larger numbers in the training data than those who embarked at S.

In [None]:
# Survival counts split by passenger class.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='Survived', hue='Pclass', data=train, ax=ax)
ax.set_title("Impact of Pclass on Survival")
plt.show()

Passengers in Pclass 3 have a significantly lower chance of survival than those in passenger classes 1 and 2.

# Data pre-processing

In [None]:
# Keep the target as a NumPy array before it is removed from the feature frame.
y = train['Survived'].to_numpy()

# The target and the row identifier are not used as model inputs.
train = train.drop(columns=['Survived', 'PassengerId'])
test = test.drop(columns=['PassengerId'])

print(f"train size is : {train.shape}")
print(f"test size is : {test.shape}")

# Imputing

In [None]:
# Percentage of missing values per training column.
df_na = train.isnull().sum().div(len(train)).mul(100)

# Keep only columns that actually have gaps, worst first (at most 30 rows).
df_na = df_na[df_na != 0].sort_values(ascending=False).head(30)

missing_data = pd.DataFrame({'Missing Ratio': df_na})
missing_data.head(5)

In [None]:
# Missing cabins get the sentinel string '0'.
train['Cabin'] = train['Cabin'].fillna('0')
test['Cabin'] = test['Cabin'].fillna('0')

# Every fill value is learned from the training set and then applied to both
# frames. Previously Ticket and Fare in the test set were filled from the test
# set's own statistics while Age and Embarked used the training set; using the
# training statistics everywhere keeps the two frames preprocessed identically.
fill_values = {
    'Ticket': train['Ticket'].mode()[0],      # most frequent ticket
    'Age': train['Age'].median(),
    'Embarked': train['Embarked'].mode()[0],  # most frequent port
    'Fare': train['Fare'].mean(),
}

for column, fill_value in fill_values.items():
    train[column] = train[column].fillna(fill_value)
    test[column] = test[column].fillna(fill_value)

In [None]:
# Remaining missing values per training column after imputation.
train.isna().sum()

# Feature Engineering

In [None]:
def _ticket_prefix(ticket):
    """Reduce raw ticket strings to their non-numeric part.

    Punctuation is removed first and all digits last, so a purely numeric
    ticket becomes an empty string.
    """
    return (
        ticket
        # regex=True is spelled out on purpose: pandas >= 2.0 treats the
        # pattern as a literal string by default, which would silently leave
        # the punctuation in place. Raw strings avoid invalid-escape warnings.
        .str.replace(r'[^\w\s]', '', regex=True)
        # Series.replace matches whole values, so this only blanks a ticket
        # that is exactly one space; spaces inside a ticket are kept.
        # NOTE(review): if the intent was to strip all spaces, use
        # .str.replace(' ', '') instead — confirm before changing features.
        .replace(' ', '')
        .fillna('NA')
        .replace(r'(\d)', '', regex=True)
    )


def _add_features(frame):
    """Add the engineered columns to `frame` in place and drop the raw text columns."""
    # First character of the cabin string ('0' is the missing-cabin sentinel).
    frame['Cabin_'] = frame['Cabin'].str[0].str.strip()

    # 1 when a cabin was recorded, 0 for the '0' sentinel.
    frame['HasCabin'] = (frame['Cabin'] != '0').astype(int).astype('category')

    frame['Ticket_'] = _ticket_prefix(frame['Ticket'])

    # Text between the first comma of Name and the next period (if any).
    frame['FirstName'] = (
        frame['Name'].str.split(',').str[1].str.split('.').str[0].str.strip()
    )

    frame['IsWoman'] = (frame['Sex'] == 'female').astype('category')

    # Siblings/spouses plus parents/children travelling with the passenger.
    frame['FamilySize'] = frame['SibSp'] + frame['Parch']

    # The raw text columns are replaced by the derived features above.
    frame.drop(['Name', 'Cabin', 'Ticket'], axis=1, inplace=True)


# Apply the identical transformation to both frames.
for _frame in (train, test):
    _add_features(_frame)

In [None]:
# Split the columns by dtype: object/category columns are treated as
# categorical, everything else as numeric.
column_dtypes = train.dtypes
is_categorical = (column_dtypes == "object") | (column_dtypes == 'category')
numeric_feats = column_dtypes[~is_categorical].index.tolist()
object_feats = column_dtypes[is_categorical].index.tolist()


def _column_skew(values):
    """Sample skewness of one column, ignoring missing entries."""
    return skew(values.dropna())


skewed_feats = train[numeric_feats].apply(_column_skew).sort_values(ascending=False)
skewness = pd.DataFrame({'Skew': skewed_feats})
print(skewness)

# log(1 + x) is applied to every numeric column of both frames.
# NOTE(review): the skew values above are only printed, not used as a filter,
# so low-skew columns are transformed as well — confirm this is intended.
for frame in (train, test):
    for column in skewness.index:
        frame[column] = np.log1p(frame[column])

# Modeling

In [None]:
# Categories unseen during fit are encoded as all-zero rows instead of raising.
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
numeric_scaler = RobustScaler()

# One-hot encode the object/category columns and scale the numeric ones.
preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_encoder, object_feats),
    ('num', numeric_scaler, numeric_feats),
])

In [None]:
# Preprocessing followed by logistic regression. The step name 'a' is the
# prefix used for the hyper-parameter names in the search grid.
logistic_model = LogisticRegression(random_state=42)
clf = Pipeline([
    ('pre', preprocessor),
    ('a', logistic_model),
])

In [None]:
# Search space for the logistic-regression step (pipeline step name 'a'):
# 20 evenly spaced C values in [0.05, 0.07], 8 iteration caps (40..110)
# and 5 solvers.
param_grid = dict(
    a__C=list(np.linspace(0.05, 0.07, 20)),
    a__max_iter=list(range(40, 120, 10)),
    a__solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
)

In [None]:
# Exhaustive search over `param_grid`, scored by accuracy with 10-fold CV.
a = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='accuracy', cv=10)
a.fit(train, y)

# Show the pipeline with the winning hyper-parameters.
a.best_estimator_

In [None]:
# GridSearchCV was built with its default refit=True, so `best_estimator_` has
# already been refitted on the full training data with the best parameters.
# Fitting it a second time repeated that work without changing the model, so
# the predictions are taken directly.
predictions = a.best_estimator_.predict(test)

In [None]:
# Put the predictions into the sample-submission template and write it out
# without the index column.
sub = sub.assign(Survived=predictions)
sub.to_csv('submission.csv', index=False)