## Load libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, ShuffleSplit
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore")

## Read data

In [None]:
# Load the training table, the test table and the sample submission in one
# pass; the three names are bound in the same order as the paths below.
train, test, sub_lr = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-apr-2021/train.csv',
        '/kaggle/input/tabular-playground-series-apr-2021/test.csv',
        '/kaggle/input/tabular-playground-series-apr-2021/sample_submission.csv',
    )
)

## Feature engineering

In [None]:
# Detach the label column from the training frame (pop removes it in place
# and hands it back), then stack train on top of test under a fresh 0..n-1
# index so both go through the same feature engineering.
target = train.pop('Survived')

df = pd.concat([train, test], axis=0, ignore_index=True)

In [None]:
# Categorical candidates: names of the columns stored with the `object` dtype.
feat_cat = [name for name, dtype in df.dtypes.items() if dtype == 'object']
feat_cat

In [None]:
# Embarked: missing values get their own 'X' category.
df['Embarked'] = df['Embarked'].fillna('X')


# Ticket: keep the first whitespace-separated token when the ticket has more
# than one token; single-token tickets and missing values all become 'X'.
def _ticket_prefix(raw_ticket):
    tokens = str(raw_ticket).split()
    return tokens[0] if len(tokens) > 1 else 'X'


df['Ticket'] = df['Ticket'].fillna('X').map(_ticket_prefix)

# Cabin: reduce to the first character, with 'X' standing in for missing cabins.
df['Cabin'] = df['Cabin'].fillna('X').map(lambda cabin: cabin[0].strip())

# Name: keep only the text before the first comma (the surname part).
df['Name'] = df['Name'].map(lambda full_name: full_name.partition(',')[0])

In [None]:
# Numerical candidates: dtype listing of every column that is not `object`.
column_dtypes = df.dtypes
feat_num = column_dtypes[column_dtypes != "object"]
feat_num

In [None]:
# Family: SibSp + Parch + 1 (the extra 1 presumably counts the passenger).
df['Family'] = 1 + df['SibSp'] + df['Parch']
def f(col):
    """Map a family head-count to a coarse size label.

    1 -> 'Single', 2 -> 'Couple', 3 -> 'Small_family', 4-5 -> 'Mid_family',
    6-7 -> 'Big_family'; every other value falls back to 'Super_family'.
    """
    size_labels = {
        1: 'Single',
        2: 'Couple',
        3: 'Small_family',
        4: 'Mid_family',
        5: 'Mid_family',
        6: 'Big_family',
        7: 'Big_family',
    }
    return size_labels.get(col, 'Super_family')
# Label every row's family head-count with its size category.
df['FamilySize'] = df['Family'].map(f)

# Age / Fare: fill each missing value with the mean of its
# (Pclass, Embarked, Sex) group, computed over the combined frame.
group_keys = ["Pclass", "Embarked", "Sex"]
aux = df.groupby(group_keys)[["Age", "Fare"]].mean()

# Temporary per-row group key, used to look rows up in the group means.
df["MultiIndex"] = pd.MultiIndex.from_frame(df[group_keys])
for column in ("Age", "Fare"):
    missing = df[column].isna()
    df.loc[missing, column] = df.loc[missing, "MultiIndex"].map(aux[column])
df.drop(columns=["MultiIndex"], inplace=True)

def f(col):
    """Bucket an age into a ten-year band label.

    Each label names the band's exclusive upper bound: values below 10 give
    '10s', [10, 20) gives '20s', and so on up to [80, 90) -> '90s'. Anything
    that is not below 90 (including NaN, which compares False) gives '100s'.
    """
    upper_bounds = (
        (10, '10s'),
        (20, '20s'),
        (30, '30s'),
        (40, '40s'),
        (50, '50s'),
        (60, '60s'),
        (70, '70s'),
        (80, '80s'),
        (90, '90s'),
    )
    # Bounds are ascending, so the first bound the value is below decides.
    for bound, label in upper_bounds:
        if col < bound:
            return label
    return '100s'
# Label every row's (imputed) age with its ten-year band.
df['AgeSize'] = df['Age'].map(f)

def f(col):
    """Bucket a fare into a price-tier label.

    Tiers by exclusive upper bound: <5 'Super_cheap', <10 'Very_cheap',
    <20 'Cheap', <40 'Moderate', <100 'Expensive', <200 'Very_expensive',
    <300 'Super_expensive'. Anything that is not below 300 (including NaN,
    which compares False) gives 'Mega_expensive'.
    """
    tiers = (
        (5, 'Super_cheap'),
        (10, 'Very_cheap'),
        (20, 'Cheap'),
        (40, 'Moderate'),
        (100, 'Expensive'),
        (200, 'Very_expensive'),
        (300, 'Super_expensive'),
    )
    # Bounds are ascending, so the first bound the value is below decides.
    for bound, label in tiers:
        if col < bound:
            return label
    return 'Mega_expensive'
# Label every row's (imputed) fare with its price tier.
df['FareSize'] = df['Fare'].map(f)

In [None]:
# All 2-element combinations of the categorical columns, in listing order.
pair_source_columns = [
    'Pclass',
    'Sex',
    'Cabin',
    'Embarked',
    'FamilySize',
    'AgeSize',
    'FareSize',
]
comb = list(itertools.combinations(pair_source_columns, 2))
comb

In [None]:
# One string-valued interaction column per pair, named '<left>_<right>' and
# holding the two source values joined by an underscore.
for left, right in comb:
    df[left + '_' + right] = df[left].astype(str) + '_' + df[right].astype(str)

## Encoding

In [None]:
# Columns used as-is, as numeric features.
feat_num = [
    'Age',
    'SibSp',
    'Pclass',
    'Parch',
    'Fare',
    'Family',
]

# Columns expanded into indicator (dummy) columns.
feat_onehot = [
    'Cabin',
    'Embarked',
    'Pclass_Sex',
    'Pclass_Cabin',
    'Pclass_Embarked',
    'Sex_Cabin',
    'Sex_Embarked',
    'Cabin_Embarked',
]

# Columns converted to integer codes ('Name' and 'Ticket' are left out).
feat_label = [
    'Sex',
    'FamilySize',
    'AgeSize',
    'FareSize',
]

In [None]:
def label_encoder(col):
    """Return the integer codes of ``col`` from a newly fitted LabelEncoder.

    A fresh encoder is created on every call and then discarded, so the
    fitted mapping is not kept for later reuse.
    """
    return LabelEncoder().fit_transform(col)

# Build the three feature blocks: numeric columns unchanged, indicator
# columns from get_dummies, and integer codes from label_encoder (applied
# to each column separately).
df_num = df.loc[:, feat_num]
df_onehot = pd.get_dummies(df.loc[:, feat_onehot])
df_label = df.loc[:, feat_label].apply(label_encoder)

In [None]:
# Join the numeric, one-hot and label-encoded blocks side by side.
feature_blocks = [df_num, df_onehot, df_label]
df_all = pd.concat(feature_blocks, axis=1)

# Sanity check: total number of missing cells, and the final matrix shape.
df_all.isnull().values.sum(), df_all.shape

In [None]:
# Train rows were concatenated first, so the leading train.shape[0] rows of
# df_all are the training features and the remaining rows are the test set.
n_train = train.shape[0]

X = df_all.iloc[:n_train].to_numpy()
y = target.to_numpy()

X_test = df_all.iloc[n_train:].to_numpy()

## Logistic Regression

In [None]:
# L2-regularised logistic regression; class 0 is weighted slightly above
# class 1.
params = {
    'penalty': 'l2',
    'C': 15.724316694262722,
    'class_weight': {0: 1.1, 1: 1},
}

model = LogisticRegression(solver='lbfgs', random_state=1337, **params)

# Accuracy over 5-fold stratified cross-validation, repeated twice (10 fits),
# run on all available cores.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=1337)
scores = cross_val_score(
    model,
    X,
    y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)
print(f'Average Accuracy: {np.mean(scores)}')

In [None]:
# Refit on the full training set (fit returns the estimator itself), then
# predict the class of every test row.
preds = model.fit(X, y).predict(X_test)

In [None]:
# Put the predictions into the sample-submission frame and write it out as
# CSV without the index column.
sub_lr['Survived'] = preds
sub_lr.to_csv(path_or_buf="submission_lr.csv", index=False)

In [None]:
# Histogram of the predicted labels, as a quick look at the class balance.
survived_predictions = sub_lr['Survived']
survived_predictions.hist()