#### *Aim: bring the Logistic Regression accuracy score to 80%*
#### *The pseudo-label (pl) data comes from [this notebook](https://www.kaggle.com/kalashnimov/logistic-regression-baseline)*

## Load libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import collections
import itertools
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.model_selection import cross_val_score, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore")

## Settings

In [None]:
# Name of the label column; used to attach pseudo-labels to `test` and to build `y`.
TARGET = 'Survived'
# Number of folds per repetition of the repeated stratified K-fold split.
N_SPLITS = 5
# Number of repetitions of the K-fold split; one prediction column is kept per repetition.
N_REPEATS = 3
# Random seed passed to both the CV splitter and the LogisticRegression model.
SEED = 267

## Read data

In [None]:
# Competition files: training set, test set and the submission template.
train = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/train.csv')
test = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/test.csv')
sub = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/sample_submission.csv')
# Output of an earlier logistic-regression baseline; its TARGET column is
# assigned to `test` as the target further down.
sub_lr = pd.read_csv('../input/tps-apr-lr-baseline/submission_lr.csv') 

In [None]:
# Pseudo-labelling: use the earlier baseline's predictions as the target of the test rows.
# NOTE(review): the assignment aligns on index, so this assumes `test` and
# `sub_lr` list the same rows in the same order — confirm.
test[TARGET] = sub_lr[TARGET]

# Stack train on top of test with a fresh 0..n-1 index, so rows with
# position < train.shape[0] are the original training rows.
df = pd.concat([train, test], axis = 0, ignore_index = True)

## Feature engineering

In [None]:
# categorical: names of the columns stored with object dtype, in frame order
feat_cat = df.columns[df.dtypes == 'object'].tolist()
feat_cat

In [None]:
def _ticket_prefix(ticket):
    """Return the first whitespace-separated token of a ticket.

    Tickets made of a single token (including the 'X' placeholder used for
    missing values) have no separate prefix and are mapped to 'X'.
    """
    tokens = str(ticket).split()
    return tokens[0] if len(tokens) > 1 else 'X'


# Embarked: missing values become their own 'X' category
df['Embarked'] = df['Embarked'].fillna('X')

# Ticket: keep only the leading token of multi-token tickets, 'X' otherwise
df['Ticket'] = df['Ticket'].fillna('X').map(_ticket_prefix)

# Cabin: reduce to the (stripped) first character, 'X' when missing
df['Cabin'] = df['Cabin'].fillna('X').map(lambda cabin: cabin[0].strip())

# Name: keep the text before the first comma (the surname, per the original note)
df['Name'] = df['Name'].map(lambda full_name: full_name.partition(',')[0])

In [None]:
# numerical: dtypes of every non-object column (a Series indexed by column
# name), displayed for inspection
feat_num = df.dtypes.loc[lambda dtypes: dtypes != "object"]
feat_num

In [None]:
# family: SibSp + Parch, plus one (presumably counting the passenger themself)
df['Family'] = df['SibSp'] + df['Parch'] + 1 
def f(col):
    """Map a family size to a named bucket.

    1 -> 'Single', 2 -> 'Couple', 3 -> 'Small_family', 4-5 -> 'Mid_family',
    6-7 -> 'Big_family'. Every other value (8 and above, but also 0,
    negatives or NaN) falls through to 'Super_family'.
    """
    buckets = (
        ((1,), 'Single'),
        ((2,), 'Couple'),
        ((3,), 'Small_family'),
        ((4, 5), 'Mid_family'),
        ((6, 7), 'Big_family'),
    )
    for sizes, label in buckets:
        if col in sizes:
            return label
    return 'Super_family'
df['FamilySize'] = df['Family'].apply(f)

# age, fare: impute missing values with the mean of the passenger's
# (Pclass, Embarked, Sex) group, computed over all rows of df.
group_keys = ["Pclass", "Embarked", "Sex"]
aux = df.groupby(group_keys)[["Age", "Fare"]].mean()
# Look the group means up per row by joining the key columns against aux's
# index. This replaces the temporary column of key tuples that used to be
# added and dropped again, and keeps df's row index so fillna aligns row by row.
row_means = df[group_keys].join(aux, on=group_keys)
for num_col in ("Age", "Fare"):
    df[num_col] = df[num_col].fillna(row_means[num_col])

def f(col):
    """Map an age to a decade bucket label.

    The label names the upper edge of the bucket: anything below 10 gives
    '10s', [10, 20) gives '20s', ... and [80, 90) gives '90s'. Every other
    value (90 and above, or NaN, for which all comparisons are false) gives
    '100s'.
    """
    labels = ('10s', '20s', '30s', '40s', '50s', '60s', '70s', '80s', '90s')
    for upper, label in zip(range(10, 100, 10), labels):
        if col < upper:
            return label
    return '100s'
# Bucket the (already imputed) ages with the `f` defined directly above.
df['AgeSize'] = df['Age'].apply(f)

def f(col):
    """Map a fare to a named price bucket.

    Buckets are half-open on the right: below 5 'Super_cheap', [5, 10)
    'Very_cheap', [10, 20) 'Cheap', [20, 40) 'Moderate', [40, 100)
    'Expensive', [100, 200) 'Very_expensive', [200, 300) 'Super_expensive'.
    Every other value (300 and above, or NaN) gives 'Mega_expensive'.
    """
    tiers = (
        (5, 'Super_cheap'),
        (10, 'Very_cheap'),
        (20, 'Cheap'),
        (40, 'Moderate'),
        (100, 'Expensive'),
        (200, 'Very_expensive'),
        (300, 'Super_expensive'),
    )
    for upper, label in tiers:
        if col < upper:
            return label
    return 'Mega_expensive'
# Bucket the (already imputed) fares with the `f` defined directly above.
df['FareSize'] = df['Fare'].apply(f)

In [None]:
# Every unordered pair of the seven categorical-style features, in
# itertools.combinations order; each pair becomes one interaction column.
pair_sources = ('Pclass', 'Sex', 'Cabin', 'Embarked', 'FamilySize', 'AgeSize', 'FareSize')
comb = [*itertools.combinations(pair_sources, 2)]
comb

In [None]:
# Build one interaction column per pair, named "<c1>_<c2>" and holding
# "<c1 value>_<c2 value>". Each source column is converted to str once up
# front instead of once per pair it takes part in.
as_text = {
    col: df[col].astype(str)
    for col in dict.fromkeys(itertools.chain.from_iterable(comb))
}
for c1, c2 in comb:
    df[f'{c1}_{c2}'] = as_text[c1] + '_' + as_text[c2]

## Encoding

In [None]:
# Numeric columns handed to the model as they are
feat_num = ['Age', 'SibSp', 'Pclass', 'Parch', 'Fare', 'Family']

# One-hot encoded columns: Cabin, Embarked and every pairwise interaction
# "<a>_<b>" of the seven categorical-style features, in combination order
feat_onehot = ['Cabin', 'Embarked'] + [
    f'{first}_{second}'
    for first, second in itertools.combinations(
        ['Pclass', 'Sex', 'Cabin', 'Embarked', 'FamilySize', 'AgeSize', 'FareSize'], 2
    )
]

# Label-encoded columns ('Name' and 'Ticket' are currently disabled)
feat_label = ['Sex', 'FamilySize', 'AgeSize', 'FareSize']  # ['Name', 'Ticket']

In [None]:
def label_encoder(col):
    """Integer-encode one column with a freshly fitted LabelEncoder.

    A new encoder is created on every call, so the codes of different
    columns are independent of each other. Returns whatever
    ``LabelEncoder.fit_transform`` produces for ``col``.
    """
    return LabelEncoder().fit_transform(col)

# NOTE(review): `sc` is not used by the visible code; the scaling line below
# is commented out.
sc = StandardScaler()

#df_num  = pd.DataFrame(sc.fit_transform(df[feat_num]), columns = feat_num)
# Numeric features are taken unscaled.
df_num = df[feat_num]
# One indicator column per distinct value of each one-hot feature.
df_onehot = pd.get_dummies(df[feat_onehot])
# Each label feature is integer-encoded independently, column by column.
df_label = df[feat_label].apply(label_encoder)
# Target for every row of df (pseudo-labels for the rows that came from test).
df_target = df[TARGET]

In [None]:
# Assemble the feature matrix column-wise: numeric, one-hot, label-encoded.
df_all = pd.concat([df_num, df_onehot, df_label], axis=1)

# Sanity check: total number of missing cells, and the matrix shape.
df_all.isnull().values.sum(), df_all.shape

In [None]:
# NumPy views of features and target; rows keep the train-then-test order of df.
X = df_all.values
y = df_target.values

## Logistic Regression

In [None]:
# Repeated stratified K-fold over the stacked train + pseudo-labelled test rows.
# One column per repetition: out-of-fold predictions for the train rows
# (lr_oof) and held-out predictions for the test rows (lr_preds).
n_train = train.shape[0]
lr_oof = np.zeros((n_train, N_REPEATS))
lr_preds = np.zeros((test.shape[0], N_REPEATS))

# Hyper-parameters are the same for every fold, so build them once.
params = {'penalty': 'l2', 'C': 83.79260077891932, 'class_weight': {0: 1.05, 1: 1}}

rskf = RepeatedStratifiedKFold(n_splits = N_SPLITS, n_repeats = N_REPEATS, random_state = SEED)
for fold, (train_idx, valid_idx) in enumerate(rskf.split(X, y)):
    print("=> Fold {}".format(fold + 1))

    # Folds are numbered consecutively across repetitions.
    repeat = fold // N_SPLITS

    # X holds the train rows first, then the test rows. Split the held-out
    # indices with a boolean mask: vectorised, and the resulting index arrays
    # keep an integer dtype even when one side is empty.
    in_train = valid_idx < n_train
    oof_idx = valid_idx[in_train]
    test_idx = valid_idx[~in_train]

    # The fitting rows include pseudo-labelled test rows outside this fold.
    x_train, y_train = X[train_idx], y[train_idx]
    x_valid, y_valid = X[oof_idx], y[oof_idx]
    x_test = X[test_idx]

    model = LogisticRegression(**params, random_state = SEED)
    model.fit(x_train, y_train)

    lr_oof[oof_idx, repeat] = model.predict(x_valid)
    acc_oof = accuracy_score(y_valid, lr_oof[oof_idx, repeat])
    # Shift the stacked positions back to 0-based test row positions.
    lr_preds[test_idx - n_train, repeat] = model.predict(x_test)
    print(f"ACC SCORE {acc_oof:.4f} \n")

    # After the last fold of a repetition every train row has been held out
    # once, so this repetition's out-of-fold column is complete.
    if (fold + 1) % N_SPLITS == 0:
        acc_overall = accuracy_score(y[:n_train], lr_oof[:, repeat])
        print(f"=> OVERALL ACC SCORE: {acc_overall:.4f} \n")

In [None]:
# Per test row, sum of the predicted labels over the N_REPEATS repetitions
# (i.e. how many repetitions predicted class 1).
preds = lr_preds.sum(axis = 1)

# Distribution of those vote counts.
collections.Counter(preds)

## Submit

In [None]:
# Majority vote: predict 1 when more than N_REPEATS // 2 repetitions voted 1.
majority = preds > N_REPEATS // 2
sub['Survived'] = majority.astype(int)
sub.to_csv("submission.csv", index = False)

In [None]:
# Histogram of the submitted labels, as a quick look at the class balance.
sub['Survived'].hist()

## References

* https://www.kaggle.com/hiro5299834/tps-apr-2021-voting-pseudo-labeling