In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the train and test CSV files, using the PassengerId column as the index.
train, test = (
    pd.read_csv(csv_path, index_col="PassengerId")
    for csv_path in (
        "../input/tabular-playground-series-apr-2021/train.csv",
        "../input/tabular-playground-series-apr-2021/test.csv",
    )
)

In [None]:
# Keep the target aside and stack train + test so both receive the same
# preprocessing. `train` is not mutated (no in-place drop), so re-running this
# cell does not fail with a KeyError on "Survived".
# The first len(train) rows of `df` are the training rows.
y = train["Survived"]
df = pd.concat([train.drop(columns=["Survived"]), test])

We are going to use the following 4 variables.

In [None]:
# Keep only the four variables used by the model. The explicit copy matters:
# later cells assign into these columns, and writing into a slice of the
# concatenated frame can trigger pandas' SettingWithCopyWarning.
df = df[["Pclass","Sex","Embarked","Cabin"]].copy()

In [None]:
# Number of missing values in each selected column.
missing_per_column = df.isna().sum()
missing_per_column

### Cabin variable. Extract the first letter and fill NaN

In [None]:
# Keep only the first character of each Cabin value; missing cabins become "N".
cabin_initial = df["Cabin"].str.get(0)
df["Cabin"] = cabin_initial.fillna("N")
# Show how many rows fall into each resulting category.
df["Cabin"].value_counts()

In [None]:
# Row count and mean of Survived per Cabin category, on the training rows only
# (the first len(train) rows of df, joined with the target by index).
train_rows = df.iloc[:len(train)].join(y)
train_rows.groupby(["Cabin"]).agg({"Cabin": "count", "Survived": "mean"})

The T, G and F categories have very few cases, so we relabel T as N and merge G into F.

In [None]:
# Relabel the rare Cabin categories: T becomes N, G becomes F.
for rare_cabin, replacement in (("T", "N"), ("G", "F")):
    df.loc[df["Cabin"].eq(rare_cabin), "Cabin"] = replacement

In [None]:
# Same per-Cabin summary as before, recomputed after relabelling T and G.
train_rows_merged = df.iloc[:len(train)].join(y)
train_rows_merged.groupby(["Cabin"]).agg({"Cabin": "count", "Survived": "mean"})

## Fill NaN of the Embarked variable

Fill each NaN with the most common category (C, S or Q) of its respective subgroup, where a subgroup is a combination of Pclass, Sex and Cabin.

In [None]:
# Most frequent Embarked value within each (Pclass, Sex, Cabin) subgroup.
subgroup_keys = ["Pclass", "Sex", "Cabin"]
embarked_by_subgroup = df.groupby(subgroup_keys)["Embarked"]
embarked_by_subgroup.agg(lambda values: values.value_counts().index[0])

In [None]:
# Impute each missing Embarked with the most frequent value of the row's
# (Pclass, Sex, Cabin) subgroup.
group_keys = ["Pclass", "Sex", "Cabin"]


def _most_common(values):
    """Return the most frequent non-null value, or NaN if there is none.

    Guards against subgroups where every Embarked is missing: there
    `value_counts()` is empty and `.index[0]` would raise IndexError.
    """
    counts = values.value_counts()
    return counts.index[0] if len(counts) else np.nan


aux = df.groupby(group_keys)["Embarked"].agg(_most_common)

# Look up each row's subgroup mode with a left join on the grouping columns.
# This keeps df's row order and avoids adding/dropping a temporary tuple column.
subgroup_mode = df[group_keys].join(aux.rename("subgroup_mode"), on=group_keys)["subgroup_mode"]

# Assign positionally so the fill does not depend on index alignment.
missing = df["Embarked"].isna().to_numpy()
df.loc[missing, "Embarked"] = subgroup_mode.to_numpy()[missing]

# Fallback: rows whose subgroup had no observed value get the overall mode.
overall_mode = df["Embarked"].mode()
if not overall_mode.empty:
    df["Embarked"] = df["Embarked"].fillna(overall_mode.iloc[0])

In [None]:
# Re-count missing values per column after the imputation steps.
remaining_missing = df.isna().sum()
remaining_missing

## Target Encoding

We are going to encode the four variables with the mean of the target.

In [None]:
def target_encoding(features,df,y):
    """Replace each feature's values with the mean target of its category, in place.

    The first ``len(y)`` rows of ``df`` are treated as the labelled rows. They
    are joined with ``y`` by index, and the per-category mean of the target is
    computed on those rows only. The mapping is then applied to every row of
    ``df``; values that never occur in the labelled rows become NaN.

    Parameters
    ----------
    features : iterable of column labels in ``df`` to encode.
    df : pandas.DataFrame, modified in place.
    y : pandas.Series holding the target for the labelled rows.

    Returns
    -------
    None
    """
    # Take the labelled-row count and the target name from `y` itself rather
    # than from a notebook-level `train` frame and a hard-coded "Survived".
    n_labelled = len(y)
    target_name = y.name if y.name is not None else "target"
    target = y.rename(target_name)
    for feature in features:
        labelled = df.iloc[:n_labelled].join(target)
        category_means = labelled.groupby(feature)[target_name].mean()
        df[feature] = df[feature].map(category_means)

In [None]:
# Encode every column of df with the per-category mean of the target (in place).
target_encoding(features=df.columns, df=df, y=y)

## Feature Creation

Here we add a new feature: the row-wise standard deviation of the encoded variables, binned into 5 quantile groups and then encoded with the mean of the target.

In [None]:
# Row-wise standard deviation of the encoded variables. An existing "std"
# column is excluded, so re-running this cell does not feed the previous
# result back into the computation.
df["std"] = df.drop(columns=["std"], errors="ignore").std(axis=1)

In [None]:
# Bin the row-wise std into 5 quantile-based intervals.
df["std"] = pd.qcut(df["std"], q=5)
# Mean of Survived per interval, computed on the training rows only.
mean = df.iloc[:len(train)].join(y).groupby("std")["Survived"].mean()
# Replace each interval with its mean target and store it as a float column.
df["std"] = df["std"].map(mean).astype(float)

## 1. Visualization: variables vs. target

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Boolean masks over the training target: rows with y == 1 and rows with y == 0.
target_mask = y.eq(1)
non_target_mask = y.eq(0)

In [None]:
# For every column, overlay the density of the feature for each target value,
# using the training rows only.
train_features = df.iloc[:len(train)]
for col in df.columns:
    fig, ax = plt.subplots(1, 1, figsize=(10, 4))
    sns.kdeplot(train_features.loc[non_target_mask, col], ax=ax, label='Target == 0')
    sns.kdeplot(train_features.loc[target_mask, col], ax=ax, label='Target == 1')

    ax.set_title('name: {}'.format(col))
    # The `label=` arguments only register legend entries; without an explicit
    # legend call the two curves cannot be told apart.
    ax.legend()
    plt.show()
    # Release the figure so figures do not accumulate across iterations.
    plt.close(fig)

In [None]:
# Pairwise Pearson correlation between all columns of df.
df.corr(method="pearson")

## 2. Models

### 2.1 Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Hold out 1000 training rows for validation. The fixed random_state makes the
# split, and therefore the reported accuracies, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    df[:len(train)], y, test_size=1000, random_state=42
)

In [None]:
# Logistic regression capped at 300 solver iterations, fitted on the training split.
lr = LogisticRegression(max_iter=300)
lr.fit(X=x_train, y=y_train)

In [None]:
# Accuracy on the held-out rows and on the rows the model was fitted on.
holdout_accuracy = accuracy_score(y_test, lr.predict(x_test))
fit_accuracy = accuracy_score(y_train, lr.predict(x_train))
print("Test accuracy: ", holdout_accuracy, ", Train accuracy: ", fit_accuracy)

We can see the weights of the model for the variables:

In [None]:
# One row per feature with its fitted coefficient, rounded to 2 decimals.
split_model_weights = lr.coef_.ravel().round(2)
pd.DataFrame({"Variable": df.columns, "Weights": split_model_weights})

Now we are going to train the logistic regression on the whole training set and submit.

In [None]:
# Refit the same estimator on all training rows (the first len(train) rows of df).
lr.fit(df.iloc[:len(train)], y)

In [None]:
# Accuracy of the refitted model on the rows it was trained on.
full_train_predictions = lr.predict(df.iloc[:len(train)])
accuracy_score(y, full_train_predictions)

In [None]:
# Coefficients of the model refitted on all training rows, rounded to 2 decimals.
full_model_weights = lr.coef_.ravel().round(2)
pd.DataFrame({"Variable": df.columns, "Weights": full_model_weights})

In [None]:
# Load the sample submission, overwrite its Survived column with predictions
# for the rows of df that follow the training rows, and write the result out.
# NOTE(review): predictions are assigned positionally, so this assumes the
# sample submission lists passengers in the same order as the test file. Confirm.
sample_submission_path = "../input/tabular-playground-series-apr-2021/sample_submission.csv"
sub = pd.read_csv(sample_submission_path, index_col = "PassengerId")
test_predictions = lr.predict(df.iloc[len(train):])
sub["Survived"] = test_predictions
sub.to_csv('sub_lg_4var.csv')