In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Library imports

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Data loading

In [None]:
# Read both competition splits, indexing each frame by the passenger identifier.
index_columns = ['PassengerId']
train = pd.read_csv(
    '/kaggle/input/tabular-playground-series-apr-2021/train.csv',
    index_col=index_columns,
)
test = pd.read_csv(
    '/kaggle/input/tabular-playground-series-apr-2021/test.csv',
    index_col=index_columns,
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

## Check info on train and test datasets

In [None]:
# Column dtypes and non-null counts for the training frame.
train.info()

From the information above, we can see that the following columns have missing values:
+ Age
+ Ticket
+ Fare
+ Cabin
+ Embarked

In [None]:
# Column dtypes and non-null counts for the test frame.
test.info()

The test set has missing values in exactly the same columns.

In [None]:
# Percentage of missing entries in each training column
print('Missing percentage in Training data')
train.isna().sum().mul(100).div(train.shape[0])

In [None]:
# Percentage of missing entries in each test column
print('Missing percentage in Testing data')
test.isna().sum().mul(100).div(test.shape[0])

The training and testing datasets show a similar pattern of missing values.

In [None]:
# Number of distinct values per column.
train.nunique()

Based on the unique-value counts above, we can safely drop the 'Name' and 'Ticket' columns. The 'Cabin' column is dropped along with them in the next cell.

In [None]:
# Remove the columns that are not carried forward as model features.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: running it a second time on an already-trimmed
# frame is a no-op rather than a KeyError.
unused_cols = ['Name', 'Ticket', 'Cabin']
train = train.drop(columns=unused_cols, errors='ignore')
test = test.drop(columns=unused_cols, errors='ignore')

## Target distribution

In [None]:
# Row count for each value of the target column.
train.Survived.value_counts()

In [None]:
# Bar chart of the target classes, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='Survived', data=train, ax=ax)
ax.set_title('Survived count')
plt.show()

## Check other columns effect on Survived

In [None]:
# Column index of the training frame at this point in the notebook (target included).
all_cols = train.columns
all_cols

In [None]:
# Every column except the target is a candidate feature.
feature_cols = train.columns.drop('Survived')
feature_cols

In [None]:
# Preview the training frame again before looking at individual columns.
train.head()

## Continuous columns

In [None]:
# Continuous columns
# cont_cols is reused further down to derive the categorical column list.
cont_cols = ['Age','Fare']
# Summary statistics for the continuous columns.
train[cont_cols].describe()

### Age vs Survived

In [None]:
# Histogram (with KDE overlay) of Age over the whole training set.
fig, ax = plt.subplots(figsize=(16, 6))
sns.histplot(x='Age', data=train, kde=True, ax=ax)
ax.set_title('Age distribution')
plt.show()

The plot above shows that passengers are spread across all age groups.

In [None]:
# Age histogram split by the target, to compare survivors with non-survivors
fig, ax = plt.subplots(figsize=(16, 6))
sns.histplot(x='Age', hue='Survived', data=train, kde=True, ax=ax)
ax.set_title('Age distribution with Survived information')
plt.show()

### Fare vs Survived

In [None]:
# Fare histogram (with KDE overlay), coloured by the target.
fig, ax = plt.subplots(figsize=(16, 6))
sns.histplot(x='Fare', hue='Survived', data=train, kde=True, ax=ax)
ax.set_title('Fare distribution')
plt.show()

## Categorical columns

In [None]:
# Categorical columns: every feature column that is not listed as continuous
cat_cols = []
for col_name in feature_cols:
    if col_name in cont_cols:
        continue
    cat_cols.append(col_name)
cat_cols

### Pclass vs Survived

In [None]:
# Passenger counts per Pclass, split by the target.
fig, ax = plt.subplots(figsize=(14, 6))
sns.countplot(x='Pclass', hue='Survived', data=train, ax=ax)
plt.show()

Pclass = 1 (or 2) has more survivors than Pclass = 3.

### Sex vs Survived

In [None]:
# Passenger counts per Sex, split by the target.
fig, ax = plt.subplots(figsize=(14, 6))
sns.countplot(x='Sex', hue='Survived', data=train, ax=ax)
ax.set_title('Gender counts with Survived information')
plt.show()

Females were given preference over males.

### SibSp vs Survived

In [None]:
# Passenger counts per SibSp value, split by the target.
fig, ax = plt.subplots(figsize=(16, 6))
sns.countplot(x='SibSp', hue='Survived', data=train, ax=ax)
ax.set_title('SibSp counts with Survived information')
plt.show()

### Parch vs Survived

In [None]:
# Passenger counts per Parch value, split by the target.
fig, ax = plt.subplots(figsize=(16, 6))
sns.countplot(x='Parch', hue='Survived', data=train, ax=ax)
ax.set_title('Parch counts with Survived information')
plt.show()

### Embarked vs Survived

In [None]:
# Passenger counts per Embarked value, split by the target.
fig, ax = plt.subplots(figsize=(16, 6))
sns.countplot(x='Embarked', hue='Survived', data=train, ax=ax)
ax.set_title('Embarked counts with Survived information')
plt.show()

Passengers with Embarked = S were given less preference than those with C or Q.

## Handle missing values

In [None]:
# Re-check non-null counts now that the unused columns have been dropped.
train.info()

In [None]:
# Columns handled in this section.
# NOTE(review): this list is not referenced again in the visible notebook; the
# cells below spell the column names out directly.
missing_cols = ['Age', 'Fare', 'Embarked']

### Handle 'Embarked' column missing values

In [None]:
# Replace missing Embarked entries with the placeholder category 'N' in both splits.
for frame in (train, test):
    frame['Embarked'] = frame['Embarked'].fillna('N')

In [None]:
# Check the non-null counts again after filling Embarked.
train.info()

In [None]:
# Row count per Embarked value, including the 'N' placeholder filled in above.
train['Embarked'].value_counts()

### Handle Age and Fare missing values

In [None]:
# Summary statistics for the two numeric columns that still need imputing.
train[['Age','Fare']].describe()

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Fill missing Age/Fare values with the constant -1. The imputer is fitted on
# the training split only and then applied to both splits.
numeric_missing_cols = ['Age', 'Fare']
imputer = SimpleImputer(strategy='constant', fill_value=-1)
imputer.fit(train[numeric_missing_cols])
train[numeric_missing_cols] = imputer.transform(train[numeric_missing_cols])
test[numeric_missing_cols] = imputer.transform(test[numeric_missing_cols])

In [None]:
# Check the training frame's non-null counts after imputation.
train.info()

In [None]:
# Check the test frame's non-null counts after imputation.
test.info()

## Feature engineering

### Create bins for `Age` and `Fare`

In [None]:
# Age histogram after imputation, used to pick the bin edges below.
fig, ax = plt.subplots(figsize=(16, 6))
sns.histplot(x='Age', data=train, kde=True, ax=ax)
plt.show()

In [None]:
# Age buckets (right-closed intervals, pd.cut's default):
# (-2, 0]    => Missing  (catches the -1 written by the constant imputer)
# (0, 15]    => Child
# (15, 30]   => Person (Couldn't think of a good name :)  )
# (30, 45]   => Adult
# (45, 60]   => Senior Adult
# (60, inf]  => Senior citizen
# The top edge is open-ended on purpose: with a finite edge of 100, pd.cut
# would return NaN for any larger age, leaving a missing value in the feature.
age_bins = [-2, 0, 15, 30, 45, 60, np.inf]
age_labels = ['Missing','Child','Person','Adult','Senior Adult', 'Senior citizen']
# The same edges and labels are applied to both splits so the categories line up.
train['Age_binned'] = pd.cut(train['Age'], bins=age_bins, labels=age_labels)
test['Age_binned'] = pd.cut(test['Age'], bins=age_bins, labels=age_labels)

In [None]:
# Row count per age bucket in the training split.
train['Age_binned'].value_counts()

In [None]:
# Training split: rows per age bucket, split by the target.
fig, ax = plt.subplots(figsize=(16, 6))
sns.countplot(x='Age_binned', hue='Survived', data=train, ax=ax)
ax.set_title('Train data: Age bins vs survived')
plt.show()

In [None]:
# Test split: rows per age bucket (no target available here).
fig, ax = plt.subplots(figsize=(16, 6))
sns.countplot(x='Age_binned', data=test, ax=ax)
ax.set_title('Test data: Age bins')
plt.show()

In [None]:
# Fare histogram after imputation, used to pick the bin edges below.
fig, ax = plt.subplots(figsize=(16, 6))
sns.histplot(x='Fare', data=train, kde=True, ax=ax)
ax.set_title('Fare distribution')
plt.show()

In [None]:
# Summary statistics (including quartiles) for Fare.
train['Fare'].describe()

In [None]:
# Bins for Fare (right-closed intervals, pd.cut's default)
# (-2, 0]    => Missing (catches the -1 written by the constant imputer)
# (0, 10]    => Lt25 (Less than 25 %ile)
# (10, 24]   => Lt50 (Less than 50 %ile)
# (24, 33]   => Lt75 (Less than 75 %ile)
# (33, inf]  => Ab75 (Above 75 %ile)
# The top edge is open-ended on purpose: with a finite edge of 800, pd.cut
# would return NaN for any larger fare, leaving a missing value in the feature.
fare_bins = [-2, 0, 10, 24, 33, np.inf]
fare_labels = ['Missing', 'Lt25', 'Lt50', 'Lt75', 'Ab75']
# The same edges and labels are applied to both splits so the categories line up.
train['Fare_binned'] = pd.cut(train['Fare'], bins=fare_bins, labels=fare_labels)
test['Fare_binned'] = pd.cut(test['Fare'], bins=fare_bins, labels=fare_labels)

In [None]:
# Row count per fare bucket in the training split.
train['Fare_binned'].value_counts()

In [None]:
# Training split: rows per fare bucket, split by the target.
fig, ax = plt.subplots(figsize=(16, 6))
sns.countplot(x='Fare_binned', hue='Survived', data=train, ax=ax)
ax.set_title('Train data: Fare bins vs Survived')
plt.show()

In [None]:
# Test split: rows per fare bucket (no target available here).
fig, ax = plt.subplots(figsize=(16, 6))
sns.countplot(x='Fare_binned', data=test, ax=ax)
ax.set_title('Test data: Fare bins')
plt.show()

Let's check our top rows for train data.

In [None]:
# First ten training rows, now including the Age_binned and Fare_binned columns.
train.head(10)

Let's add a new computed column called 'Family', where Family = SibSp + Parch + 1

In [None]:
# Family = SibSp + Parch + 1, added to both splits.
for frame in (train, test):
    frame['Family'] = frame['SibSp'].add(frame['Parch']).add(1)

In [None]:
# First ten training rows, now including the Family column.
train.head(10)

In [None]:
# Columns fed to the models. The raw Age, Fare, SibSp and Parch columns are
# left out; only their derived versions (bins and Family) appear here.
feature_final_cols = ['Pclass', 'Sex', 'Embarked', 'Age_binned', 'Fare_binned', 'Family']
# NOTE(review): target_col is not referenced again in the visible notebook; the
# target is selected below via train.Survived.
target_col = ['Survived']

## Model pipeline

In [None]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Columns routed to the one-hot encoder in the preprocessor below.
# This rebinds cat_cols, replacing the list that was built during the EDA section.
cat_cols = ['Sex', 'Embarked']

In [None]:
# Columns routed to the ordinal encoder in the preprocessor below.
label_cols = ['Pclass', 'Age_binned', 'Fare_binned', 'Family']

In [None]:
# One-hot encode the columns in cat_cols; handle_unknown='ignore' lets
# categories that were not seen during fit pass through without raising.
# The encoder keeps its default output type; dense output is enforced once, on
# the ColumnTransformer below. The former `sparse=False` argument was
# deprecated and later removed from OneHotEncoder, so passing it fails on
# current scikit-learn releases.
cat_transformer = OneHotEncoder(handle_unknown='ignore')
# Integer-encode the columns in label_cols; values unseen during fit become -1.
# NOTE(review): OrdinalEncoder chooses its own category order from the fitted
# values, so the codes for Age_binned / Fare_binned presumably follow sorted
# label text rather than bin order - confirm, and pass `categories=` if the
# bin order should be preserved.
label_transformer = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

preprocessor = ColumnTransformer(
    transformers=[
        ('label', label_transformer, label_cols),
        ('cat', cat_transformer, cat_cols),
    ],
    # 0 means "never stack as sparse": the combined feature matrix is always a
    # dense array, matching what sparse=False on the encoder used to give.
    sparse_threshold=0,
)

In [None]:
# Feature matrix and target vector taken from the training frame.
X = train.loc[:, feature_final_cols]
y = train['Survived']

In [None]:
# Hold out 30% of the rows for validation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=41)
X_train, X_val, y_train, y_val = split_parts

In [None]:
# Print the feature/target shapes of each split, training first.
for features_part, target_part in ((X_train, y_train), (X_val, y_val)):
    print(features_part.shape, target_part.shape)

In [None]:
# Preview the training features before they pass through the preprocessor.
X_train.head()

## Model: LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression preceded by the shared preprocessing step.
model_lr = LogisticRegression(random_state=41)
lr_steps = [('preprocessor', preprocessor), ('model', model_lr)]
model_lr_pipeline = Pipeline(lr_steps)

In [None]:
# Fit the preprocessing step and the classifier on the training split only.
model_lr_pipeline.fit(X_train, y_train)

In [None]:
# Predicted classes for the held-out validation rows.
y_valid_preds = model_lr_pipeline.predict(X_val)

In [None]:
# Validation accuracy of the logistic-regression pipeline.
model_lr_acc = accuracy_score(y_val, y_valid_preds)

In [None]:
# Report the validation accuracy computed above.
print(f'LogisticRegression: Acc={model_lr_acc}')

In [None]:
def prepare_submission(pipeline, test_df=None, feature_cols=None, path='submission.csv'):
    """Predict on the test set and write the predictions to a CSV file.

    Parameters
    ----------
    pipeline : object with a ``predict`` method
        Fitted estimator or pipeline; it is called on ``test_df[feature_cols]``.
    test_df : pandas.DataFrame, optional
        Frame to predict on. Its index is written out as ``PassengerId``.
        Defaults to the notebook-level ``test`` frame.
    feature_cols : list of str, optional
        Columns passed to ``pipeline.predict``. Defaults to the notebook-level
        ``feature_final_cols``.
    path : str, optional
        Destination of the CSV file. Defaults to ``'submission.csv'``; pass a
        different name to keep submissions from several models side by side
        instead of overwriting one another.

    Returns
    -------
    None
        The result is the file written to ``path``, with columns
        ``PassengerId`` and ``Survived`` and no index column.
    """
    # Fall back to the notebook globals only when the caller gives nothing,
    # so the original one-argument call keeps working unchanged.
    if test_df is None:
        test_df = test
    if feature_cols is None:
        feature_cols = feature_final_cols

    y_test_preds = pipeline.predict(test_df[feature_cols])
    output = pd.DataFrame({'PassengerId': test_df.index, 'Survived': y_test_preds})
    output.to_csv(path, index=False)

In [None]:
# Write submission.csv from the logistic-regression pipeline's test predictions.
prepare_submission(model_lr_pipeline)

## Model: RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest preceded by the preprocessing step.
# NOTE(review): this is the same `preprocessor` object that is already a step
# of model_lr_pipeline; presumably fitting this pipeline refits that shared
# instance - confirm, and clone it if the two pipelines must stay independent.
model_rf = RandomForestClassifier(random_state=41)
rf_steps = [('preprocessor', preprocessor), ('model', model_rf)]
model_rf_pipeline = Pipeline(rf_steps)

In [None]:
# Fit the preprocessing step and the random forest on the training split only.
model_rf_pipeline.fit(X_train, y_train)

In [None]:
# Predicted classes for the held-out validation rows.
y_val_preds = model_rf_pipeline.predict(X_val)

In [None]:
# Validation accuracy of the random-forest pipeline, then report it.
model_rf_acc = accuracy_score(y_true=y_val, y_pred=y_val_preds)

print(f'RandomForest: Acc={model_rf_acc}')