In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, KFold
from xgboost import XGBClassifier

## Import and Examine Data

In [None]:
# Read the train and test CSVs (in that order) and preview the training rows.
train, test = map(
    pd.read_csv,
    (
        '/kaggle/input/tabular-playground-series-apr-2021/train.csv',
        '/kaggle/input/tabular-playground-series-apr-2021/test.csv',
    ),
)
train.head()

In [None]:
# First five rows of the test split.
test.head(n=5)

In [None]:
# Number of missing values in each column of the training split.
train.isnull().sum(axis=0)

In [None]:
# Number of missing values in each column of the test split.
test.isnull().sum(axis=0)

In [None]:
# Preview what one-hot encoding 'Embarked' looks like on a throwaway copy,
# leaving `train` itself unchanged.
df_copy = pd.get_dummies(train.copy(), columns=['Embarked'])
df_copy.head(10)

## Preprocessing Data

In [None]:
def fill_na_values(df):
    """Impute missing values and encode categorical columns.

    The work is done on a copy, so the frame passed in is left untouched.

    Steps applied:
      * ``Age``: missing entries are replaced by the column mean.
      * ``Fare``: missing entries are replaced by the column median.
      * ``hasCabin``: new 0/1 column, 1 where ``Cabin`` holds a string.
      * ``Sex``: ``'male'`` is mapped to 0 and ``'female'`` to 1.
      * ``Embarked``: replaced by one-hot ``Embarked_*`` columns.

    The mean and median are computed from ``df`` itself, so every frame
    passed in is filled with its own statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``Age``, ``Fare``, ``Cabin``, ``Sex``
        and ``Embarked``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations above applied.

    Side effects
    ------------
    Prints the per-column null counts and the first rows of the result.
    """
    # Copy first: the caller's frame must not change as a side effect.
    df = df.copy()

    # Assign the filled column back instead of `df[col].fillna(inplace=True)`:
    # the in-place form acts on a temporary column selection, which is
    # deprecated and has no effect on `df` under pandas Copy-on-Write.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    # Missing cabins are read as NaN (a float), so "is a string" doubles as
    # "cabin is known".
    df['hasCabin'] = df['Cabin'].apply(lambda x: 1 if isinstance(x, str) else 0)
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df = pd.get_dummies(df, columns=['Embarked'])
    print(df.isnull().sum())
    print(df.head())

    return df


In [None]:
# Run the same cleaning routine on each split and rebind the names to the
# returned frames. This cell is not re-runnable on its own: the returned
# frames no longer have an 'Embarked' column for get_dummies to encode.
train = fill_na_values(df=train)
test = fill_na_values(df=test)

In [None]:
# First five rows of the training split after preprocessing.
train.head(n=5)

In [None]:
# Columns deliberately left out of the feature list; everything else in
# `train` is kept, in its original column order.
non_feature_cols = {'Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin'}
features = [col for col in train.columns if col not in non_feature_cols]
features

## Data Analysis

In [None]:
# Bar chart of how many training rows fall in each 'Survived' class.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=train, x='Survived', ax=ax)

In [None]:
# Bar chart of the encoded 'Sex' column; the ticks are relabelled so that
# 0 reads 'Male' and 1 reads 'Female', matching the mapping in fill_na_values.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=train, x='Sex', ax=ax)
plt.xticks(ticks=[0, 1], labels=['Male', 'Female'])

In [None]:
# Annotated 2x2 correlation matrix between 'Survived' and the encoded 'Sex'.
fig, ax = plt.subplots(figsize=(8, 8))
sex_survived_corr = train[['Survived', 'Sex']].corr()
sns.heatmap(data=sex_survived_corr, annot=True, alpha=0.6, ax=ax)

In [None]:
# Pairwise correlations between every selected feature and 'Survived'.
fig, ax = plt.subplots(figsize=(18, 14))
feature_corr = train[features + ['Survived']].corr()
sns.heatmap(feature_corr, annot=True, alpha=0.8, ax=ax)

In [None]:
# Bar chart of the 0/1 'hasCabin' flag created during preprocessing.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=train, x='hasCabin', ax=ax)

In [None]:
# One count plot per one-hot 'Embarked' column, side by side on a shared y-axis.
embarked_cols = ['Embarked_C', 'Embarked_Q', 'Embarked_S']
fig, axes = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(16, 8))

for idx, (panel, col) in enumerate(zip(axes, embarked_cols)):
    sns.countplot(ax=panel, x=col, data=train)
    if idx > 0:
        # Only the left-most panel keeps its y-axis label.
        panel.set_ylabel('')

# The middle panel additionally hides its left spine.
axes[1].spines['left'].set_visible(False)

plt.suptitle('Embarked')
sns.despine(top=True, right=True)