## Import Libraries ##

In [None]:
# Data Analysis
import numpy as np 
import pandas as pd 

# Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import missingno as msno
from matplotlib import ticker
from matplotlib.ticker import PercentFormatter

# Warnings
import warnings
warnings.filterwarnings('ignore')

# Input file
import os
# Print the full path of every file found anywhere below /kaggle/input.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)


## Load Data ##

In [None]:
# Read the three competition CSVs in one pass: training data, test data and
# the sample submission template (in that order).
train_df, test_df, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-apr-2021/train.csv',
        '/kaggle/input/tabular-playground-series-apr-2021/test.csv',
        '/kaggle/input/tabular-playground-series-apr-2021/sample_submission.csv',
    )
)

In [None]:
# Report (rows, columns) for both frames.
for shape_label, frame in (
    ('shape of train_df : ', train_df),
    ('shape of test_df : ', test_df),
):
    print(shape_label, frame.shape)

In [None]:
# Preview the leading rows of the training frame; as the cell's last
# expression it is rendered as the cell output.
train_df.head()

In [None]:
# Print DataFrame.info() for the training frame to inspect its columns
# before any feature is added or any missing value is filled.
train_df.info()

## Data Analysis ##

In [None]:
# 2x3 grid comparing the train and test distributions of six features:
# category counts for Sex, Pclass, Family and Embarked; density curves for
# Age and Fare.
plt.style.use('fivethirtyeight')
f, ax = plt.subplots(2, 3, figsize=(18,12))
color_map = ['#FF6347', '#505050']
sns.set_palette(sns.color_palette(color_map))

# Derived feature: Family = SibSp + Parch + 1. It is stored on both frames
# because later cells reference the 'Family' column.
train_df['Family'] = train_df['SibSp'] + train_df['Parch'] + 1
test_df['Family'] = test_df['SibSp'] + test_df['Parch'] + 1


def _counts_by_source(column):
    """Stack the per-category row counts of `column` for train and test.

    Returns a DataFrame with the columns [column, 'count', 'source'], where
    'source' is 'Train' or 'Test'.

    The count column is named explicitly: the labels produced by
    `pd.DataFrame(value_counts()).reset_index()` differ between pandas 1.x
    ('index', <column>) and pandas 2.x (<column>, 'count'), so relying on
    them breaks on one version or the other.
    """
    parts = []
    for source, frame in (('Train', train_df), ('Test', test_df)):
        counts = (
            frame[column]
            .value_counts()
            .rename_axis(column)
            .reset_index(name='count')
        )
        counts['source'] = source
        parts.append(counts)
    # ignore_index avoids duplicated row labels in the stacked frame.
    return pd.concat(parts, ignore_index=True)


def _plot_count_comparison(column, axis, keep_legend=False):
    """Draw grouped train/test count bars for `column` on `axis`.

    Returns the stacked count frame that was plotted.
    """
    combined = _counts_by_source(column)
    # x/y are passed by keyword: recent seaborn releases no longer accept
    # them positionally.
    sns.barplot(x=column, y='count', hue='source', data=combined, ax=axis)
    if not keep_legend:
        axis.legend().remove()
    axis.set_ylabel('')
    axis.set_xlabel(column)
    return combined


def _plot_density_comparison(column, axis):
    """Overlay the train and test kernel density estimates of `column`."""
    # The deprecated `shade=False` argument is dropped; unfilled curves are
    # the default.
    sns.kdeplot(train_df[column], color=color_map[0], ax=axis)
    sns.kdeplot(test_df[column], color=color_map[1], ax=axis)
    axis.set_ylabel('')
    axis.set_xlabel(column)


# Sex (this panel keeps the only legend and carries the figure title)
sex_combine = _plot_count_comparison('Sex', ax[0][0], keep_legend=True)
ax[0][0].set_title('Features Comparison between train and test', color='black', ha='left', fontsize=30, weight='bold')

# Pclass
class_combine = _plot_count_comparison('Pclass', ax[0][1])

# Age
_plot_density_comparison('Age', ax[0][2])

# Family
family_combine = _plot_count_comparison('Family', ax[1][0])

# Embarked
em_combine = _plot_count_comparison('Embarked', ax[1][1])

# Fare
_plot_density_comparison('Fare', ax[1][2])

In [None]:
# Side-by-side Age vs Fare scatter plots of the training rows, split by the
# value of 'Survived' (0 on the left panel, 1 on the right panel).
f, ax = plt.subplots(1, 2, figsize=(18,9))
dead_ax, survived_ax = ax[0], ax[1]

plt.style.use('fivethirtyeight')
colormap=['#FF6347','white']

not_survived_rows = train_df.loc[train_df['Survived'] == 0, :]
survived_rows = train_df.loc[train_df['Survived'] == 1, :]

sns.scatterplot(x='Age', y='Fare', data=not_survived_rows, ax=dead_ax, color='#505050')
sns.scatterplot(x='Age', y='Fare', data=survived_rows, ax=survived_ax, color='#FF6347')

dead_ax.set_title('Correlation between Fare and Age (Dead)', fontsize=20)
survived_ax.set_title('Correlation between Fare and Age (Survived)', fontsize=20)

dead_ax.set_ylabel('Fare', fontsize=15)
dead_ax.set_xlabel('Age', fontsize=15)
survived_ax.set_ylabel('Fare', fontsize=15)
survived_ax.set_xlabel('Age', fontsize=15)

## Checking and Filling NaN Values ##

In [None]:
# Two views of the missing values in train_df: a horizontal bar chart of
# NaN counts per column (left) and a missingno matrix (right).
plt.style.use('fivethirtyeight')
f, ax = plt.subplots(1, 2, figsize=(18, 9))
count_ax, matrix_ax = ax[0], ax[1]

nan_counts = train_df.isnull().sum()
nan_counts.plot.barh(color='#505050', fontsize=15, ax=count_ax)
# Write each count next to its bar, shifted 500 units to the right.
for bar_position, nan_count in enumerate(list(nan_counts)):
    count_ax.text(nan_count+500, bar_position, str(nan_count))
count_ax.set_title('Count of NaN values in each columns', fontsize=20)

msno.matrix(train_df, color=(0.314, 0.314, 0.314), ax=matrix_ax, fontsize=15)
matrix_ax.set_title('Frequency of NaN values in each columns', fontsize=20)
# Move the column labels to the bottom edge and tilt them.
matrix_ax.xaxis.tick_bottom()
column_names = list(train_df.columns)
matrix_ax.set_xticklabels(column_names, rotation=45, ha='right', fontsize=15)

In [None]:
# Sex & Pclass
# For every (Sex, Pclass) group of the training data this cell:
#   * records survivors, deaths, survival rate, mean age and mean fare in
#     the summary table Pclass_Sex_train;
#   * writes the group's running number n into the 'Sex & Pclass' column of
#     BOTH frames (previously the test frame was left at its initial 0);
#   * fills missing Age / Fare in train and test with the group's rounded
#     training mean.
summary_columns = ['Index', 'Survived', 'Dead', 'Survival rate', 'Age', 'Fare']
train_df['Sex & Pclass'] = 0
test_df['Sex & Pclass'] = 0

summary_rows = []
n = 0
for i in list(train_df['Sex'].unique()):
    for j in range(1, 4):
        # Row selectors are built once per group and reused below.
        train_group = (train_df['Sex'] == i) & (train_df['Pclass'] == j)
        test_group = (test_df['Sex'] == i) & (test_df['Pclass'] == j)
        train_mj = train_df.loc[train_group, :]

        # Counting with boolean sums instead of value_counts()[k] avoids a
        # KeyError when a group contains only one outcome (or no rows).
        survived_count = int((train_mj['Survived'] == 1).sum())
        dead_count = int((train_mj['Survived'] == 0).sum())
        group_size = survived_count + dead_count
        survival_rate = round(survived_count / group_size, 2) if group_size else float('nan')

        # Fill values come from the training rows only and are computed
        # before anything is written back to the frames.
        fare_mean = train_mj['Fare'].mean()
        age_fill = round(train_mj['Age'].mean(), 0)
        fare_fill = round(fare_mean, 0)

        summary_rows.append([
            'Sex : {} & Pclass : {}'.format(i, j),
            survived_count, dead_count, survival_rate, age_fill, fare_mean,
        ])

        train_df.loc[train_group, 'Sex & Pclass'] = n
        test_df.loc[test_group, 'Sex & Pclass'] = n
        train_df.loc[train_group & train_df['Age'].isnull(), 'Age'] = age_fill
        test_df.loc[test_group & test_df['Age'].isnull(), 'Age'] = age_fill
        train_df.loc[train_group & train_df['Fare'].isnull(), 'Fare'] = fare_fill
        test_df.loc[test_group & test_df['Fare'].isnull(), 'Fare'] = fare_fill

        n += 1

# Build the summary table in one go instead of growing it row by row.
Pclass_Sex_train = pd.DataFrame(summary_rows, columns=summary_columns)
Pclass_Sex_train

In [None]:
# Working copies used by the encoding and modelling cells. They are taken
# BEFORE 'Age_Band' is added, so that helper column exists only on train_df.
train_co = train_df.copy()
test_co = test_df.copy()
combine = [train_co, test_co]

# Split Age into 4 equal-width intervals and show the mean of 'Survived'
# per interval.
train_df['Age_Band'] = pd.cut(train_df['Age'], 4)
# Group by the column NAME so that 'Age_Band' is used as the key and is not
# also averaged: grouping by the external Series left the categorical
# column among the aggregated columns, which raises TypeError on
# pandas >= 2.0. observed=False keeps every interval in the result.
(
    train_df[['Survived', 'Age_Band']]
    .groupby('Age_Band', as_index=True, observed=False)
    .mean()
    .sort_index(ascending=True)
)

In [None]:
# Integer-encode Sex, Embarked and Age on both working copies.
sex_codes = {'male' : 0, 'female' : 1}
embarked_codes = {'C' : 0, 'Q' : 1, 'S' : 2}
# Right-closed age bins: (-inf, 21.81], (21.81, 43.54], (43.54, 65.27], (65.27, inf)
age_bin_edges = [-np.inf, 21.81, 43.54, 65.27, np.inf]

for data in combine:
    # Sex
    data['Sex'] = data['Sex'].map(sex_codes).astype(int)

    # Embarked: missing ports are treated as 'S' before encoding
    data['Embarked'] = data['Embarked'].fillna('S').map(embarked_codes).astype(int)

    # Age: replace each age by the position (0-3) of the bin it falls into
    data['Age'] = pd.cut(data['Age'], bins=age_bin_edges, labels=False).astype(int)

train_co.head()

In [None]:
# Annotated heatmap of the pairwise correlations between the target and the
# encoded features of the working training copy.
corr_features = ['Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Family', 'Sex & Pclass']
train_corr = train_co[corr_features]
fig = plt.figure(figsize=(18,8))
sns.heatmap(
    train_corr.corr(),
    linewidth=1,
    vmax=1.0,
    square=True,
    linecolor='white',
    annot=True,
    annot_kws={'size':16},
    fmt='.2f',
    cmap='gray',
)
plt.title('Pearson correlation of Feature')

## Machine Learning ##

In [None]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Columns removed from both frames before modelling; the training frame
# additionally loses the target column.
unused_columns = ['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Family', 'Age', 'Fare', 'Sex & Pclass']

X_train = train_co.drop(columns=['Survived'] + unused_columns)
X_target = train_co['Survived']
X_test = test_co.drop(columns=unused_columns)

# Hold out 25% of the training rows for validation (fixed seed).
X_tr, X_vid, y_tr, y_vid = train_test_split(
    X_train,
    X_target,
    test_size=0.25,
    random_state=0,
)

In [None]:
# RandomForestClassifier
# Fit on the training split, then report accuracy on that split and on the
# held-out split (X_vid / y_vid).
# random_state is fixed so the forest, and therefore both scores, are the
# same on every run.
model = RandomForestClassifier(random_state=0)
model.fit(X_tr, y_tr)
score_RFC = model.score(X_tr, y_tr)
prediction_RFC = model.predict(X_vid)
print('Train Accuracy Percentage : {:.2f}'.format(100 * score_RFC))
# accuracy_score is documented as (y_true, y_pred).
print('Test Accuracy Percentage : {:.2f}'.format(100 * metrics.accuracy_score(y_vid, prediction_RFC)))

In [None]:
# LinearSVC
# Fit on the training split, then report accuracy on that split and on the
# held-out split (X_vid / y_vid).
# random_state is fixed because the solver may shuffle the data internally,
# which would otherwise make the scores vary between runs.
model = LinearSVC(random_state=0)
model.fit(X_tr, y_tr)
score_lSVC = model.score(X_tr, y_tr)
prediction_lSVC = model.predict(X_vid)
print('Train Accuracy Percentage : {:.2f}'.format(100 * score_lSVC))
# accuracy_score is documented as (y_true, y_pred).
print('Test Accuracy Percentage : {:.2f}'.format(100 * metrics.accuracy_score(y_vid, prediction_lSVC)))

In [None]:
# DecisionTreeClassifier
# Fit on the training split, then report accuracy on that split and on the
# held-out split (X_vid / y_vid).
# random_state is fixed so the fitted tree, and therefore both scores, are
# the same on every run.
model = DecisionTreeClassifier(random_state=0)
model.fit(X_tr, y_tr)
score_DTC = model.score(X_tr, y_tr)
prediction_DTC = model.predict(X_vid)
print('Train Accuracy Percentage : {:.2f}'.format(100 * score_DTC))
# accuracy_score is documented as (y_true, y_pred).
print('Test Accuracy Percentage : {:.2f}'.format(100 * metrics.accuracy_score(y_vid, prediction_DTC)))

In [None]:
# GaussianNB
# Fit on the training split; report accuracy on that split and on the
# held-out split (X_vid / y_vid).
model = GaussianNB()
model.fit(X_tr, y_tr)
prediction_GNB = model.predict(X_vid)
score_GNB = model.score(X_tr, y_tr)
train_accuracy_pct = 100 * score_GNB
# Accuracy is the share of matching labels, so it does not depend on the
# argument order; the documented (y_true, y_pred) order is used here.
holdout_accuracy_pct = 100 * metrics.accuracy_score(y_vid, prediction_GNB)
print('Train Accuracy Percentage : {:.2f}'.format(train_accuracy_pct))
print('Test Accuracy Percentage : {:.2f}'.format(holdout_accuracy_pct))

In [None]:
# Perceptron
# Fit on the training split, then report accuracy on that split and on the
# held-out split (X_vid / y_vid).
# random_state is fixed because the training data is shuffled between
# epochs; without it the scores change from run to run.
model = Perceptron(random_state=0)
model.fit(X_tr, y_tr)
score_PT = model.score(X_tr, y_tr)
prediction_PT = model.predict(X_vid)
print('Train Accuracy Percentage : {:.2f}'.format(100 * score_PT))
# accuracy_score is documented as (y_true, y_pred).
print('Test Accuracy Percentage : {:.2f}'.format(100 * metrics.accuracy_score(y_vid, prediction_PT)))

In [None]:
# SGDClassifier
# Fit on the training split, then report accuracy on that split and on the
# held-out split (X_vid / y_vid).
# random_state is fixed because the training data is shuffled between
# epochs; without it the scores change from run to run.
model = SGDClassifier(random_state=0)
model.fit(X_tr, y_tr)
score_SGD = model.score(X_tr, y_tr)
prediction_SGD = model.predict(X_vid)
print('Train Accuracy Percentage : {:.2f}'.format(100 * score_SGD))
# accuracy_score is documented as (y_true, y_pred).
print('Test Accuracy Percentage : {:.2f}'.format(100 * metrics.accuracy_score(y_vid, prediction_SGD)))

In [None]:
# LogisticRegression
# Fit on the training split; report accuracy on that split and on the
# held-out split (X_vid / y_vid).
model = LogisticRegression()
model.fit(X_tr, y_tr)
prediction_LR = model.predict(X_vid)
score_LR = model.score(X_tr, y_tr)
train_accuracy_pct = 100 * score_LR
# Accuracy is the share of matching labels, so it does not depend on the
# argument order; the documented (y_true, y_pred) order is used here.
holdout_accuracy_pct = 100 * metrics.accuracy_score(y_vid, prediction_LR)
print('Train Accuracy Percentage : {:.2f}'.format(train_accuracy_pct))
print('Test Accuracy Percentage : {:.2f}'.format(holdout_accuracy_pct))

In [None]:
# KNeighborsClassifier
# Fit on the training split; report accuracy on that split and on the
# held-out split (X_vid / y_vid).
model = KNeighborsClassifier()
model.fit(X_tr, y_tr)
prediction_KNC = model.predict(X_vid)
score_KNC = model.score(X_tr, y_tr)
train_accuracy_pct = 100 * score_KNC
# Accuracy is the share of matching labels, so it does not depend on the
# argument order; the documented (y_true, y_pred) order is used here.
holdout_accuracy_pct = 100 * metrics.accuracy_score(y_vid, prediction_KNC)
print('Train Accuracy Percentage : {:.2f}'.format(train_accuracy_pct))
print('Test Accuracy Percentage : {:.2f}'.format(holdout_accuracy_pct))

In [None]:
# Final model and submission file.
# random_state is fixed so that re-running the notebook writes the same
# predictions.
model = DecisionTreeClassifier(random_state=0)
# NOTE(review): the model is fitted on the 75% training split only, not on
# the full X_train - confirm whether that is intended for the submission.
model.fit(X_tr, y_tr)
prediction = model.predict(X_test)
submission['Survived'] = prediction
# NOTE(review): the output name has no '.csv' extension although CSV content
# is written - confirm the expected file name before changing it.
submission.to_csv('TPS Apr 2021_MJJO_version5', index=False)