![](http://imgbntnews.hankyung.com/bntdata/images/photo/201802/46e751857d79621b9b2c0422b13c57d1.jpg)

***This kernel is written for beginners who are just getting started.***

***It is based on existing kernels and tries to explain them as simply as possible.***

***This kernel can reach the top 28 percent, and I hope it helps beginners a lot.***

# Data Check

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8'
# (the old name was later removed), so use whichever name this install provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
sns.set(font_scale=2.5)

import missingno as msno

#ignore warnings
import warnings
warnings.filterwarnings('ignore') 

%matplotlib inline

from collections import Counter

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve

In [None]:
# Read the competition's train and test CSV files into two DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-apr-2021/train.csv',
        '../input/tabular-playground-series-apr-2021/test.csv',
    )
)

# separate dataset

In [None]:
df_train.head(10)

In [None]:
df_train.describe()
# statistical figures representation of data of train data

In [None]:
df_train.columns
# attribute of each column

In [None]:
# Print, for every training column, the percentage of rows with a missing value.
for column_name in df_train.columns:
    column = df_train[column_name]
    nan_percent = 100 * (column.isnull().sum() / column.shape[0])
    print('column: {:>10}\t Percent of NaN value: {:.2f}%'.format(column_name, nan_percent))
    
    # The process of identifying the missing value of each column

In [None]:
# Nullity matrix of the whole training frame (the full-frame iloc slice was redundant).
msno.matrix(df=df_train, figsize=(8, 8), color=(0.1, 0.6, 0.8))

# msno.matrix creates the same matrix as shown below 
# empty space NULL data 

In [None]:
# Same missing-value overview as a bar chart, one bar per column.
msno.bar(df=df_train, figsize=(8, 8), color=(0.1, 0.6, 0.8))

# It makes with the graph of the bar type.

In [None]:
# Target balance shown two ways: share of each class (pie) and raw counts (bars).
f, ax = plt.subplots(1, 2, figsize=(18, 8))

# Left panel: pie chart built from the value counts; the second wedge is pulled out.
df_train['Survived'].value_counts().plot.pie(explode=[0, 0.1], autopct='%1.1f%%', ax=ax[0], shadow=True)
ax[0].set_title('Pie plot - Survived')
ax[0].set_ylabel('')  # blank out the y-axis label on the pie

# Right panel: count plot. The column is passed as `x=` because recent seaborn
# releases only accept `data` positionally and raise on a positional column name.
sns.countplot(x='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Count plot - Survived')
plt.show()

# The ratio of survival 0 to 1 of the train set is shown graphically.

# EDA

In [None]:
# Number of non-null Survived entries in each passenger class.
df_train.groupby('Pclass')[['Survived']].count()

# Tie with groupby and count how many counts.

In [None]:
# Pclass x Survived contingency table; margins=True adds the totals row and column.
pclass_survived_table = pd.crosstab(index=df_train['Pclass'], columns=df_train['Survived'], margins=True)
pclass_survived_table.style.background_gradient(cmap='Pastel1')

In [None]:
# Mean of Survived per class, sorted from highest to lowest and drawn as bars.
survival_rate_by_pclass = df_train.groupby('Pclass')[['Survived']].mean()
survival_rate_by_pclass.sort_values(by='Survived', ascending=False).plot.bar()

# It means Survival rate

In [None]:
# Passenger count per class (left) and survived/dead split per class (right).
y_position = 1.02  # lifts the right-hand title slightly above the axes
f, ax = plt.subplots(1, 2, figsize=(18, 8))
df_train["Pclass"].value_counts().plot.bar(color=["#CD7F32", "#FFDF00", "#D3D3D3"], ax=ax[0])
ax[0].set_title("Number of passengers By Pclass")
ax[0].set_ylabel("Count")
# `x=` keyword: recent seaborn releases reject a positional column name.
sns.countplot(x="Pclass", hue="Survived", data=df_train, ax=ax[1])
ax[1].set_title("Pclass: Survived vs Dead", y=y_position)
plt.show()
     
# The plots show the number of passengers in each Passenger Class and how survival differs between classes. Third class (economy, I think) had the most passengers on board, and first-class passengers had the highest survival rate.

In [None]:
# Print the maximum, minimum and mean of the Age column.
ages = df_train["Age"]
for template, value in (
    ("the oldest passenger : {:.1f} years", ages.max()),
    ("the youngest passenger : {:.1f} years", ages.min()),
    ("average of passengers age : {:.1f} years", ages.mean()),
):
    print(template.format(value))

In [None]:
# Overlay the Age density of survivors and non-survivors on one axis.
fig, ax = plt.subplots(1, 1, figsize=(9, 5))
for survived_flag in (1, 0):  # same drawing order as the legend entries below
    sns.kdeplot(df_train.loc[df_train["Survived"] == survived_flag, "Age"], ax=ax)
plt.legend(["Survived == 1", "Survived == 0"])
plt.show()

# kdeplot is used to estimate the distribution of data.

In [None]:
# One Age density curve per passenger class, drawn on a shared axis.
fig, ax = plt.subplots(1, 1, figsize=(9, 7))
for pclass in (1, 2, 3):
    sns.kdeplot(df_train.loc[df_train["Pclass"] == pclass, "Age"], ax=ax)
plt.xlabel("Age")
plt.title("Age Distribution within classes")
plt.legend(["1st Class", "2nd Class", "3rd Class"])
plt.show()

# Histograms of the three classes would overlap and hide each other, so KDE curves are used here instead.
# The plot shows the Age distribution within each class.

In [None]:
# Age density inside 1st class, split by survival outcome (0 first, then 1).
fig, ax = plt.subplots(1, 1, figsize=(9, 5))
in_first_class = df_train["Pclass"] == 1
for survived_flag in (0, 1):
    sns.kdeplot(df_train.loc[(df_train["Survived"] == survived_flag) & in_first_class, "Age"], ax=ax)
plt.legend(["Survived == 0", "Survived == 1"])
plt.title("1st Class")
plt.show()

# Age distribution of non-survival people with first class
# Age distribution of survival people with first class

In [None]:
# Age density inside 2nd class, split by survival outcome (0 first, then 1).
fig, ax = plt.subplots(1, 1, figsize=(9, 5))
in_second_class = df_train["Pclass"] == 2
for survived_flag in (0, 1):
    sns.kdeplot(df_train.loc[(df_train["Survived"] == survived_flag) & in_second_class, "Age"], ax=ax)
plt.legend(["Survived == 0", "Survived == 1"])
plt.title("2nd Class")
plt.show()

# Age distribution of non-survival people with second class
# Age distribution of survival people with second class

In [None]:
# Age density inside 3rd class, split by survival outcome (0 first, then 1).
fig, ax = plt.subplots(1, 1, figsize=(9, 5))
in_third_class = df_train["Pclass"] == 3
for survived_flag in (0, 1):
    sns.kdeplot(df_train.loc[(df_train["Survived"] == survived_flag) & in_third_class, "Age"], ax=ax)
plt.legend(["Survived == 0", "Survived == 1"])
plt.title("3rd Class")
plt.show()

# Age distribution of non-survival people with third class
# Age distribution of survival people with third class

In [None]:
# Survival rate among passengers younger than each age limit from 1 to 80.
# The variable name (typo included) is kept so any other reference still resolves.
age_limits = list(range(1, 81))
chage_age_range_survival_ratio = [
    # Mean of the Survived column over passengers below this age limit
    # (same value as sum / len, and NaN when nobody is that young).
    df_train.loc[df_train["Age"] < age_limit, "Survived"].mean()
    for age_limit in age_limits
]

plt.figure(figsize=(7, 7))
# Plot against the real age limits; plotting the bare list put the first
# point (ages below 1) at x = 0, shifting the whole curve by one year.
plt.plot(age_limits, chage_age_range_survival_ratio)
plt.title("Survival rate change depending on range of Age", y=1.02)
plt.ylabel("Survival rate")
plt.xlabel("Range of Age(0-x)")
plt.show()
    
# The younger the age range, the higher the survival rate; as older passengers are included, the survival rate drops.

In [None]:
# Split violin plots of Age by Pclass (left) and by Sex (right), coloured by Survived.
f, ax = plt.subplots(1, 2, figsize=(18, 8))

# x/y are passed by keyword: recent seaborn releases reject positional column names.
sns.violinplot(x="Pclass", y="Age", hue="Survived", data=df_train, scale="count", split=True, ax=ax[0])
ax[0].set_title("Pclass and Age vs Survived")
ax[0].set_yticks(range(0, 110, 10))

sns.violinplot(x="Sex", y="Age", hue="Survived", data=df_train, scale="count", split=True, ax=ax[1])
ax[1].set_title("Sex and Age vs Survived")
ax[1].set_yticks(range(0, 110, 10))

plt.show()

# With Age on the y-axis, survival can be compared across Pclass and across Sex at a glance.
# The result: the better the Pclass, the higher the survival rate, and women survived at a higher rate than men.

In [None]:
# Mean of Survived per embarkation port, highest first, as a bar chart.
f, ax = plt.subplots(1, 1, figsize=(7, 7))
survival_rate_by_port = df_train.groupby("Embarked")[["Survived"]].mean()
survival_rate_by_port.sort_values(by="Survived", ascending=False).plot.bar(ax=ax)

In [None]:
# 2x2 grid of Embarked count plots: overall, by Sex, by Survived and by Pclass.
# The column is passed as `x=` because recent seaborn releases reject a positional column name.
f, ax = plt.subplots(2, 2, figsize=(20, 15))
sns.countplot(x="Embarked", data=df_train, ax=ax[0, 0])
ax[0, 0].set_title("(1) No. of Passengers Boarded")  # title typo "Boared" fixed

sns.countplot(x="Embarked", hue="Sex", data=df_train, ax=ax[0, 1])
ax[0, 1].set_title("(2) Male-Female split for Embarked")

sns.countplot(x="Embarked", hue="Survived", data=df_train, ax=ax[1, 0])
ax[1, 0].set_title("(3) Embarked vs Survived")

sns.countplot(x="Embarked", hue="Pclass", data=df_train, ax=ax[1, 1])
ax[1, 1].set_title("(4) Embarked vs Pclass")

plt.subplots_adjust(wspace=0.4, hspace=0.5)  # spacing between the panels
plt.show()

# Passengers who embarked at C have a high survival rate because many of them were first class and many were women.

In [None]:
# FamilySize = SibSp + Parch + 1, added to both the train and the test frame.
for frame in (df_train, df_test):
    frame["FamilySize"] = frame["SibSp"] + frame["Parch"] + 1

# Create a new feature, "FamilySize".

In [None]:
df_train["FamilySize"].head(5)

In [None]:
# Show the largest and smallest FamilySize found in the training data.
family_sizes = df_train["FamilySize"]
print("Maximum size of Family: ", family_sizes.max())
print("Minimum size of Family: ", family_sizes.min())

In [None]:
# Three panels for FamilySize: passenger counts, counts split by Survived,
# and the mean of Survived per family size.
f, ax = plt.subplots(1, 3, figsize=(40, 10))
# `x=` keyword: recent seaborn releases reject a positional column name.
sns.countplot(x="FamilySize", data=df_train, ax=ax[0])
ax[0].set_title("(1) No. of Passenger Boarded", y=1.02)

sns.countplot(x="FamilySize", hue="Survived", data=df_train, ax=ax[1])
ax[1].set_title("(2) Survived countplot depending of FamilySize")

survival_rate_by_family = df_train[["FamilySize", "Survived"]].groupby(["FamilySize"], as_index=True).mean()
survival_rate_by_family.sort_values(by="Survived", ascending=False).plot.bar(ax=ax[2])
ax[2].set_title("(3) Survived rate depending on FamilySize", y=1.02)

plt.subplots_adjust(wspace=0.2, hspace=0.5)
plt.show()

# The first plot shows the number of passengers for each family size (1 to 11), the second shows survivor and non-survivor counts for each family size, and the third shows the survival rate for each family size.
# Families of four have the highest survival rate.

In [None]:
# Distribution of Fare, with its skewness shown in the legend.
f, ax = plt.subplots(1, 1, figsize=(8, 8))
# "{:.2f}" prints two decimals; the previous "{:2f}" only set a minimum width
# of 2 and therefore printed six decimals.
g = sns.distplot(df_train["Fare"], color="b", label="Skewness: {:.2f}".format(df_train["Fare"].skew()), ax=ax)
g = g.legend(loc="best")

# The skewness tells us how asymmetric the distribution is.

In [None]:
# Log to get rid of the skewness.

# Apply the same log transform to BOTH frames so the model sees Fare on one
# scale at fit and predict time (previously only the training Fare was transformed).
# Fares that are not > 0 -- including NaN, since NaN > 0 is False -- become 0.
# NOTE: this cell is not idempotent; re-running it takes the log a second time.
for frame in (df_train, df_test):
    frame["Fare"] = frame["Fare"].map(lambda fare: np.log(fare) if fare > 0 else 0)

In [None]:
df_train["Fare"].head()

In [None]:
# Distribution of the Fare column as it stands now, with its skewness in the legend.
f, ax = plt.subplots(1, 1, figsize=(8, 8))
# "{:.2f}" prints two decimals; the previous "{:2f}" only set a minimum width of 2.
g = sns.distplot(df_train["Fare"], color="b", label="Skewness: {:.2f}".format(df_train["Fare"].skew()), ax=ax)
g = g.legend(loc="best")

# normal approximation

# Feature Engineering

In [None]:
# First, fill NULL data.

df_train["Age"].isnull().sum()

In [None]:
# Stack the train and test frames row-wise (pd.concat) so statistics can be
# checked on the combined data, then show the combined shape.
df_all = pd.concat(objs=[df_train, df_test], axis=0)
df_all.shape

In [None]:
df_train.shape

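# Of the 891 rows, only two have a missing Embarked value, so they are replaced with the most frequent value. (NOTE: these figures may not match this dataset; compare with the shape printed above.)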

In [None]:
# Replace missing Embarked values with "S".
# The result is assigned back instead of calling fillna(inplace=True) on a column
# selection, because that chained form is not guaranteed to update the frame on
# newer pandas versions. The test frame is filled too, so both frames go through
# the later encoding steps without missing ports.
df_train["Embarked"] = df_train["Embarked"].fillna("S")
df_test["Embarked"] = df_test["Embarked"].fillna("S")

# fillna replaces missing values with the given value.
# S was the most common port in the EDA, so it is used as the replacement.

In [None]:
# Create the Age_Categ column on both frames, initialised to 0.
for frame in (df_train, df_test):
    frame["Age_Categ"] = 0

# create a new feature

In [None]:
def category_age(x):
    """Map an age to a decade bucket.

    Returns 0 for ages below 10, 1 for [10, 20), ... 6 for [60, 70) and 7 for
    70 and above. A missing age (NaN/None) gets its own bucket, 8; previously
    NaN failed every ``<`` comparison and was silently labelled as 70+.
    """
    if pd.isna(x):
        return 8
    # Upper bounds 10, 20, ..., 70 correspond to buckets 0..6.
    for bucket, upper_bound in enumerate(range(10, 80, 10)):
        if x < upper_bound:
            return bucket
    return 7
    
# Make a function for apply use. 

In [None]:
# Run category_age over every Age value and store the result as Age_Categ on both frames.
for frame in (df_train, df_test):
    frame["Age_Categ"] = frame["Age"].map(category_age)

# By using the apply function, the information of the age column categorized is added to train and test set.

In [None]:
# Since categorizing Age, the unnecessary Age column is deleted.

# Remove the Age column from both frames, in place.
for frame in (df_train, df_test):
    frame.drop(columns=["Age"], inplace=True)

In [None]:
df_train["Embarked"].value_counts()

In [None]:
# Encode the port letters as integers using one shared lookup for both frames.
embarked_codes = {"C" : 0, "Q" : 1, "S" : 2}
df_train["Embarked"] = df_train["Embarked"].map(embarked_codes)
df_test["Embarked"] = df_test["Embarked"].map(embarked_codes)

In [None]:
df_train["Sex"].unique()

In [None]:
# Encode Sex as 0 (female) / 1 (male) using one shared lookup for both frames.
sex_codes = {"female" : 0, "male" : 1}
df_train["Sex"] = df_train["Sex"].map(sex_codes)
df_test["Sex"] = df_test["Sex"].map(sex_codes)

In [None]:
# Columns whose pairwise correlations are drawn in the heatmap.
heatmap_columns = ["Survived", "Pclass", "Sex", "Fare", "Embarked", "FamilySize", "Age_Categ"]
heatmap_data = df_train[heatmap_columns]

In [None]:
# Annotated heatmap of the correlation matrix of the selected features.
colormap = plt.cm.PuBu
plt.figure(figsize=(10, 8))
# Title typo fixed: "Person" -> "Pearson" (DataFrame.corr defaults to Pearson).
plt.title("Pearson Correlation of Features", y=1.05, size=15)
sns.heatmap(heatmap_data.astype(float).corr(), linewidths=0.1, vmax=1.0,
            square=True, cmap=colormap, linecolor="white", annot=True, annot_kws={"size": 16})


# Correlation coefficient analysis shows whether there are overlapping features and which features show correlation.

In [None]:
# One-hot encode Embarked on each frame; the new columns are prefixed "Embarked".
# NOTE(review): the frames are encoded independently, so their dummy columns only
# line up if both contain the same set of Embarked values -- confirm.
one_hot_options = dict(columns=["Embarked"], prefix="Embarked")
df_train = pd.get_dummies(df_train, **one_hot_options)
df_test = pd.get_dummies(df_test, **one_hot_options)

In [None]:
# Drop, in place, the columns that are not fed to the models from both frames.
unused_columns = ["PassengerId", "Name", "SibSp", "Parch", "Ticket", "Cabin"]
for frame in (df_train, df_test):
    frame.drop(columns=unused_columns, inplace=True)

# Ensemble

In [None]:
# 10-fold stratified splitter, passed as `cv` to the grid searches and learning curves below.
kfold = StratifiedKFold(n_splits=10)

In [None]:
# Cast the target to int, then separate it from the feature columns.
df_train["Survived"] = df_train["Survived"].astype(int)
Y_train = df_train["Survived"]
X_train = df_train.drop(columns=["Survived"])

In [None]:
# ExtraTrees

# ExtraTrees: grid search scored by accuracy with the shared `kfold` splitter.
ExtC = ExtraTreesClassifier()

# Parameter grid.
# max_features previously included 10. NOTE(review): X_train appears to have
# fewer than 10 columns after the drops above -- confirm. Scikit-learn releases
# that validate this value reject such fits, wasting a third of the grid.
# None ("use every feature") is the valid way to ask for that upper limit.
ex_param_grid = {"max_depth": [None],
                 "max_features": [1, 2, None],
                 "min_samples_split": [2, 3, 10],
                 "min_samples_leaf": [1, 3, 10],
                 "bootstrap": [False],
                 "n_estimators": [100, 300],
                 "criterion": ["gini"]}

gsExtC = GridSearchCV(ExtC, param_grid=ex_param_grid, cv=kfold, scoring="accuracy",
                      n_jobs=4, verbose=1)

gsExtC.fit(X_train, Y_train)
ExtC_best = gsExtC.best_estimator_  # estimator refit with the best parameter combination

gsExtC.best_score_

In [None]:
# Grid Search Optimization for Five Models
    
# LightGBM
# LightGBM: grid search over tree shape, ensemble size and learning rate,
# scored by accuracy with the shared `kfold` splitter.
lgbm_param_grid = dict(
    max_depth=[40, 50, 60],
    min_child_samples=[10, 20],
    num_leaves=[20, 30],
    n_estimators=[500, 1000],
    learning_rate=[0.01, 0.1, 0.2, 0.3],
)

LGBM = LGBMClassifier()
gsLGBMC = GridSearchCV(
    estimator=LGBM,
    param_grid=lgbm_param_grid,
    scoring="accuracy",
    cv=kfold,
    n_jobs=4,
    verbose=1,
)
gsLGBMC.fit(X_train, Y_train)
lgbm_best = gsLGBMC.best_estimator_  # estimator refit with the best parameter combination

gsLGBMC.best_score_

In [None]:
# XGBoost

# XGBoost: grid search scored by accuracy with the shared `kfold` splitter.
XGB = XGBClassifier()
xgbc_param_grid = {
    # `silent` was retired by XGBoost in favour of `verbosity`; 0 keeps training quiet.
    'verbosity': [0],
    'max_depth': [6, 8],
    'min_child_weight': [3, 5],
    'gamma': [0, 1, 2],
    'n_estimators': [100, 300]}

gsXGBC = GridSearchCV(XGB, param_grid=xgbc_param_grid, cv=kfold, scoring="accuracy", n_jobs=4, verbose=1)

gsXGBC.fit(X_train, Y_train)

XGBC_best = gsXGBC.best_estimator_  # estimator refit with the best parameter combination

# Best cross-validated accuracy
gsXGBC.best_score_

In [None]:
# RandomForestClassifier
# Random forest: grid search scored by accuracy with the shared `kfold` splitter.
RFC = RandomForestClassifier()

# Parameter grid.
# max_features previously included 10. NOTE(review): X_train appears to have
# fewer than 10 columns after the drops above -- confirm. Scikit-learn releases
# that validate this value reject such fits, wasting half of the grid.
# None ("use every feature") is the valid way to ask for that upper limit.
rf_param_grid = {"max_depth": [None],
                 "max_features": [3, None],
                 "min_samples_split": [3, 10],
                 "min_samples_leaf": [2, 10],
                 "n_estimators": [100, 300],
                 "criterion": ["gini"]}

gsRFC = GridSearchCV(RFC, param_grid=rf_param_grid, cv=kfold, scoring="accuracy", n_jobs=4,
                     verbose=1)

gsRFC.fit(X_train, Y_train)
RFC_best = gsRFC.best_estimator_  # estimator refit with the best parameter combination

gsRFC.best_score_

In [None]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot training and cross-validation scores against training-set size.

    Opens a new figure titled ``title``, calls ``learning_curve`` with the given
    estimator, data, ``cv``, ``n_jobs`` and ``train_sizes``, and draws the mean
    score of each curve with a shaded band of +/- one standard deviation.
    ``ylim``, when given, is unpacked into ``plt.ylim``.

    Returns the ``matplotlib.pyplot`` module.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # (mean across folds, std across folds, colour, legend label) for each curve
    curves = [
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1), "r", "Training score"),
        (np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1), "g", "Cross-validation score"),
    ]
    plt.grid()

    # Shaded +/- one std bands first, then the mean lines.
    for centre, spread, colour, _ in curves:
        plt.fill_between(sizes, centre - spread, centre + spread, alpha=0.1, color=colour)
    for centre, _, colour, label in curves:
        plt.plot(sizes, centre, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt


# One learning-curve figure per tuned model (title typo "mearning" fixed).
g = plot_learning_curve(gsExtC.best_estimator_, "ExtraTrees learning curves", X_train, Y_train, cv=kfold)
g = plot_learning_curve(gsLGBMC.best_estimator_, "LGBM learning curves", X_train, Y_train, cv=kfold)
g = plot_learning_curve(gsXGBC.best_estimator_, "XGB learning curves", X_train, Y_train, cv=kfold)
g = plot_learning_curve(gsRFC.best_estimator_, "RF learning curves", X_train, Y_train, cv=kfold)

In [None]:
# Soft-voting ensemble of the four tuned models, fitted on the full training data.
tuned_models = [
    ("rfc", RFC_best),
    ("extc", ExtC_best),
    ("lgbm", lgbm_best),
    ("xgb", XGBC_best),
]
votingC = VotingClassifier(estimators=tuned_models, voting="soft", n_jobs=4).fit(X_train, Y_train)

# Submission

In [None]:
# Load the sample submission file; its "Survived" column is overwritten with the predictions below.
submission = pd.read_csv("../input/tabular-playground-series-apr-2021/sample_submission.csv")

In [None]:
# Fill missing test fares with a NUMBER. The previous code filled with the string
# "35.6271", which turns Fare into an object column mixing floats and strings.
# Assigning the result back also avoids relying on chained fillna(inplace=True).
# NOTE(review): 35.6271 looks like a raw-scale fare; confirm it matches the scale
# the training Fare column is on at this point.
df_test["Fare"] = df_test["Fare"].fillna(35.6271)
X_test = df_test.values

In [None]:
# Predict a label for every test row with the fitted voting ensemble.
prediction = votingC.predict(X_test)

In [None]:
# Store the predictions in the submission frame's "Survived" column.
submission["Survived"] = prediction

In [None]:
# Write the submission to CSV; index=False leaves the DataFrame index out of the file.
submission.to_csv("./The_first_submission.csv", index = False)