In [None]:
import numpy as np
import pandas as pd
import plotly.express as ex
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots as mk_sp
import seaborn as sns
import matplotlib.pyplot as plt

import plotly.offline as pyo
pyo.init_notebook_mode()

import warnings
warnings.filterwarnings("ignore")

plt.rc('figure', figsize=(18,9))

<h1 style="background-color:azure; text-align:center; font-size:300%">1. Loading Data...</h1>

In [None]:
# Read the competition train and test tables from the Kaggle input directory.
train, test = (
    pd.read_csv('../input/tabular-playground-series-apr-2021/train.csv'),
    pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv'),
)

# Recode the target as readable labels: a value equal to 1 becomes "survived",
# anything else becomes "dead".
is_survivor = train["Survived"].isin([1])
train["Survived"] = np.where(is_survivor, "survived", "dead")

<h2 style="background-color:azure; text-align:center; font-size:300%">2. Exploratory Data Analysis</h2>

In [None]:
# Fixed ordering of the two target labels; passed as `hue_order` to the seaborn
# histograms in the cells below so that the colours stay consistent across plots.
labels=["survived", "dead"]

In [None]:
train.head()

In [None]:
test.head()

In [None]:
# Report each frame's dimensions as "rows x columns".
train_rows, train_cols = train.shape
test_rows, test_cols = test.shape
print("shape of train data: {} x {}".format(train_rows, train_cols))
print("shape of test data: {} x {}".format(test_rows, test_cols))

In [None]:
print("Summary of train data\n")
# DataFrame.info() writes its report to stdout itself and returns None, so wrapping
# it in print() only appended a stray "None" line after the summary.
train.info()

In [None]:
print("Summary of test data\n")
# DataFrame.info() writes its report to stdout itself and returns None, so wrapping
# it in print() only appended a stray "None" line after the summary.
test.info()

In [None]:
print("Count of null values in train data:\n")
# Per-column number of missing values.
train_null_counts=train.isnull().sum()
print(train_null_counts)

# The sum of the per-column counts is the number of missing *cells*; a row with two
# missing fields is counted twice, so it must not be reported as a row count.
print("\n\nTotal NaN values: {}".format(int(train_null_counts.sum())))
print("Rows with at least one NaN: {}".format(int(train.isnull().any(axis=1).sum())))

In [None]:
print("Count of null values in test data:\n")
# Per-column number of missing values.
test_null_counts=test.isnull().sum()
print(test_null_counts)


# The sum of the per-column counts is the number of missing *cells*; a row with two
# missing fields is counted twice, so it must not be reported as a row count.
print("\n\nTotal NaN values: {}".format(int(test_null_counts.sum())))
print("Rows with at least one NaN: {}".format(int(test.isnull().any(axis=1).sum())))

In [None]:
print("Count of unique values in train data:\n")
train[['SibSp', 'Parch', 'Embarked', 'Pclass']].nunique()

In [None]:
print("Count of unique values in test data:\n")
test[['SibSp', 'Parch', 'Embarked', 'Pclass']].nunique()

In [None]:
print("Train data:\n")
for col in ['SibSp', 'Parch']:
    print("Unique value in {} is {}".format(col, sorted(train[col].unique())))

In [None]:
print("Test data:\n")
for col in ['SibSp', 'Parch']:
    print("Unique value in {} is {}".format(col, sorted(test[col].unique())))

In [None]:
# Missing cabins become 'X', then every cabin is reduced to its first character.
# The column is re-assigned instead of calling fillna(inplace=True) on the selected
# column: the chained in-place form acts on an intermediate object and does not
# update the frame when pandas copy-on-write is active.
train['Cabin']=train['Cabin'].fillna('X').str[0]

test['Cabin']=test['Cabin'].fillna('X').str[0]

print("Train: Unique elements in Cabin: ", sorted(train['Cabin'].unique()))

print("\n Train: Count of unique elements in Cabin:", train['Cabin'].value_counts())


print("Test: Unique elements in Cabin: ", sorted(test['Cabin'].unique()))

print("\n Test: Count of unique elements in Cabin:", test['Cabin'].value_counts())

<span style='background:#5CB3FF; font-size:150%'>**Observations**</span>
* Train
    * Shape: 100000 x 12
    * Dtypes: float64(2), int64(5), object(5), memory usage: 9.2+ MB
    * NaN values (total missing cells): 76165
    * Sibsp, Parch, Pclass are ordinal variables.
    * Embarked, Sex, Name are nominal variables.
    * Fare, Age are numerical variables.
    * Cabin has most number of Nan values: 67866
    * Survived is our target variable with values 0(No) & 1(Yes).
* Test
    * Shape: 100000 x 11
    * Dtypes: float64(2), int64(4), object(5) memory usage: 8.4+ MB
    * NaN values (total missing cells): 79909
    * Sibsp, Parch, Pclass are ordinal variables.
    * Embarked, Sex, Name are nominal variables.
    * Fare, Age are numerical variables.
    * Cabin has most number of Nan values: 70831
* The train and test sets have the same categories for SibSp and for Parch, respectively.


<h3 style="background-color:azure; text-align:center; font-size:200%">2.1. Target variable</h3>

In [None]:
fig, ax=plt.subplots(ncols=2, facecolor='#f6f5f5')
ax[0].pie(train.groupby(['Survived'])['Survived'].count(), explode=(0, 0.1), labels=['dead','survived'], autopct='%1.1f%%', shadow=True)
sns.histplot(train, x="Survived", ax=ax[1])
ax[0].set_title('Survived')
ax[1].set_title('Survived')
ax[1].set_xticks([0,1])
fig.suptitle('Distribution of survived & dead', fontsize=20)
fig.show()

<span style='background:#5CB3FF; font-size:150%'>**Observations**</span>
* 42.8% of total people were saved.

<h3 style="background-color:azure; text-align:center; font-size:200%">2.2. Continuous variable</h3>

In [None]:
# Remove, in place, every row that still has a missing value in any column of either frame.
# Cabin was already filled with 'X' in an earlier cell, so the rows dropped here are the
# ones missing some other field. All of the plots and tables that follow in this section
# therefore describe only the complete rows.
train.dropna(inplace=True)
test.dropna(inplace=True)

In [None]:
print("Distribution of train data: \n")
train.describe()

In [None]:
print("Distribution of test data: \n")
test.describe()

In [None]:
# Tag each frame with its origin so the combined frame can be split by hue in plots.
train['species']='train'
test['species']='test'
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
# and ignore_index=True gives the result a fresh 0..n-1 index.
df=pd.concat([train, test], ignore_index=True)

<span style='background:orange; font-size:150%'>**Age**</span>

In [None]:
fig, ax=plt.subplots(facecolor='#f6f5f5', figsize=(9,5))
sns.histplot(df, x="Age", hue="species", ax=ax, element='poly')
fig.suptitle('Distribution of train & test set of Age', fontsize=20)
fig.show()

In [None]:
# Bucket ages into 5-year bands with right-closed intervals: (-inf, 5] -> "0-5",
# (5, 10] -> "5-10", ..., (80, 85] -> "80-85", (85, inf) -> "85+".
# This replaces the hand-written elif ladder (which also had a stray `+=` in the
# "70-75" branch) with one vectorized pd.cut call producing the same labels.
age_edges=[-np.inf]+list(range(5, 90, 5))+[np.inf]
age_labels=["{}-{}".format(low, low+5) for low in range(0, 85, 5)]+["85+"]
# Kept as a plain list of strings named `lst` because the next cell assigns it to
# train['agecount']. A missing age would come out as the string 'nan' instead of
# being silently counted as "85+"; rows with missing values were dropped earlier.
lst=pd.cut(train['Age'], bins=age_edges, labels=age_labels).astype(str).tolist()

In [None]:
train['agecount']=lst

In [None]:
fig, ax=plt.subplots(nrows=2, ncols=2, facecolor='#f6f5f5', figsize=(25, 15))
sns.histplot(train, x="Age", bins=16, ax=ax[0][0], hue_order=labels)
sns.histplot(train, x="Age", hue="Survived", bins=16, ax=ax[0][1], hue_order=labels)
sns.histplot(train[train['Sex']=='male'], x="Age", hue="Survived", bins=16, ax=ax[1][0], hue_order=labels)
sns.histplot(train[train['Sex']=='female'], x="Age", hue="Survived", bins=16, ax=ax[1][1], hue_order=labels)
ax[0][0].set_xticks(range(0,85,5))
ax[0][1].set_xticks(range(0,85,5))
ax[1][0].set_xticks(range(0,85,5))
ax[1][1].set_xticks(range(0,85,5))
ax[0][0].set_title("Distribution of population")
ax[0][1].set_title('Survival')
ax[1][0].set_title('Survival of male')
ax[1][1].set_title('Survival of female')
fig.suptitle('Distribution of population based on age', fontsize=20)
fig.show()

In [None]:
Age=['0-5','5-10','10-15','15-20','20-25','25-30','30-35','35-40','40-45','45-50','50-55','55-60','60-65','65-70','70-75','75-80','80-85','85+','All']
pd.crosstab(train.Survived,train.agecount,margins=True)[Age].style.background_gradient(cmap='summer_r')

In [None]:
# Column-normalised survival percentages per age band.
# Styler.set_precision was removed in pandas 2.0; an explicit format string renders the same two decimals.
(pd.crosstab(train.Survived,train.agecount,normalize='columns', margins=True)[Age]*100).style.background_gradient(cmap='summer_r').format("{:.2f}")

In [None]:
pd.crosstab([train.Sex,train.Survived],train.agecount,margins=True)[Age].style.background_gradient(cmap='summer_r')

In [None]:
# Column order for the male table (no '85+' column is selected here).
Age=['0-5','5-10','10-15','15-20','20-25','25-30','30-35','35-40','40-45','45-50','50-55','55-60','60-65','65-70','70-75','75-80','80-85','All']
# Styler.set_precision was removed in pandas 2.0; an explicit format string renders the same two decimals.
(pd.crosstab([train[train['Sex']=='male'].Sex,train.Survived],train.agecount,normalize='columns',margins=True)[Age].round(4)*100).style.background_gradient(cmap='summer_r').format("{:.2f}")

In [None]:
# Column order for the female table, including the open-ended '85+' band.
Age=['0-5','5-10','10-15','15-20','20-25','25-30','30-35','35-40','40-45','45-50','50-55','55-60','60-65','65-70','70-75','75-80','80-85','85+','All']
# Styler.set_precision was removed in pandas 2.0; an explicit format string renders the same two decimals.
(pd.crosstab([train[train['Sex']=='female'].Sex,train.Survived],train.agecount,normalize='columns',margins=True)[Age].round(4)*100).style.background_gradient(cmap='summer_r').format("{:.2f}")

In [None]:
# Passenger counts per age band for males: all, survivors, and dead.
lst=train[train['Sex']=='male'].groupby('agecount')['agecount'].count()
# Survivor/dead counts are re-indexed onto the full list of bands (0 where a band has
# no such passengers). Dividing by position, as before, pairs the wrong bands with each
# other as soon as one band is missing from the survivor or dead counts.
lst1=train[(train["Survived"]=="survived") & (train['Sex']=='male')].groupby('agecount')['agecount'].count().reindex(lst.index, fill_value=0)
lst2=train[(train["Survived"]=="dead") & (train['Sex']=='male')].groupby('agecount')['agecount'].count().reindex(lst.index, fill_value=0)
for group in lst.index:
    print("% Survival of male for age group  {:s} is : {:.2f}%".format(group, 100.0*lst1.loc[group]/lst.loc[group]))

print("\n\n")

for group in lst.index:
    print("% Dead of male for age group  {:s} is : {:.2f}%".format(group, 100.0*lst2.loc[group]/lst.loc[group]))

In [None]:
# Passenger counts per age band for females: all, survivors, and dead.
lst=train[train['Sex']=='female'].groupby('agecount')['agecount'].count()
# Survivor/dead counts are re-indexed onto the full list of bands (0 where a band has
# no such passengers). This replaces the positional division and the hand-written
# special case for '85+', which only worked if that was the single band without deaths.
lst1=train[(train["Survived"]=="survived") & (train['Sex']=='female')].groupby('agecount')['agecount'].count().reindex(lst.index, fill_value=0)
lst2=train[(train["Survived"]=="dead") & (train['Sex']=='female')].groupby('agecount')['agecount'].count().reindex(lst.index, fill_value=0)
for group in lst.index:
    print("% Survival of female for age group  {:s} is : {:.2f}%".format(group, 100.0*lst1.loc[group]/lst.loc[group]))

print("\n\n")

for group in lst.index:
    print("% Dead of female for age group  {:s} is : {:.2f}%".format(group, 100.0*lst2.loc[group]/lst.loc[group]))

In [None]:
df['age_log']=np.log(1+df['Age'])

In [None]:
# Train vs test distribution of the log-transformed age (df['age_log']).
fig, ax=plt.subplots(facecolor='#f6f5f5', figsize=(9,5))
sns.histplot(df, x="age_log", hue="species", ax=ax, element='poly')
# The title now names the plotted quantity; it previously repeated the raw-Age title.
fig.suptitle('Distribution of train & test set of Age log', fontsize=20)
fig.show()

<span style='background:orange; font-size:150%'>**Fare**</span>

In [None]:
fig, ax=plt.subplots(facecolor='#f6f5f5', figsize=(25,9))
sns.histplot(df, x="Fare", hue="species", bins=25, ax=ax)
ax.set_xticks(range(0,750,25))
fig.suptitle('Distribution of Fare in train & test set', fontsize=20)
fig.show()

In [None]:
fig, ax=plt.subplots(nrows=2, ncols=2, facecolor='#f6f5f5', figsize=(30,25))
sns.histplot(train,x="Fare", ax=ax[0][0], bins=25, hue_order=labels)
sns.histplot(train,x="Fare",hue="Survived", ax=ax[0][1], bins=25, hue_order=labels)
sns.histplot(train[train["Sex"]=="male"],x="Fare",hue="Survived",ax=ax[1][0], bins=25, hue_order=labels)
sns.histplot(train[train["Sex"]=="female"],x="Fare",hue="Survived",ax=ax[1][1], bins=25, hue_order=labels)
ax[0][0].set_title("Distribution of population")
ax[0][1].set_title("Survival")
ax[1][0].set_title("Survival of male")
ax[1][1].set_title("Survival of female")
ax[0][0].set_xticks(range(0,750,30))
ax[0][1].set_xticks(range(0,750,30))
ax[1][0].set_xticks(range(0,750,30))
ax[1][1].set_xticks(range(0,750,30))
fig.suptitle('Distribution of population based on fare', fontsize=20)
fig.show()

In [None]:
train['fare']=pd.qcut(train['Fare'], 3, labels=['low','med','high'])

In [None]:
pd.crosstab(train.Survived,train.fare,margins=True).style.background_gradient(cmap='summer_r')

In [None]:
# Column-normalised survival percentages per fare tier.
# Styler.set_precision was removed in pandas 2.0; an explicit format string renders the same two decimals.
(pd.crosstab(train.Survived,train.fare,normalize='columns',margins=True).round(4)*100).style.background_gradient(cmap='summer_r').format("{:.2f}")

In [None]:
pd.crosstab([train.Sex, train.Survived],train.fare,margins=True).style.background_gradient(cmap='summer_r')

In [None]:
# Male passengers only: column-normalised survival percentages per fare tier.
# Styler.set_precision was removed in pandas 2.0; an explicit format string renders the same two decimals.
(pd.crosstab([train[train['Sex']=='male'].Sex, train.Survived],train.fare,normalize='columns',margins=True).round(4)*100).style.background_gradient(cmap='summer_r').format("{:.2f}")

In [None]:
# Female passengers only: column-normalised survival percentages per fare tier.
# Styler.set_precision was removed in pandas 2.0; an explicit format string renders the same two decimals.
(pd.crosstab([train[train['Sex']=='female'].Sex, train.Survived],train.fare,normalize='columns',margins=True).round(4)*100).style.background_gradient(cmap='summer_r').format("{:.2f}")

In [None]:
df['fare_log']=np.log(1+df['Fare'])

In [None]:
fig, ax=plt.subplots(facecolor='#f6f5f5', figsize=(9,5))
sns.histplot(df, x="fare_log", hue="species", ax=ax, element='poly')
fig.suptitle('Distribution of train & test set of Fare log', fontsize=20)
fig.show()

In [None]:
train["related"]=train['SibSp']+train['Parch']

<span style='background:#5CB3FF; font-size:150%'>**Observations**</span>
* Train
    * Age- min: 0.08 and max: 87
    * Sibling/spouse- min : 0 and max: 8
    * Parent/children- min : 0 and max: 9
    * Fare- min: 0.68 and max: 744.66
* Test
    * Age- min: 0.08 and max: 80
    * Sibling/spouse- min : 0 and max: 8
    * Parent/children- min : 0 and max: 9
    * Fare- min: 0.05 and max: 680.70
* Distribution of age in train & test set is different.
* Train set has most people between 25 to 65 age group.
* Test set has most people between 15 to 40 age group.
* Survival of 30+ female and 10-15 year female is most among females.
* Survival of 30+ male and 0-5 year male child is most among male.
* Survival of female > 50% for all age groups.
* Survival of male < 30% for all age groups.
* Distribution of fare in train & test set is alike.
* Fare has outliers, so for better understanding we either break fare into three categories (low, medium and high price) or apply a log transformation.
* Higher the fare higher is the chance of survival. 
* Fare increases with increase in family size.

<h3 style="background-color:azure; text-align:center; font-size:200%">2.3. Categorical variable</h3>

<span style='background:orange; font-size:150%'>**Pclass**</span>

In [None]:
fig, ax=plt.subplots(facecolor='#f6f5f5', figsize=(9,5))
sns.histplot(df, x="Pclass", hue="species", bins=5, ax=ax)
fig.suptitle('Distribution of train & test set of Pclass', fontsize=20)
fig.show()

In [None]:
fig, ax=plt.subplots(nrows=3, ncols=2, facecolor='#f6f5f5', figsize=(25, 20))
ax[0][0].pie(train.groupby(['Pclass'])['Pclass'].count(), explode=(0.1, 0.1, 0.1), labels=['Class 1','Class 2', 'Class 3'], autopct='%1.1f%%', shadow=True)
sns.histplot(train,x="Pclass", ax=ax[0][1], hue_order=labels)
sns.histplot(train,x="Pclass",hue="Survived", ax=ax[1][0], hue_order=labels)
sns.histplot(train[train["Sex"]=="male"],x="Pclass",hue="Survived",ax=ax[2][0], hue_order=labels)
sns.histplot(train[train["Sex"]=="female"],x="Pclass",hue="Survived",ax=ax[2][1], hue_order=labels)
ax[0][0].set_title("Distribution of population")
ax[0][1].set_title("Distribution of population")
ax[1][0].set_title("Survival")
ax[2][0].set_title("Survival of male")
ax[2][1].set_title("Survival of female")
fig.suptitle('Distribution of population based on Pclass',fontsize=20)
ax[1][1].remove()
fig.show()

In [None]:
pd.crosstab(train.Survived,train.Pclass,margins=True).style.background_gradient(cmap='summer_r')

In [None]:
# Column-normalised survival percentages per passenger class.
# Styler.set_precision was removed in pandas 2.0; an explicit format string renders the same two decimals.
(pd.crosstab(train.Survived,train.Pclass,normalize='columns',margins=True).round(4)*100).style.background_gradient(cmap='summer_r').format("{:.2f}")

In [None]:
pd.crosstab([train.Sex, train.Survived],train.Pclass,margins=True).style.background_gradient(cmap='summer_r')

In [None]:
# Male passengers only: column-normalised survival percentages per passenger class.
# Styler.set_precision was removed in pandas 2.0; an explicit format string renders the same two decimals.
(pd.crosstab([train[train['Sex']=='male'].Sex, train.Survived],train.Pclass,normalize='columns',margins=True).round(4)*100).style.background_gradient(cmap='summer_r').format("{:.2f}")

In [None]:
# Female passengers only: column-normalised survival percentages per passenger class.
# Styler.set_precision was removed in pandas 2.0; an explicit format string renders the same two decimals.
(pd.crosstab([train[train['Sex']=='female'].Sex, train.Survived],train.Pclass,normalize='columns',margins=True).round(4)*100).style.background_gradient(cmap='summer_r').format("{:.2f}")

<span style='background:orange; font-size:150%'>**Sibling/Spouse**</span>

In [None]:
fig, ax=plt.subplots(facecolor='#f6f5f5', figsize=(9,5))
sns.histplot(df, x="SibSp", hue="species", bins=5, ax=ax)
fig.suptitle('Distribution of train & test set of Sibling/Spouse', fontsize=20)
fig.show()

In [None]:
fig, ax=plt.subplots(nrows=3, ncols=2, facecolor='#f6f5f5', figsize=(18, 30))
ax[0][0].pie(train.groupby(['SibSp'])['SibSp'].count(), explode=(0.1,0.1,0.1,0.1,0.4,0.5,0.6), labels=['0', '1', '2', '3', '4', '5', '8'], autopct='%1.1f%%', shadow=False)
sns.histplot(train,x="SibSp", ax=ax[0][1], hue_order=labels)
sns.histplot(train,x="SibSp",hue="Survived", ax=ax[1][0], hue_order=labels)
sns.histplot(train[train["Sex"]=="male"],x="SibSp",hue="Survived",ax=ax[2][0], hue_order=labels)
sns.histplot(train[train["Sex"]=="female"],x="SibSp",hue="Survived",ax=ax[2][1], hue_order=labels)
ax[0][0].set_title("Distribution of population")
ax[0][1].set_title("Distribution of population")
ax[1][0].set_title("Survival")
ax[2][0].set_title("Survival of male")
ax[2][1].set_title("Survival of female")
fig.suptitle('Distribution of population based on Sibling/Spouse',fontsize=20)
ax[1][1].remove()
fig.show()

In [None]:
train['sibsp']=train['SibSp'].apply(lambda x:'low' if x<1 else 'high')
df['sibsp']=df['SibSp'].apply(lambda x:'low' if x<1 else 'high')

In [None]:
fig, ax=plt.subplots(facecolor='#f6f5f5', figsize=(9,5))
sns.histplot(df, x="sibsp", hue="species", ax=ax)
fig.suptitle('Distribution of train & test set of Sibling/Spouse', fontsize=20)
fig.show()

In [None]:
# Overview of the binned sibling/spouse feature: share pie, overall histogram, and
# survival split for everyone, for males and for females.
fig, ax=plt.subplots(nrows=3, ncols=2, facecolor='#f6f5f5', figsize=(18, 15))
# groupby returns its keys in sorted order ('high' before 'low'), so the wedge labels
# are taken from the counts' own index. The previous hard-coded ['low','high'] put
# each label on the other group's wedge.
sibsp_counts=train.groupby(['sibsp'])['sibsp'].count()
ax[0][0].pie(sibsp_counts, explode=[0.1]*len(sibsp_counts), labels=sibsp_counts.index, autopct='%1.1f%%', shadow=False)
sns.histplot(train,x="sibsp", ax=ax[0][1])
sns.histplot(train,x="sibsp",hue="Survived", ax=ax[1][0], hue_order=labels)
sns.histplot(train[train["Sex"]=="male"],x="sibsp",hue="Survived",ax=ax[2][0], hue_order=labels)
sns.histplot(train[train["Sex"]=="female"],x="sibsp",hue="Survived",ax=ax[2][1], hue_order=labels)
ax[0][0].set_title("Distribution of population")
ax[0][1].set_title("Distribution of population")
ax[1][0].set_title("Survival")
ax[2][0].set_title("Survival of male")
ax[2][1].set_title("Survival of female")
fig.suptitle('Distribution of population based on Sibling/Spouse',fontsize=20)
# The middle-right panel is unused.
ax[1][1].remove()
fig.show()

In [None]:
pd.crosstab(train.Survived,train.sibsp,margins=True)[['low','high']].style.background_gradient(cmap='summer_r')

In [None]:
# Column-normalised survival percentages for low/high sibling-spouse counts.
# Styler.set_precision was removed in pandas 2.0; an explicit format string renders the same two decimals.
(pd.crosstab(train.Survived,train.sibsp,normalize='columns',margins=True)*100)[['low','high']].style.background_gradient(cmap='summer_r').format("{:.2f}")

In [None]:
pd.crosstab([train.Sex,train.Survived],train.sibsp,margins=True)[['low','high']].style.background_gradient(cmap='summer_r')

In [None]:
# Male passengers only: column-normalised survival percentages for low/high sibling-spouse counts.
# Styler.set_precision was removed in pandas 2.0; an explicit format string renders the same two decimals.
(pd.crosstab([train[train['Sex']=='male'].Sex,train.Survived],train.sibsp,normalize='columns',margins=True)*100)[['low','high']].style.background_gradient(cmap='summer_r').format("{:.2f}")

In [None]:
# Female passengers only: column-normalised survival percentages for low/high sibling-spouse counts.
# Styler.set_precision was removed in pandas 2.0; an explicit format string renders the same two decimals.
(pd.crosstab([train[train['Sex']=='female'].Sex,train.Survived],train.sibsp,normalize='columns',margins=True)*100)[['low','high']].style.background_gradient(cmap='summer_r').format("{:.2f}")

In [None]:
df['SibSp_log']=np.log(1+df['SibSp'])

In [None]:
# Train vs test distribution of the log-transformed sibling/spouse count (df['SibSp_log']).
fig, ax=plt.subplots(facecolor='#f6f5f5', figsize=(9,5))
sns.histplot(df, x="SibSp_log", hue="species", ax=ax, element='poly')
# The title previously said "Fare log", copied over from the fare section.
fig.suptitle('Distribution of train & test set of Sibling/Spouse log', fontsize=20)
fig.show()

<span style='background:Orange; font-size:150%'>**Parent/children**</span>

In [None]:
fig, ax=plt.subplots(facecolor='#f6f5f5', figsize=(9,5))
sns.histplot(df, x="Parch", hue="species", bins=5, ax=ax)
fig.suptitle('Distribution of train & test set of Parent/children', fontsize=20)
fig.show()

In [None]:
fig, ax=plt.subplots(nrows=3, ncols=2, facecolor='#f6f5f5', figsize=(18, 30))
ax[0][0].pie(train.groupby(['Parch'])['Parch'].count(), explode=(0.1,0.1,0.1,0.1,0.4,0.5,0.6,0.7), labels=['0', '1', '2', '3', '4', '5', '6', '9'], autopct='%1.1f%%', shadow=False)
sns.histplot(train,x="Parch", ax=ax[0][1], hue_order=labels)
sns.histplot(train,x="Parch",hue="Survived", ax=ax[1][0], hue_order=labels)
sns.histplot(train[train["Sex"]=="male"],x="Parch",hue="Survived",ax=ax[2][0], hue_order=labels)
sns.histplot(train[train["Sex"]=="female"],x="Parch",hue="Survived",ax=ax[2][1], hue_order=labels)
ax[0][0].set_title("Distribution of population")
ax[0][1].set_title("Distribution of population")
ax[1][0].set_title("Survival")
ax[2][0].set_title("Survival of male")
ax[2][1].set_title("Survival of female")
fig.suptitle('Distribution of population based on Parent/children',fontsize=20)
ax[1][1].remove()
fig.show()

In [None]:
train['parch']=train['Parch'].apply(lambda x:'low' if x<1 else 'high')
df['parch']=df['Parch'].apply(lambda x:'low' if x<1 else 'high')

In [None]:
fig, ax=plt.subplots(facecolor='#f6f5f5', figsize=(9,5))
sns.histplot(df, x="parch", hue="species", ax=ax)
fig.suptitle('Distribution of train & test set of Parent/children', fontsize=20)
fig.show()

In [None]:
# Overview of the binned parent/children feature: share pie, overall histogram, and
# survival split for everyone, for males and for females.
fig, ax=plt.subplots(nrows=3, ncols=2, facecolor='#f6f5f5', figsize=(18, 15))
# groupby returns its keys in sorted order ('high' before 'low'), so the wedge labels
# are taken from the counts' own index. The previous hard-coded ['low','high'] put
# each label on the other group's wedge.
parch_counts=train.groupby(['parch'])['parch'].count()
ax[0][0].pie(parch_counts, explode=[0.1]*len(parch_counts), labels=parch_counts.index, autopct='%1.1f%%', shadow=False)
sns.histplot(train,x="parch", ax=ax[0][1])
sns.histplot(train,x="parch",hue="Survived", ax=ax[1][0], hue_order=labels)
sns.histplot(train[train["Sex"]=="male"],x="parch",hue="Survived",ax=ax[2][0], hue_order=labels)
sns.histplot(train[train["Sex"]=="female"],x="parch",hue="Survived",ax=ax[2][1], hue_order=labels)
ax[0][0].set_title("Distribution of population")
ax[0][1].set_title("Distribution of population")
ax[1][0].set_title("Survival")
ax[2][0].set_title("Survival of male")
ax[2][1].set_title("Survival of female")
fig.suptitle('Distribution of population based on Parent/children',fontsize=20)
# The middle-right panel is unused.
ax[1][1].remove()
fig.show()

In [None]:
pd.crosstab(train.Survived,train.parch,margins=True)[['low','high']].style.background_gradient(cmap='summer_r')

In [None]:
# Column-normalised survival percentages for low/high parent-children counts.
# Styler.set_precision was removed in pandas 2.0; an explicit format string renders the same two decimals.
(pd.crosstab(train.Survived,train.parch,normalize='columns',margins=True)*100)[['low','high']].style.background_gradient(cmap='summer_r').format("{:.2f}")

In [None]:
pd.crosstab([train.Sex,train.Survived],train.parch,margins=True)[['low','high']].style.background_gradient(cmap='summer_r')

In [None]:
# Male passengers only: column-normalised survival percentages for low/high parent-children counts.
# Styler.set_precision was removed in pandas 2.0; an explicit format string renders the same two decimals.
(pd.crosstab([train[train['Sex']=='male'].Sex,train.Survived],train.parch,normalize='columns',margins=True)*100)[['low','high']].style.background_gradient(cmap='summer_r').format("{:.2f}")

In [None]:
# Female passengers only: column-normalised survival percentages for low/high parent-children counts.
# Styler.set_precision was removed in pandas 2.0; an explicit format string renders the same two decimals.
(pd.crosstab([train[train['Sex']=='female'].Sex,train.Survived],train.parch,normalize='columns',margins=True)*100)[['low','high']].style.background_gradient(cmap='summer_r').format("{:.2f}")

In [None]:
df['Parch_log']=np.log(1+df['Parch'])

In [None]:
fig, ax=plt.subplots(facecolor='#f6f5f5', figsize=(9,5))
sns.histplot(df, x="Parch_log", hue="species", ax=ax, element='poly')
fig.suptitle('Distribution of train & test set of Parent/Children log', fontsize=20)
fig.show()

<span style='background:orange; font-size:150%'>**Related**</span>

In [None]:
train["related"]=train['SibSp']+train['Parch']
df["related"]=df['SibSp']+df['Parch']

In [None]:
train['related']=train['related'].apply(lambda x:'low' if x<1 else 'high')
df['related']=df['related'].apply(lambda x:'low' if x<1 else 'high')

In [None]:
fig, ax=plt.subplots(facecolor='#f6f5f5', figsize=(9,5))
sns.histplot(df, x="related", hue="species", ax=ax)
fig.suptitle('Distribution of train & test set of Relation', fontsize=20)
fig.show()

In [None]:
fig, ax=plt.subplots(nrows=3, ncols=2, facecolor='#f6f5f5', figsize=(18, 30))
ax[0][0].pie(train.groupby(['related'])['related'].count(), explode=(0.1,0.1), labels=['high','low'], autopct='%1.1f%%', shadow=False)
sns.histplot(train,x="related", ax=ax[0][1], hue_order=labels)
sns.histplot(train,x="related",hue="Survived", ax=ax[1][0], hue_order=labels)
sns.histplot(train[train["Sex"]=="male"],x="related",hue="Survived",ax=ax[2][0], hue_order=labels)
sns.histplot(train[train["Sex"]=="female"],x="related",hue="Survived",ax=ax[2][1], hue_order=labels)
ax[0][0].set_title("Distribution of population")
ax[0][1].set_title("Distribution of population")
ax[1][0].set_title("Survival")
ax[2][0].set_title("Survival of male")
ax[2][1].set_title("Survival of female")
fig.suptitle('Distribution of population based on Relation',fontsize=20)
ax[1][1].remove()
fig.show()

In [None]:
pd.crosstab(train.Survived,train.related,margins=True)[['low','high']].style.background_gradient(cmap='summer_r')

In [None]:
# Column-normalised survival percentages for low/high total relatives.
# Styler.set_precision was removed in pandas 2.0; an explicit format string renders the same two decimals.
(pd.crosstab(train.Survived,train.related,normalize='columns',margins=True)[['low','high']]*100).style.background_gradient(cmap='summer_r').format("{:.2f}")

In [None]:
pd.crosstab([train.Sex,train.Survived],train.related,margins=True)[['low','high']].style.background_gradient(cmap='summer_r')

In [None]:
# Male passengers only: column-normalised survival percentages for low/high total relatives.
# Styler.set_precision was removed in pandas 2.0; an explicit format string renders the same two decimals.
(pd.crosstab([train[train['Sex']=='male'].Sex,train.Survived],train.related,normalize='columns',margins=True)[['low','high']]*100).style.background_gradient(cmap='summer_r').format("{:.2f}")

In [None]:
# Female passengers only: column-normalised survival percentages for low/high total relatives.
# Styler.set_precision was removed in pandas 2.0; an explicit format string renders the same two decimals.
(pd.crosstab([train[train['Sex']=='female'].Sex,train.Survived],train.related,normalize='columns',margins=True)[['low','high']]*100).style.background_gradient(cmap='summer_r').format("{:.2f}")

In [None]:
# Log of (1 + siblings/spouses + parents/children), computed from df's own columns.
# The previous version took the values from `train`, whose row labels do not match the
# re-indexed combined frame: values landed on the wrong rows and the test rows could
# never receive one.
df['related_log']=np.log(1+df['SibSp']+df['Parch'])

In [None]:
fig, ax=plt.subplots(facecolor='#f6f5f5', figsize=(9,5))
sns.histplot(df, x="related_log", hue="species", ax=ax)
fig.suptitle('Distribution of train & test set of Relation log', fontsize=20)
fig.show()

<span style='background:orange; font-size:150%'>**Embarkation**</span>

In [None]:
fig, ax=plt.subplots(facecolor='#f6f5f5', figsize=(9,5))
sns.histplot(df, x="Embarked", hue="species", ax=ax)
fig.suptitle('Distribution of train & test set of Embarked', fontsize=20)
fig.show()

In [None]:
fig, ax=plt.subplots(nrows=3, ncols=2, facecolor='#f6f5f5', figsize=(18, 30))
ax[0][0].pie(train.groupby(['Embarked'])['Embarked'].count(), explode=(0.1,0.1,0.1), labels=['C','Q','S'], autopct='%1.1f%%', shadow=False)
sns.histplot(train,x="Embarked", ax=ax[0][1])
sns.histplot(train,x="Embarked",hue="Survived", ax=ax[1][0], hue_order=labels)
sns.histplot(train[train["Sex"]=="male"],x="Embarked",hue="Survived",ax=ax[2][0], hue_order=labels)
sns.histplot(train[train["Sex"]=="female"],x="Embarked",hue="Survived",ax=ax[2][1], hue_order=labels)
ax[0][0].set_title("Distribution of population")
ax[0][1].set_title("Distribution of population")
ax[1][0].set_title("Survival")
ax[2][0].set_title("Survival of male")
ax[2][1].set_title("Survival of female")
fig.suptitle('Distribution of population based on Embarked',fontsize=20)
ax[1][1].remove()
fig.show()

In [None]:
pd.crosstab(train.Survived,train.Embarked,margins=True)[['S','C','Q']].style.background_gradient(cmap='summer_r')

In [None]:
# Column-normalised survival percentages per port of embarkation.
# Styler.set_precision was removed in pandas 2.0; an explicit format string renders the same two decimals.
(pd.crosstab(train.Survived,train.Embarked,normalize='columns',margins=True)[['S','C','Q']]*100).style.background_gradient(cmap='summer_r').format("{:.2f}")

In [None]:
pd.crosstab([train.Sex,train.Survived],train.Embarked,margins=True)[['S','C','Q']].style.background_gradient(cmap='summer_r')

In [None]:
# Male passengers only: column-normalised survival percentages per port of embarkation.
# Styler.set_precision was removed in pandas 2.0; an explicit format string renders the same two decimals.
(pd.crosstab([train[train['Sex']=='male'].Sex,train.Survived],train.Embarked,normalize='columns',margins=True)[['S','C','Q']]*100).style.background_gradient(cmap='summer_r').format("{:.2f}")

In [None]:
# Female passengers only: column-normalised survival percentages per port of embarkation.
# Styler.set_precision was removed in pandas 2.0; an explicit format string renders the same two decimals.
(pd.crosstab([train[train['Sex']=='female'].Sex,train.Survived],train.Embarked,normalize='columns',margins=True)[['S','C','Q']]*100).style.background_gradient(cmap='summer_r').format("{:.2f}")

<span style='background:orange; font-size:150%'>**Name**</span>

In [None]:
# Count the words that are immediately followed by a dot in each name.
# The pattern is a raw string: '\.' in a normal literal is an invalid escape sequence
# that newer Python versions warn about. The compiled regex is unchanged.
print('Initials present in train set:\n',train.Name.str.extract(r'([A-Za-z]+)\.').value_counts())

In [None]:
# Count the words that are immediately followed by a dot in each name.
# The pattern is a raw string: '\.' in a normal literal is an invalid escape sequence
# that newer Python versions warn about. The compiled regex is unchanged.
print('Initials present in test set:\n',test.Name.str.extract(r'([A-Za-z]+)\.').value_counts())

In [None]:
len(train['Name'].str.split().str[0].unique())

In [None]:
len(test['Name'].str.split().str[0].unique())

<span style='background:orange; font-size:150%'>**Cabin**</span>

In [None]:
fig, ax=plt.subplots(facecolor='#f6f5f5', figsize=(9,5))
sns.histplot(df, x="Cabin", hue="species", ax=ax)
fig.suptitle("Distribution of Cabin in train & test set")
fig.show()

In [None]:
fig, ax=plt.subplots(nrows=3, ncols=2, facecolor='#f6f5f5', figsize=(18, 30))
ax[0][0].pie(train.groupby(['Cabin'])['Cabin'].count(), autopct='%1.1f%%', shadow=False)
sns.histplot(train,x="Cabin", ax=ax[0][1], hue_order=labels)
sns.histplot(train,x="Cabin", hue="Survived", ax=ax[1][0], hue_order=labels)
sns.histplot(train[train["Sex"]=="male"],x="Cabin",hue="Survived",ax=ax[2][0], hue_order=labels)
sns.histplot(train[train["Sex"]=="female"],x="Cabin",hue="Survived",ax=ax[2][1], hue_order=labels)
ax[0][0].set_title("Distribution of population")
ax[0][1].set_title("Distribution of population")
ax[1][0].set_title("Survival")
ax[2][0].set_title("Survival of male")
ax[2][1].set_title("Survival of female")
fig.suptitle('Distribution of population based on Cabin',fontsize=20)
ax[1][1].remove()
fig.show()

In [None]:
pd.crosstab(train.Survived,train.Cabin,margins=True)[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'X','All']].style.background_gradient(cmap='summer_r')

In [None]:
# Column-normalised survival percentages per cabin letter.
# Styler.set_precision was removed in pandas 2.0; an explicit format string renders the same two decimals.
(pd.crosstab(train.Survived,train.Cabin,margins=True, normalize='columns')[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'X', 'All']]*100).style.background_gradient(cmap='summer_r').format("{:.2f}")

In [None]:
pd.crosstab([train.Sex,train.Survived],train.Cabin,margins=True)[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'X', 'All']].style.background_gradient(cmap='summer_r')

In [None]:
# Male passengers only: column-normalised survival percentages per cabin letter.
# Styler.set_precision was removed in pandas 2.0; an explicit format string renders the same two decimals.
(pd.crosstab([train[train.Sex=="male"].Sex,train.Survived],train.Cabin,margins=True,normalize="columns")[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'X', 'All']]*100).style.background_gradient(cmap='summer_r').format("{:.2f}")

In [None]:
# Female passengers only: column-normalised survival percentages per cabin letter.
# Styler.set_precision was removed in pandas 2.0; an explicit format string renders the same two decimals.
(pd.crosstab([train[train.Sex=="female"].Sex,train.Survived],train.Cabin,margins=True,normalize="columns")[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'X', 'All']]*100).style.background_gradient(cmap='summer_r').format("{:.2f}")

<span style='background:orange; font-size:150%'>**Gender**</span>

In [None]:
fig, ax=plt.subplots(facecolor='#f6f5f5', figsize=(9,5))
sns.histplot(df, x="Sex", hue="species", ax=ax)
fig.suptitle('Distribution of train & test set of Gender', fontsize=20)
fig.show()

In [None]:
fig, ax=plt.subplots(nrows=3, ncols=2, facecolor='#f6f5f5', figsize=(18, 30))
ax[0][0].pie(train.groupby(['Sex'])['Sex'].count(), explode=(0.1,0.1), labels=['female','male'], autopct='%1.1f%%', shadow=False)
sns.histplot(train,x="Sex", ax=ax[0][1])
sns.histplot(train,x="Sex",hue="Survived", ax=ax[1][0], hue_order=labels)
sns.histplot(train[train["Sex"]=="male"],x="Sex",hue="Survived",ax=ax[2][0], hue_order=labels)
sns.histplot(train[train["Sex"]=="female"],x="Sex",hue="Survived",ax=ax[2][1], hue_order=labels)
ax[0][0].set_title("Distribution of population")
ax[0][1].set_title("Distribution of population")
ax[1][0].set_title("Survival")
ax[2][0].set_title("Survival of male")
ax[2][1].set_title("Survival of female")
fig.suptitle('Distribution of population based on Gender',fontsize=20)
ax[1][1].remove()
fig.show()

In [None]:
pd.crosstab(train.Survived,train.Sex,margins=True).style.background_gradient(cmap='summer_r')

In [None]:
# Column-normalised survival percentages per sex.
# Styler.set_precision was removed in pandas 2.0; an explicit format string renders the same two decimals.
(pd.crosstab(train.Survived,train.Sex,normalize='columns',margins=True)*100).style.background_gradient(cmap='summer_r').format("{:.2f}")

<span style='background:#5CB3FF; font-size:150%'>**Observations**</span>
* Distribution of Pclass in train & test set is different.
* Survival of people is based on the standard Pclass_1 > Pclass_2 > Pclass_3.
* Distribution of SibSp in train & test set is similar.
* SibSp has skewed distribution so we break it into low & high categories to handle the outliers.
* With increase in SibSp survival ratio is reducing for female and increasing for male.
* Distribution of Parch in train & test set is similar.
* Parch has skewed distribution so we break it into low & high categories to handle the outliers.
* With increase in Parch survival ratio is reducing for female and increasing for male.
* Since information in SibSp and Parch is similar we take it into related to get total number of relations.
* Distribution of Related in train & test set is similar.
* Related has skewed distribution so we break it into low & high categories to handle the outliers.
* With increase in relations survival ratio is reducing for female and increasing for male.
* Distribution of Embarked in train & test set is different.
* Survival of people is based on the port of embarkation C > Q > S.
* No initials present in the Name so this column will be useless for us.
* Distribution of Gender in train & test set is different.
* Survival of female is 71.36% among females.
* Survival of male is 20.22% among males.
* Overall, females were far more likely to survive than males.

<h2 style="background-color:azure; text-align:center; font-size:300%">3. Feature Engineering</h2>

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Reload the raw CSVs: the EDA section overwrote the numeric `Survived` flag with
# "survived"/"dead" labels, and feature engineering needs the original values.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-apr-2021/train.csv',
        '../input/tabular-playground-series-apr-2021/test.csv',
    )
)

In [None]:
# Per-sex median Age (2 decimals) for each frame, computed first and then reported.
train_age_median = train.groupby(['Sex'])['Age'].median().round(2)
test_age_median = test.groupby(['Sex'])['Age'].median().round(2)

print("Median age of male and female in train:\n", train_age_median)
print("\nMedian age of male and female in test:\n", test_age_median)

In [None]:
# Fill missing ages with the per-sex median of the same frame instead of
# hard-coded numbers copied from the printed output, so the imputation cannot
# go stale when the data changes. Medians are rounded to 2 decimals, matching
# what the preceding cell prints.
# NOTE(review): test is imputed from its own medians, as before; consider
# reusing the train medians to keep preprocessing consistent.
for frame in (train, test):
    median_age_by_sex = frame.groupby('Sex')['Age'].median().round(2)
    frame['Age'] = frame['Age'].fillna(frame['Sex'].map(median_age_by_sex))
    # log(1 + x) compresses the right tail and is defined at Age == 0.
    frame['Age_log'] = np.log(1 + frame['Age'])

In [None]:
# Sanity check: does either frame still contain a missing Age? (train first, then test)
print(train['Age'].isna().any(), test['Age'].isna().any(), sep='\n')

In [None]:
# Per-sex median Fare (2 decimals) for each frame.
# The labels previously said "Median age" although the values are fares.
print("Median fare of male and female in train:\n", train.groupby(['Sex'])['Fare'].median().round(2))
print("\nMedian fare of male and female in test:\n", test.groupby(['Sex'])['Fare'].median().round(2))

In [None]:
# Fill missing fares with the per-sex median of the same frame instead of
# hard-coded numbers copied from the printed output. Medians are rounded to
# 2 decimals, matching what the preceding cell prints.
# NOTE(review): test is imputed from its own medians, as before; consider
# reusing the train medians to keep preprocessing consistent.
for frame in (train, test):
    median_fare_by_sex = frame.groupby('Sex')['Fare'].median().round(2)
    frame['Fare'] = frame['Fare'].fillna(frame['Sex'].map(median_fare_by_sex))
    # log(1 + x) compresses the right tail and is defined at Fare == 0.
    frame['Fare_log'] = np.log(1 + frame['Fare'])

In [None]:
# Sanity check: does either frame still contain a missing Fare? (train first, then test)
print(train['Fare'].isna().any(), test['Fare'].isna().any(), sep='\n')

In [None]:
# Mark unknown embarkation ports with the placeholder category 'X'.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: the in-place form acts on a temporary object and is a no-op
# under pandas copy-on-write.
train['Embarked'] = train['Embarked'].fillna('X')
test['Embarked'] = test['Embarked'].fillna('X')

In [None]:
# Sanity check: does either frame still contain a missing Embarked? (train first, then test)
print(train['Embarked'].isna().any(), test['Embarked'].isna().any(), sep='\n')

In [None]:
# Reduce Cabin to its first character, using 'X' for unknown cabins.
# The result is assigned back rather than filled with inplace=True on a column
# selection, which is a no-op under pandas copy-on-write.
for frame in (train, test):
    frame['Cabin'] = frame['Cabin'].fillna('X').str[0]

In [None]:
# Sanity check: does either frame still contain a missing Cabin? (train first, then test)
print(train['Cabin'].isna().any(), test['Cabin'].isna().any(), sep='\n')

In [None]:
# Remaining missing values per column after imputation.
# The previous `.isnull().isnull()` tested the boolean mask itself for nulls,
# which is never null, so it reported 0 for every column regardless of the data.
print(train.isnull().sum())
print('\n')
print(test.isnull().sum())

In [None]:
# Derived features, applied identically to train and test.
# NOTE: this cell is not idempotent. SibSp, Parch and Pclass are overwritten in
# place, so re-running it without reloading the data shifts/remaps them again.

FARE_LABELS = ['low', 'med', 'high']
# SibSp and Parch are each shifted by +1 below, so a passenger travelling
# alone ends up with related == 2.
RELATED_SHIFT = 2
# Merge sparse decks into neighbours: F/G -> E, D -> C, T -> unknown ('X').
CABIN_MERGE = {'F': 'E', 'G': 'E', 'D': 'C', 'T': 'X'}

for frame in (train, test):
    # Fare terciles.
    # NOTE(review): bin edges are computed per frame, so the same fare can get a
    # different label in train and test; consider reusing the train edges.
    frame['Fare_cat'] = pd.qcut(frame['Fare'], 3, labels=FARE_LABELS)

    # Reverse the class code so a larger number means a better class (3 -> 1, 2 -> 2, else 3).
    frame['Pclass'] = frame['Pclass'].apply(lambda x: 1 if x == 3 else 2 if x == 2 else 3)

    frame['SibSp'] = frame['SibSp'] + 1
    frame['Parch'] = frame['Parch'] + 1
    frame['related'] = frame['SibSp'] + frame['Parch']

    for col in ('related', 'SibSp', 'Parch'):
        frame[col + '_log'] = np.log(1 + frame[col])

    # The original threshold `x < 1` was never true after the shifts above
    # (related >= 2), so every row was labelled 'high'. Compare against the
    # shifted baseline instead, so 'low' means "no relatives".
    # NOTE(review): confirm this is the intended split.
    frame['related_cat'] = frame['related'].apply(
        lambda x: 'low' if x < 1 + RELATED_SHIFT else 'high'
    )

    frame['Cabin'] = frame['Cabin'].replace(CABIN_MERGE)

In [None]:
# Remaining missing values per column after feature engineering.
# The previous `.isnull().isnull()` tested the boolean mask itself for nulls,
# which is never null, so it reported 0 for every column regardless of the data.
print(train.isnull().sum())
print('\n')
print(test.isnull().sum())

In [None]:
# Side-by-side Pearson correlation heatmaps of the numeric features
# (raw and log-transformed) for the train and test sets.
fig, ax = plt.subplots(ncols=2, facecolor='#f6f5f5', figsize=(24, 14))
cols=['Age','Age_log','Fare','Fare_log','SibSp','SibSp_log','Parch','Parch_log','related','related_log']
dcorr=train[cols].corr('pearson')
tcorr=test[cols].corr('pearson')
# Hide the upper triangle (and diagonal): the matrix is symmetric.
# `np.bool` was removed in NumPy 1.24; the builtin `bool` is the supported spelling.
mask = np.triu(np.ones_like(dcorr, dtype=bool))

sns.heatmap(dcorr, mask=mask, ax=ax[0], annot=True, fmt=".2f", cmap='coolwarm')
sns.heatmap(tcorr, mask=mask, ax=ax[1], annot=True, fmt=".2f", cmap='coolwarm')
ax[0].set_title('Correlation of train set')
ax[1].set_title('Correlation of test set')
fig.suptitle('Correlation Matrix', fontsize=20)
fig.show()

In [None]:
# Bar chart of the correlation of each numeric feature with the target `Survived`.
background_color = "#f6f5f5"
cols=['Age','Age_log','Fare','Fare_log','SibSp','SibSp_log','Parch','Parch_log','related','related_log']

fig = plt.figure(figsize=(12, 8), facecolor=background_color)
gs = fig.add_gridspec(1, 1)
ax0 = fig.add_subplot(gs[0, 0])
ax0.set_facecolor(background_color)

# Title and subtitle are drawn as free text in data coordinates.
for y_pos, caption, text_style in (
    (0.26, 'Correlation of Continuous Features with Target', dict(fontsize=20, fontweight='bold')),
    (0.24, 'There is no features that pass 0.261 correlation with target', dict(fontsize=13, fontweight='light')),
):
    ax0.text(-1.1, y_pos, caption, fontfamily='serif', **text_style)

chart_df = train[cols].corrwith(train['Survived']).to_frame('corr')
sns.barplot(x=chart_df.index, y=chart_df['corr'], ax=ax0, color='RoyalBlue', zorder=3, edgecolor='black', linewidth=1.5)
ax0.grid(which='major', axis='y', zorder=0, color='gray', linestyle=':', dashes=(1,5))
ax0.set_ylabel('')

# Keep only the bottom spine.
for side in ("top", "right", "left"):
    ax0.spines[side].set_visible(False)

plt.show()

In [None]:
# Preview the engineered training frame.
train.head(n=5)

In [None]:
# Preview the engineered test frame.
test.head(n=5)

In [None]:
# Persist the engineered frames to the working directory (row index omitted).
for frame, out_path in ((train, './train.csv'), (test, './test.csv')):
    frame.to_csv(out_path, index=False)

In [None]:
# train3=train.copy()
# test3=test.copy()
# train3.to_csv('./train3.csv',index=False)
# test3.to_csv('./test3.csv',index=False)

In [None]:
# train1=train.copy()
# test1=test.copy()
# train1.to_csv('./train1.csv',index=False)
# test1.to_csv('./test1.csv',index=False)

In [None]:
# sc=StandardScaler()
# sc.fit(train['Age'].values.reshape(-1, 1))
# train['Age']=sc.transform(train['Age'].values.reshape(-1, 1))
# test['Age']=sc.transform(test['Age'].values.reshape(-1, 1))

In [None]:
# train['Fare']=train['Fare'].apply(lambda x:1.0 if x is 'low' else 2.0 if x is 'med' else 3.0)
# test['Fare']=test['Fare'].apply(lambda x:1.0 if x is 'low' else 2.0 if x is 'med' else 3.0)


# train['related']=train['related'].apply(lambda x:1.0 if x is 'low' else 2.0)
# test['related']=test['related'].apply(lambda x:1.0 if x is 'low' else 2.0)

In [None]:
# ohe=OneHotEncoder()
# col=['Sex','Embarked']
# ohe.fit(train[col])
# df1=pd.DataFrame(columns=ohe.get_feature_names(col),data=ohe.transform(train[col]).toarray())
# df2=pd.DataFrame(columns=ohe.get_feature_names(col),data=ohe.transform(test[col]).toarray())

In [None]:
# df1.head()

In [None]:
# df2.head()

In [None]:
# train=train.join(df1)
# test=test.join(df2)

In [None]:
# train.head()

In [None]:
# test.head()

In [None]:
# train2=train.copy()
# test2=test.copy()
# train2.to_csv('./train2.csv',index=False)
# test2.to_csv('./test2.csv',index=False)

In [None]:
# train.drop(columns=['PassengerId','Name','Cabin','Ticket','SibSp','Parch','Embarked','Sex'], inplace=True)
# test.drop(columns=['PassengerId','Name','Cabin','Ticket','SibSp','Parch','Embarked','Sex'], inplace=True)

In [None]:
# train=train.astype(float)
# test=test.astype(float)

In [None]:
# train.head()

In [None]:
# Preview the test frame as it enters modelling.
test.head(n=5)

<h2 style="background-color:azure; text-align:center; font-size:300%">4. Base Model</h2>

In [None]:
from sklearn.model_selection import StratifiedKFold, train_test_split
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import roc_auc_score

In [None]:
# Build the model matrix and carve out a 20% hold-out set.
# `train.columns[1:]` only dropped the first column, so the target `Survived`
# stayed in X (label leakage), together with free-text/categorical columns the
# tree libraries cannot consume as-is. Select features explicitly instead:
# numeric columns that also exist in `test`, excluding the id and the target.
# NOTE(review): categorical columns (Sex, Embarked, Cabin, Fare_cat, related_cat)
# are left out until they are encoded; see the commented-out OneHotEncoder cells.
TARGET = 'Survived'
ID_COL = 'PassengerId'
features = [
    col
    for col in train.select_dtypes(include='number').columns
    if col not in (TARGET, ID_COL) and col in test.columns
]
X = train[features]
y = train[TARGET]
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=39)

In [None]:
# Row counts of (x_train, y_train, x_test, y_test).
tuple(len(part) for part in (x_train, y_train, x_test, y_test))

In [None]:
# Shared cross-validation setup for all base models.
folds = 10  # number of stratified folds; also the divisor when averaging fold predictions
kf = StratifiedKFold(
    n_splits=folds,
    shuffle=True,
    random_state=57,
)
# Collects one column of averaged competition-test probabilities per model.
score = pd.DataFrame()

In [None]:
# XGBoost baseline: train one model per CV fold and average the predicted
# P(Survived=1) over folds, for the hold-out split and the competition test set.
xgb_test_preds = np.zeros(len(x_test))
xgb_TEST_preds = np.zeros(len(test))

# Score the competition test set on exactly the columns (and column order) used
# for training; passing the whole `test` frame hands the model extra or
# mismatched columns.
competition_X = test[x_train.columns]

for fold, (train_ind, val_ind) in enumerate(kf.split(x_train, y_train)):
    print("--> Fold {}".format(fold + 1))
    xtrain, xval = x_train.iloc[train_ind], x_train.iloc[val_ind]
    ytrain, yval = y_train.iloc[train_ind], y_train.iloc[val_ind]
    # NOTE(review): `gpu_id` and fit-time `early_stopping_rounds` are deprecated
    # in recent xgboost releases; confirm against the installed version.
    xgb = XGBClassifier(eval_metric="auc", gpu_id=0)

    model = xgb.fit(xtrain, ytrain, eval_set=[(xval, yval)], early_stopping_rounds=50, verbose=True)
    pred_train = model.predict_proba(xtrain)[:, 1]
    pred_val = model.predict_proba(xval)[:, 1]
    pred_test = model.predict_proba(x_test)[:, 1]
    pred_TEST = model.predict_proba(competition_X)[:, 1]
    xgb_test_preds += pred_test / folds
    xgb_TEST_preds += pred_TEST / folds
    score1 = roc_auc_score(ytrain, pred_train)
    score2 = roc_auc_score(yval, pred_val)
    score3 = roc_auc_score(y_test, pred_test)
    print('Fold {} AUC Train: {} Validation: {} Test: {}'.format(fold+1, score1, score2, score3))

# This is the AUC of the fold-averaged predictions on the hold-out split, not
# an out-of-fold score, so label it accordingly.
print('Hold-out AUC: {}'.format(roc_auc_score(y_test, xgb_test_preds)))
# average_score['xgboost'] = xgb_test_preds
score['xgboost'] = xgb_TEST_preds

In [None]:
# CatBoost baseline: train one model per CV fold and average the predicted
# P(Survived=1) over folds, for the hold-out split and the competition test set.
cat_test_preds = np.zeros(len(x_test))
cat_TEST_preds = np.zeros(len(test))

# Score the competition test set on exactly the columns (and column order) used
# for training; passing the whole `test` frame hands the model extra or
# mismatched columns.
competition_X = test[x_train.columns]

for fold, (train_ind, val_ind) in enumerate(kf.split(x_train, y_train)):
    print("--> Fold {}".format(fold + 1))
    xtrain, xval = x_train.iloc[train_ind], x_train.iloc[val_ind]
    ytrain, yval = y_train.iloc[train_ind], y_train.iloc[val_ind]
    # CatBoost metric names are case-sensitive: "AUC", not "auc".
    cat = CatBoostClassifier(eval_metric="AUC", task_type="GPU", devices="0")

    # Fit the CatBoost model. The previous code called `xgb.fit(...)` here, so
    # the "catboost" column was just a second XGBoost run.
    model = cat.fit(xtrain, ytrain, eval_set=[(xval, yval)], early_stopping_rounds=50, verbose=True)
    pred_train = model.predict_proba(xtrain)[:, 1]
    pred_val = model.predict_proba(xval)[:, 1]
    pred_test = model.predict_proba(x_test)[:, 1]
    pred_TEST = model.predict_proba(competition_X)[:, 1]
    cat_test_preds += pred_test / folds
    cat_TEST_preds += pred_TEST / folds
    score1 = roc_auc_score(ytrain, pred_train)
    score2 = roc_auc_score(yval, pred_val)
    score3 = roc_auc_score(y_test, pred_test)
    print('Fold {} AUC Train: {} Validation: {} Test: {}'.format(fold+1, score1, score2, score3))

# This is the AUC of the fold-averaged predictions on the hold-out split, not
# an out-of-fold score, so label it accordingly.
print('Hold-out AUC: {}'.format(roc_auc_score(y_test, cat_test_preds)))
# average_score['catboost'] = cat_test_preds
score['catboost'] = cat_TEST_preds

In [None]:
# LightGBM baseline: train one model per CV fold and average the predicted
# P(Survived=1) over folds, for the hold-out split and the competition test set.
lgbm_test_preds = np.zeros(len(x_test))
lgbm_TEST_preds = np.zeros(len(test))

# Score the competition test set on exactly the columns (and column order) used
# for training; passing the whole `test` frame hands the model extra or
# mismatched columns.
competition_X = test[x_train.columns]

for fold, (train_ind, val_ind) in enumerate(kf.split(x_train, y_train)):
    print("--> Fold {}".format(fold + 1))
    xtrain, xval = x_train.iloc[train_ind], x_train.iloc[val_ind]
    ytrain, yval = y_train.iloc[train_ind], y_train.iloc[val_ind]
    lgbm = LGBMClassifier(random_state=2021, n_jobs=-1)
    # NOTE(review): fit-time `verbose` was removed in recent lightgbm releases;
    # confirm against the installed version.
    model = lgbm.fit(xtrain, ytrain, eval_set=[(xval, yval)], eval_metric="auc", verbose=True)
    pred_train = model.predict_proba(xtrain)[:, 1]
    pred_val = model.predict_proba(xval)[:, 1]
    pred_test = model.predict_proba(x_test)[:, 1]
    pred_TEST = model.predict_proba(competition_X)[:, 1]
    lgbm_test_preds += pred_test / folds
    lgbm_TEST_preds += pred_TEST / folds
    score1 = roc_auc_score(ytrain, pred_train)
    score2 = roc_auc_score(yval, pred_val)
    score3 = roc_auc_score(y_test, pred_test)
    print('Fold {} AUC Train: {} Validation: {} Test: {}'.format(fold+1, score1, score2, score3))

# This is the AUC of the fold-averaged predictions on the hold-out split, not
# an out-of-fold score, so label it accordingly.
print('Hold-out AUC: {}'.format(roc_auc_score(y_test, lgbm_test_preds)))
# average_score['lgbm'] = lgbm_test_preds
score['lgbm'] = lgbm_TEST_preds

In [None]:
# Untouched copy of the raw test file; the submission cells read PassengerId from it.
test_ = pd.read_csv(filepath_or_buffer='../input/tabular-playground-series-apr-2021/test.csv')

In [None]:
# XGBoost submission: threshold the averaged probability at 0.5 into a 0/1 label.
df = pd.DataFrame({
    'PassengerId': test_['PassengerId'].values,
    'Survived': score['xgboost'].ge(0.5).astype('int64'),
})
df.to_csv('./xgboost_basemodel.csv', index=False)

In [None]:
# CatBoost submission: threshold the averaged probability at 0.5 into a 0/1 label.
df = pd.DataFrame({
    'PassengerId': test_['PassengerId'].values,
    'Survived': score['catboost'].ge(0.5).astype('int64'),
})
df.to_csv('./catboost_basemodel.csv', index=False)

In [None]:
# LightGBM submission: threshold the averaged probability at 0.5 into a 0/1 label.
df = pd.DataFrame({
    'PassengerId': test_['PassengerId'].values,
    'Survived': score['lgbm'].ge(0.5).astype('int64'),
})
df.to_csv('./lgbm_basemodel.csv', index=False)

In [None]:
# Blend submission. Despite the column name, this is a weighted average of the
# three models' probabilities (0.4 / 0.4 / 0.2), thresholded at 0.5.
score['majorityvote'] = .4*score['xgboost'] + 0.4*score['catboost'] + 0.2*score['lgbm']
df = pd.DataFrame({
    'PassengerId': test_['PassengerId'].values,
    'Survived': score['majorityvote'].ge(0.5).astype('int64'),
})
df.to_csv('./majorityvote_basemodel.csv', index=False)