In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the dataset paths used by the loading cells can be checked at a glance.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

<font size=5><b>Attributes</b></font>

* <font size=3>survival-Survival-0 = No, 1 = Yes</font>
* <font size=3>pclass-Ticket class-1 = 1st, 2 = 2nd, 3 = 3rd</font>
* <font size=3>sex-Sex</font>
* <font size=3>Age-Age in years</font>
* <font size=3>sibsp-# of siblings / spouses aboard the Titanic</font>
* <font size=3>parch-# of parents / children aboard the Titanic</font>
* <font size=3>ticket-Ticket number</font>
* <font size=3>fare-Passenger fare</font>
* <font size=3>cabin-Cabin number</font>
* <font size=3>embarked-Port of Embarkation-C = Cherbourg, Q = Queenstown, S = Southampton</font>

In [None]:
import missingno as msno
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier,StackingClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# Training split of the Tabular Playground Series (April 2021) data.
df = pd.read_csv('../input/tabular-playground-series-apr-2021/train.csv')
df.head()

In [None]:
# Test split of the same competition; kept untouched in `X_test` so that
# its PassengerId column is still available when the submission is built.
X_test = pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv')
X_test.head()

<font size=5><b>checking for missing values</b></font>

In [None]:
# missingno matrix of the training frame: a visual overview of where values
# are missing in each column.
msno.matrix(df)

In [None]:
# Column dtypes and non-null counts for the training frame.
df.info()

<font size=4><b>Checking for unique values</b></font>

In [None]:
# Print the distinct non-missing values of the label and of each
# low-cardinality feature.
cols = ['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']

for col in cols:
    distinct_values = df[col].dropna().unique()
    print(f'Unique values in {col}:\n', distinct_values)
    print('\n')

<font size=4><b>Let's analyze the categorical columns and how survival varies across their values</b></font>

In [None]:
# Count plots of each categorical feature, with bars split by the Survived label.
cols = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']

# The bundled 'seaborn' style sheet was renamed to 'seaborn-v0_8' in
# matplotlib 3.6 and the old name was removed afterwards, so use whichever
# name this matplotlib installation actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Size the figure once up front instead of resizing it on every iteration.
fig = plt.figure(figsize=(10, 11))
for i, col in enumerate(cols, start=1):
    # 3x2 grid; only the panels that are needed get created.
    ax = fig.add_subplot(3, 2, i)
    sns.countplot(x=col, data=df, hue='Survived', palette='tab10', ax=ax)
    ax.set_xlabel(col, fontsize=14)
    ax.set_ylabel('count', fontsize=14)

# Lay the grid out a single time, after every panel exists.
fig.tight_layout()

<font size=5><b>Age</b></font>

* <font size=4><b>Exploring the class of travel and analysing age range for particular class</b></font>

In [None]:
# 2-D density of Age against ticket class.
px.density_heatmap(df, x='Pclass', y='Age', width=550, height=450)

* <font size=4><b>Checking for frequency of a particular age group for each gender</b></font>

In [None]:
# 2-D density of Age against Sex.
px.density_heatmap(df, x='Sex', y='Age', width=550, height=450)

* <font size=4><b>Checking for No. of parent/children of passengers across all age groups</b></font>

In [None]:
# 2-D density of Age against the Parch count.
px.density_heatmap(df, x='Parch', y='Age', width=550, height=450)

* <font size=4><b>Checking the number of siblings/spouses of passengers across all age groups</b></font>

In [None]:
# 2-D density of Age against the SibSp count.
px.density_heatmap(df, x='SibSp', y='Age', width=550, height=450)

* <font size=4><b>Checking for Survival rate for each of the age groups.</b></font>

In [None]:
# 2-D density of Age against the Survived label.
px.density_heatmap(df, x='Survived', y='Age', width=550, height=450)

<font size=5><b>Fare</b></font>

* <font size=4><b>How does the Fare vary across the Ticket Class ? </b></font>

In [None]:
# 2-D density of Fare against ticket class.
px.density_heatmap(df, x='Pclass', y='Fare', width=550, height=450)

* <font size=4><b>How does the Fare vary by port of embarkation?</b></font>

In [None]:
# 2-D density of Fare against the port of embarkation.
px.density_heatmap(df, x='Embarked', y='Fare', width=550, height=450)

<font size=5><b>Statistical Analysis</b></font>

<font size=5><b>Age</b></font>

In [None]:
# Age distribution: histogram (left) and box plot (right) on one 11x6 inch figure.
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(11, 6))

df['Age'].hist(ax=hist_ax, bins=20, edgecolor='white')
hist_ax.set_xlabel('Age', fontsize=14)
hist_ax.set_ylabel('count', fontsize=14)

# Red box and whiskers, blue median line.
df['Age'].plot.box(
    ax=box_ax,
    boxprops=dict(linewidth=1.5, color='red'),
    whiskerprops=dict(linewidth=1.5, color='red'),
    medianprops=dict(linewidth=1.5, color='blue'),
)

<font size=5><b>Fare</b></font>

In [None]:
# Fare distribution: histogram (left) and box plot (right) on one 11x6 inch figure.
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(11, 6))

df['Fare'].hist(ax=hist_ax, bins=20, edgecolor='white')
hist_ax.set_xlabel('Fare', fontsize=14)
hist_ax.set_ylabel('count', fontsize=14)

# Red box and whiskers, blue median line.
df['Fare'].plot.box(
    ax=box_ax,
    boxprops=dict(linewidth=1.5, color='red'),
    whiskerprops=dict(linewidth=1.5, color='red'),
    medianprops=dict(linewidth=1.5, color='blue'),
)

In [None]:
# Work on a copy of the test frame so that the preprocessing below leaves
# the originally loaded `X_test` unmodified.
X_test_copy=X_test.copy()

<font size=4><b>Imputing missing values in categorical features with the most frequent occurrences. Imputing missing values in numerical features with mean.</b></font>

In [None]:
# Fill missing values in the training frame and in the working copy of the
# test frame: categorical features get the most frequent value (mode),
# numerical features get the mean. Each frame is filled from its own statistics.
cat_cols = ['Pclass', 'Sex', 'SibSp', 'Parch',
            'Ticket', 'Cabin', 'Embarked']
num_cols = ['Age', 'Fare']

for frame in (df, X_test_copy):
    # The loop variable must be the name the body indexes with; otherwise a
    # stale `col` left over from an earlier cell would be imputed on every
    # pass and the listed categorical columns would be left untouched.
    for col in cat_cols:
        frame[col] = frame[col].fillna(frame[col].mode()[0])

    for col in num_cols:
        frame[col] = frame[col].fillna(frame[col].mean())

<font size=4><b>Detecting and removing outliers in Age and Fare features using Inter-Quartile Range</b></font>

In [None]:
# Count Age outliers with the 1.5 * IQR rule: anything below
# Q1 - 1.5*IQR or above Q3 + 1.5*IQR is treated as an outlier.
q75, q25 = np.percentile(df['Age'], [75, 25])
intr_qr = q75 - q25

# Dedicated names for the fences so the built-in min()/max() are not
# shadowed by this cell.
age_upper = q75 + (1.5 * intr_qr)
age_lower = q25 - (1.5 * intr_qr)

# Vectorised comparison instead of a Python-level loop over every row.
age_is_outlier = (df['Age'] < age_lower) | (df['Age'] > age_upper)
outlier_age = df.loc[age_is_outlier, 'Age'].tolist()
count = len(outlier_age)

print('No of outliers in Age column:', count)
        

In [None]:
# Collect Fare outliers with the 1.5 * IQR rule, ignoring missing fares.
fare_values = df['Fare'].dropna()
q75, q25 = np.percentile(fare_values, [75, 25])
intr_qr = q75 - q25

# NOTE: `max` and `min` shadow the built-ins. The names are kept because the
# post-imputation check in a later cell reads these two fences.
max = q75 + (1.5 * intr_qr)
min = q25 - (1.5 * intr_qr)

# Keep every fare that falls outside the fences, in row order.
outside_fences = (fare_values < min) | (fare_values > max)
outlier_fare = fare_values[outside_fences].tolist()
count = len(outlier_fare)

print('No of outliers in Fare column:', count)

<font size=4><b>Below we replace outliers in Fare feature with the mean of Fare feature</b></font>

In [None]:
# Replace every Fare value that was collected in `outlier_fare` with the
# column mean.
# The mean is computed once, before anything is replaced, so every outlier
# receives the same value; recomputing it after each replacement would make
# the fill value drift and would rescan the whole column once per outlier.
fare_mean = df['Fare'].mean()
df.loc[df['Fare'].isin(set(outlier_fare)), 'Fare'] = fare_mean

<font size=4><b>Checking for outliers after imputing the Fare column</b></font>

In [None]:
# Box plot of Fare after the replacement step (red box/whiskers, blue median).
df['Fare'].plot.box(
    figsize=(9, 5),
    boxprops=dict(linewidth=1.5, color='red'),
    whiskerprops=dict(linewidth=1.5, color='red'),
    medianprops=dict(linewidth=1.5, color='blue'),
)

# Count the fares still outside the fences; `min` and `max` here are the
# fence values assigned by the Fare outlier-detection cell, not the built-ins.
still_outside = (df['Fare'] < min) | (df['Fare'] > max)
print('Outliers after imputing:', int(still_outside.sum()))

In [None]:
# Propagate the previous row's value into any remaining gaps in Ticket and
# Cabin, for both the training frame and the test copy.
cols = ['Ticket', 'Cabin']
for col in cols:
    # .ffill() replaces the deprecated fillna(method='ffill'). The trailing
    # .bfill() covers gaps at the very top of a column, which have no
    # previous row to copy from and would otherwise stay missing.
    df[col] = df[col].ffill().bfill()
    X_test_copy[col] = X_test_copy[col].ffill().bfill()

<font size=4><b>Splitting the Cabin attribute and creating features Cabin code and Cabin number</b></font>

In [None]:
# Split every Cabin label at the boundary between a non-digit character and
# the digit that follows it, giving a code part and a number part.
# Raw string: '\D' inside a normal string literal is an invalid escape sequence.
cabin_boundary = r'(?<=\D)(?=\d)'

# Each frame is split from its OWN Cabin column; the test features must not
# be built from the training cabins.
for frame in (df, X_test_copy):
    codes = frame['Cabin'].str.split(cabin_boundary)
    frame['Cabin_code'] = codes.str[0]
    frame['Cabin_number'] = codes.str[1]

In [None]:
# Cabin_number comes out of the string split as text; cast it to integers
# in both the training frame and the test copy.
for frame in (df, X_test_copy):
    frame['Cabin_number'] = frame['Cabin_number'].astype('int')

In [None]:
# The raw Ticket and Cabin columns are no longer needed in the training frame.
df = df.drop(columns=['Ticket', 'Cabin'])

In [None]:
# Preview the training frame after the Ticket and Cabin columns were dropped.
df.head()

In [None]:
# Feature matrix: everything except the label and the passenger name.
X = df.drop(columns=['Survived', 'Name'])
# Target vector: the Survived label.
y = df['Survived']

In [None]:
# Remove the same raw text columns from the test copy that are absent
# from the training features.
X_test_copy = X_test_copy.drop(columns=['Ticket', 'Cabin', 'Name'])

<font size=4><b>Encoding Categorical features</b></font>

In [None]:
# Integer-encode the string-valued features of the training features and the
# test copy.
le = LabelEncoder()
for feature in ['Sex', 'Embarked', 'Cabin_code']:
    # Fit ONE mapping on the values seen in either frame, then apply it to
    # both. Fitting each frame separately can assign different integers to
    # the same category whenever the two frames do not contain exactly the
    # same set of values, which silently corrupts the test predictions.
    le.fit(pd.concat([X[feature], X_test_copy[feature]]))
    X[feature] = le.transform(X[feature])
    X_test_copy[feature] = le.transform(X_test_copy[feature])

<font size=5><b>Scaling</b></font>

In [None]:
# Standardise every column to zero mean and unit standard deviation.
# NOTE(review): each frame is scaled with its own column statistics, so the
# train and test transforms are not the same mapping - confirm this is intended.
X = X.sub(X.mean()).div(X.std())
X_test_copy = X_test_copy.sub(X_test_copy.mean()).div(X_test_copy.std())

In [None]:
# Hold out 30% of the training data for validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Fit each baseline classifier with default hyper-parameters and print its
# classification report on the validation split.
models = [RandomForestClassifier, GradientBoostingClassifier,
          AdaBoostClassifier, LGBMClassifier]
for model in models:
    # Fixed seed (the same value used for the train/validation split) so
    # the reported scores are reproducible from one run to the next.
    modell = model(random_state=42)
    modell.fit(X_train, y_train)
    preds = modell.predict(X_val)
    print(modell)
    print(classification_report(y_val, preds))
    print('-' * 25)
    print('\n\n')

<font size=5><b>Stacking models</b></font>

In [None]:
# Stacked ensemble: the four tree-based classifiers are the base learners
# and a logistic regression combines their cross-validated predictions.
# Every base learner gets a fixed seed so the fitted ensemble, and therefore
# the validation report and the submission, are reproducible.
step0 = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42)),
    ('ada', AdaBoostClassifier(random_state=42)),
    ('lgbm', LGBMClassifier(random_state=42)),
]

step1 = LogisticRegression()

model = StackingClassifier(estimators=step0, final_estimator=step1, cv=5)

model.fit(X_train, y_train)
preds = model.predict(X_val)
print('classification report:\n', classification_report(y_val, preds))
print('confusion matrix:\n', confusion_matrix(y_val, preds))

In [None]:
# Predict the Survived label for the preprocessed test copy using the
# fitted stacking model.
predictions=model.predict(X_test_copy)

In [None]:
# Build the submission table: PassengerId from the untouched test frame
# next to the predicted label, written without the index column.
submission = X_test[['PassengerId']].assign(Survived=predictions)
submission.to_csv('my_submissions.csv', index=False)