In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Imports

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_selector
from sklearn.impute import KNNImputer
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_selector
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides any deprecation or
# pandas SettingWithCopy warnings that later cells may raise.
warnings.filterwarnings("ignore")

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

### This is my first Kaggle competition and I thought it would be great to share my notebook. I really appreciate any feedback and suggestions. And if you find it helpful, please vote.

# Loading the data

In [None]:
# Read the competition's train and test CSVs, then preview the training frame.
train_csv_path = '../input/tabular-playground-series-apr-2021/train.csv'
test_csv_path = '../input/tabular-playground-series-apr-2021/test.csv'
df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)
df_train

In [None]:
# Preview the test frame.
df_test

In [None]:
# Separate the target column from the predictor columns of the training frame.
Y_train = df_train.loc[:, "Survived"]
X_train = df_train.drop("Survived", axis=1)

In [None]:
# Stack the train features on top of the test frame, with a fresh 0..n-1 index,
# so the cleaning steps below are applied to both at once.
df_joined = pd.concat([X_train, df_test], axis=0).reset_index(drop=True)
df_joined

# EDA

In [None]:
# Print column dtypes and non-null counts for the combined frame.
df_joined.info()

In [None]:
# The combined frame contains missing values; tally them per column.
df_joined.isna().sum()

**Let's explore the numerical features, i.e. Age and Fare, to see if we need to drop them or impute them**
### Fare

In [None]:
# Fare: distribution (histogram + KDE) on the left, boxplot on the right.
fig, axs = plt.subplots(ncols=2, figsize=(17, 4))

# sns.distplot is deprecated; histplot with a density-normalised histogram and a
# KDE overlay is its documented replacement.
sns.histplot(x=df_joined['Fare'], kde=True, stat='density', ax=axs[0])
sns.boxplot(x='Fare', data=df_joined, color='red', ax=axs[1])

# Same axis cosmetics on both panels.
for ax in axs:
    ax.set_ylabel('')
    ax.set_xlabel('Fare')
    ax.tick_params(axis='x', labelsize=10)
    ax.tick_params(axis='y', labelsize=10)

axs[0].set_title('Distribution (Fare)', fontsize=13)
axs[1].set_title('Boxplot For Fare', fontsize=13)
plt.show()

As the boxplot shows, anything greater than 100 looks like an outlier, and the distribution graph shows that the data is right-skewed (a long tail towards high fares). Because the mean is sensitive to such outliers, we will impute the NaNs with the median.

**Let's look over the Age feature**


### Age

In [None]:
# Age: distribution (histogram + KDE) on the left, boxplot on the right.
fig, axs = plt.subplots(ncols=2, figsize=(17, 4))

# sns.distplot is deprecated; histplot with a density-normalised histogram and a
# KDE overlay is its documented replacement.
sns.histplot(x=df_joined['Age'], kde=True, stat='density', ax=axs[0])
sns.boxplot(x='Age', data=df_joined, color='red', ax=axs[1])

# Same axis cosmetics on both panels.
for ax in axs:
    ax.set_ylabel('')
    ax.set_xlabel('Age')
    ax.tick_params(axis='x', labelsize=10)
    ax.tick_params(axis='y', labelsize=10)

axs[0].set_title('Distribution (Age)', fontsize=13)
axs[1].set_title('Boxplot For Age', fontsize=13)
plt.show()

The boxplot shows that our data has no outliers and the distribution looks approximately normal, so we'll impute the NaNs with the mean.

#### **Now we know how to impute Age and Fare, So let's move on further**

### Embarked

In [None]:
# Before encoding Embarked numerically, inspect its distinct values.
df_joined['Embarked'].unique()

In [None]:
# Replace the port letters with integer codes; values that are not keys of the
# mapping (including missing ones) come out as NaN.
dic_embarked = dict(S=1, C=2, Q=3)
df_joined['Embarked'] = df_joined['Embarked'].map(dic_embarked)

In [None]:
# Inspect the distinct Embarked values again, now after the mapping.
df_joined['Embarked'].unique()

In [None]:
# Embarked (integer-coded): distribution on the left, boxplot on the right.
fig, axs = plt.subplots(ncols=2, figsize=(17, 4))

# sns.distplot is deprecated; histplot with a density-normalised histogram and a
# KDE overlay is its documented replacement.
sns.histplot(x=df_joined['Embarked'], kde=True, stat='density', ax=axs[0])
sns.boxplot(x='Embarked', data=df_joined, color='red', ax=axs[1])

# Same axis cosmetics on both panels.
for ax in axs:
    ax.set_ylabel('')
    ax.set_xlabel('Embarked')
    ax.tick_params(axis='x', labelsize=10)
    ax.tick_params(axis='y', labelsize=10)

axs[0].set_title('Distribution (Embarked)', fontsize=13)
axs[1].set_title('Boxplot For Embarked', fontsize=13)
plt.show()

We can see in the boxplot that the majority of our data belongs to classes S (1) and C (2), and from the distribution graph that most of it belongs to S (1).
Thus we can use the KNN imputer to replace the NaNs.

### Let's Impute the values so far

In [None]:
# Fill the gaps: KNNImputer for Embarked, column mean for Age, column median for Fare.
# NOTE(review): the imputer is given only the Embarked column itself, so it has no
# other features to find neighbours with; the filled value is presumably a
# non-integer code rather than one of 1/2/3 — confirm.
imputer = KNNImputer()
embarked_filled = imputer.fit_transform(df_joined[['Embarked']].to_numpy())
df_joined['Embarked'] = pd.Series(embarked_filled.ravel())
df_joined['Age'] = df_joined['Age'].fillna(df_joined['Age'].mean())
df_joined['Fare'] = df_joined['Fare'].fillna(df_joined['Fare'].median())

In [None]:
# Re-check missing values after imputation: absolute counts and share of all rows.
missing_count = df_joined.isna().sum()
percent_missing = 100 * missing_count / len(df_joined)
print(f"Missing Count \n\n{missing_count}\n\nPercentage Missing\n\n{percent_missing}")

As we can see from the percentage table, approximately 70% of the Cabin data is missing. Thus, we'll drop the Cabin column for now and come back to it later to see whether it affects our model score.
Also, approximately 5% of the Ticket data is missing, which is 9804 examples out of 200000, thus dropping them will not adversely affect our data size.

In [None]:
# Remove the Cabin column from the combined frame.
df_joined = df_joined.drop(columns=['Cabin'])

In [None]:
# Encode Sex numerically, then split the combined frame back into its train and
# test parts.

# Mapping the Sex attribute
dic_sex = {'male': 0, 'female': 1}
df_joined['Sex'] = df_joined['Sex'].map(dic_sex)

# df_joined was built as [train rows, test rows], so split by row position rather
# than relying on PassengerId values happening to equal row numbers.
# .copy() yields independent frames, so the column drops and scaling applied later
# do not operate on slices of df_joined.
n_train_rows = df_train.shape[0]
X_train = df_joined.iloc[:n_train_rows].copy()
X_test = df_joined.iloc[n_train_rows:].copy()


In [None]:
# X_train.dropna(inplace=True)
# X_test.dropna(inplace=True)

In [None]:
# # Seperating X_train and Y_train
# Y_train=X_train.iloc[:,-1]
# X_train=X_train.iloc[:,:-1]

In [None]:
# Report the remaining per-column missing counts for the train and test frames.
train_missing = X_train.isnull().sum()
test_missing = X_test.isnull().sum()
print(f"Missing Count  \n\n{train_missing}\n\nMissing Count in X_test\n\n{test_missing}")


#### **Now, we have removed all the NaN's from our data and since we are working on a supervised ML problem we should also look at the relationship between the dependent variable and independent variable.**

Before that, we'll drop the Name and Ticket features as they are a bit random. But we'll come back to them to see if they add anything to the model performance.

In [None]:
# Drop the free-text Name and Ticket columns from both frames.
text_columns = ['Name', 'Ticket']
X_train = X_train.drop(columns=text_columns)
X_test = X_test.drop(columns=text_columns)

In [None]:
# Bar chart of each feature's correlation with the Survived target.
target_corr = X_train.corrwith(Y_train)
target_corr.plot.bar(
    figsize=(15, 10),
    title="Correlation with response variable",
    fontsize=15,
    rot=90,
    color='red',
    grid=True,
)

We can see that PassengerId , SibSp , Parch aren't that much correlated with the target variable. So what we'll do is we'll drop the PassengerId, SibSp , Parch. 

In [None]:
# Drop the columns judged to be weakly correlated with the target.
weak_columns = ['SibSp', 'Parch', 'PassengerId']
X_train = X_train.drop(columns=weak_columns)
X_test = X_test.drop(columns=weak_columns)

In [None]:
# Same correlation chart, now restricted to the features that were kept.
target_corr = X_train.corrwith(Y_train)
target_corr.plot.bar(
    figsize=(15, 10),
    title="Correlation with response variable",
    fontsize=15,
    rot=90,
    color='red',
    grid=True,
)

In [None]:
# Give both frames a fresh 0..n-1 index, discarding the old labels.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

In [None]:
def stand_scaler(df):
    scaler=StandardScaler()
    scaled_value=scaler.fit_transform(df[['Age','Fare']].values)
    df[['Age','Fare']]=pd.DataFrame(scaled_value,index=df[['Age','Fare']].index,columns=df[['Age','Fare']].columns)

In [None]:
stand_scaler(X_train)
stand_scaler(X_test)

In [None]:
def encoder(df):
    transformer=ColumnTransformer([('encoder',OneHotEncoder(),[0,1,4])],remainder='passthrough')
    df=np.array(transformer.fit_transform(df))
    return df

In [None]:
X_train=encoder(X_train)
X_test=encoder(X_test)

In [None]:
# Display the encoded test array.
X_test

# Model Implementation

In [None]:
# Hold out 20% of the labelled data for validation. The fixed random_state keeps
# the split, and every score computed from it, identical across notebook re-runs.
x_train, x_test, y_train, y_test = train_test_split(
    X_train, Y_train, test_size=0.2, random_state=42
)

In [None]:
# Logistic regression, scored on the held-out split.
lr = LogisticRegression().fit(x_train, y_train)
y_pred = lr.predict(x_test)
print(classification_report(y_test, y_pred))

In [None]:
# Random forest, scored on the held-out split. The fixed random_state makes the
# forest, and therefore the reported metrics, reproducible across runs.
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(x_train, y_train)
y_pred = random_forest.predict(x_test)
print(classification_report(y_test, y_pred))

In [None]:
# 5-nearest-neighbours classifier, scored on the held-out split.
knn = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
y_pred = knn.predict(x_test)
print(classification_report(y_test, y_pred))

In [None]:
# Stack four base learners under a logistic-regression meta-model and score the
# ensemble on the held-out split.
# The tree-based learners get a fixed random_state so the result is reproducible.
# The forest is declared here rather than taken from an earlier cell, which keeps
# this cell self-contained.
estimators = [
    ('knn', KNeighborsClassifier()),
    ('lr', LogisticRegression()),
    ('dtr', DecisionTreeClassifier(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42)),
]
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))

In [None]:
# It turns out that stacking the classifiers performed much better.

## Submitting the Predictions

In [None]:
# Predict on the competition test set with the logistic-regression model and
# write the submission file.
# NOTE(review): the stacking classifier is reported above as the better model, yet
# `lr` is used here — confirm this is intentional.
result = lr.predict(X_test)
submission = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Survived': result})
submission.to_csv("my_submission2.csv", index=False)

In [None]:
# Recompute the logistic-regression predictions for the test set and display them.
result = lr.predict(X_test)
result

In [None]:
# Shape of the prediction array.
result.shape