In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling as pp

# EDA

In [None]:
train = pd.read_csv("../input/titanic/train.csv")
test = pd.read_csv("../input/titanic/test.csv")

In [None]:
train.head()

In [None]:
train.shape

In [None]:
train.info()

Here we can observe that the Age and Cabin attributes have a lot of null values, whereas Embarked has only 2 null values.

In [None]:
train.describe()

In [None]:
train.describe(include='O')

The above command runs describe on categorical features

In [None]:
train.isnull().sum()

 So it can be observed that Age has 177 null values, Cabin has 687 and Embarked has 2.

### Analysing test data as well

In [None]:
test.head()

In [None]:
test.shape

So test has 418 rows and 11 columns and train has 891 rows 
Therefore total 1309 rows

In [None]:
test.info()

In [None]:
test.isnull().sum()

So it can be observed that in test data Age has 86 missing values whereas Cabin has 327

In [None]:
#Printing percentage of passengers survived
survived = train[train['Survived']==1]
non_survived = train[train['Survived']==0]

print(f"survived:{len(survived)/len(train['Survived'])*100:.2f}%"'\n'
     f"not-survived:{len(non_survived)/len(train['Survived'])*100:.2f}%")

From the above cell it can be seen that approximately 1/3 of the passengers in the training dataset survived

# Feature and Survival relations

## Sex vs Survived

In [None]:
#Checking percentage of females and males who survived

train.groupby('Sex')['Survived'].mean()*100

In [None]:
pd.crosstab(train['Sex'],train['Survived'])

### So it is observed that females have a higher rate of survival than males

In [None]:
sns.barplot(x='Sex',y='Survived',data=train,palette='plasma')

## Pclass vs Survived

In [None]:
train['Pclass'].value_counts()

In [None]:
pd.crosstab(train['Pclass'],train['Survived'])

In [None]:
train.groupby('Pclass')['Survived'].mean()*100

In [None]:
sns.barplot(x=train['Pclass'],y=train['Survived'],palette='plasma')

### It can be observed that passengers who travelled in Class 1 had higher rate of survival  as against passengers who travelled by other two classes

## Pclass,Sex vs Survived

In [None]:
sns.barplot(x='Pclass',y='Survived',hue='Sex',data=train)

### Here it can be observed that 
1. Females in class 1 and 2 have a very high rate of survival
2. Females in class 3 have a good rate of survival
3. Males in class 1 and 2 have less rate of survival
4. Males in class 3 have the least rate of survival

In [None]:
# factorplot was renamed to catplot (and later removed from seaborn);
# kind='point' reproduces factorplot's default point plot.
sns.catplot(x='Sex',y='Survived',hue='Pclass',kind='point',height=4,aspect=2,data=train)

1.Females in 1 and 2 class have nearly 100% chance at survival
2. Males in 3 class have less than 10% chance at survival

In [None]:
# factorplot was renamed to catplot (and later removed from seaborn);
# kind='point' reproduces factorplot's default point plot, one panel per port.
sns.catplot(x='Pclass',y='Survived',hue='Sex',col='Embarked',kind='point',data=train)

## Embarked vs Survival

In [None]:
pd.crosstab(train['Embarked'],train['Survived'])

In [None]:
# Survivor counts followed by survival rates (%) per embarkation port. The newline
# keeps the two Series printouts from running together on one line.
by_port = train.groupby('Embarked')['Survived']
print(f"{by_port.sum()}\n{by_port.mean()*100}")

It can be observed that 
1. Passengers who embarked from port C had highest rate of survival
2. But highest number of passengers (217) survived who boarded from port S
3. So port S had the highest number of passengers boarding

In [None]:
sns.barplot(x='Embarked',y='Survived',palette='plasma',data=train)

## Parch vs Survival

In [None]:
train['Parch'].value_counts()

In [None]:
train.groupby('Parch')['Survived'].mean()*100

#### So it can be observed that passengers with Parch between 1-3 had the highest rate of survival

In [None]:
sns.barplot(x='Parch',y='Survived',data=train)

## SibSp vs Survival

In [None]:
train['SibSp'].value_counts()

In [None]:
train.groupby('SibSp')['Survived'].mean()*100

#### Passengers with SibSp 1 or 2 had high chances of survival

## Age vs Survival

In [None]:
# One split violin plot of Age by survival per grouping column, side by side.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

for group_col, axis in zip(("Embarked", "Pclass", "Sex"), (ax1, ax2, ax3)):
    sns.violinplot(x=group_col, y="Age", hue="Survived", data=train, split=True, ax=axis)


### From Pclass violinplot, we can see that:

1. 1st Pclass has very few children as compared to other two classes.
2. 1st Pclass has more old people as compared to other two classes.
3. Almost all children (between age 0 to 10) of 2nd Pclass survived.
4. Most children of 3rd Pclass survived.
5. Younger people of 1st Pclass survived as compared to its older people.
### From Sex violinplot, we can see that:

1. Most male children (between age 0 to 14) survived.
2. Females with age between 18 to 40 have better survival chance.

In [None]:
plt.figure(figsize=(15,8))
# Correlate numeric columns only: the frame still holds string columns (Name, Sex,
# Ticket, ...) and recent pandas raises instead of silently skipping them.
numeric_features = train.drop('PassengerId',axis=1).select_dtypes(include='number')
sns.heatmap(numeric_features.corr(),annot=True,cmap='coolwarm')

1. A Positive correlation means as one quantity increases other increases too and vice-versa.
2. A Negative correlation means as one quantity decreases other increases and vice-versa.

## Feature Extraction and Feature Engineering

In [None]:
# Keep both frames in one list so shared preprocessing steps can loop over them.
# The list stores references to the current objects: a later `train = ...` or
# `test = ...` rebinding creates a new frame that this list will not see.
train_test_dataset = [train,test]
for data in train_test_dataset:
    print(data.shape)

### Name feature

In [None]:
train['Name']

In [None]:
def name_extract(col):
    """Return the title portion of a passenger name string.

    Takes the text before the first period and returns its second
    comma-separated field, e.g. ``"Braund, Mr. Owen Harris"`` gives ``" Mr"``.
    The leading space is kept as-is. Raises IndexError when the text before
    the first period contains no comma.
    """
    before_period, _, _ = col.partition('.')
    return before_period.split(',')[1]

In [None]:
for data in train_test_dataset:
    data['Title']=data['Name'].apply(name_extract)

In [None]:
train.head()

New column title has been added

In [None]:
pd.crosstab(train['Title'],train['Sex'])

The DataFrame above shows all the titles and their counts. We can now replace some of the less frequent titles with
the title "Other".

In [None]:
def name2(cols):
    """Collapse a raw title string into one of the main title groups.

    Surrounding whitespace is stripped first. Rare titles map to "Other",
    'Mlle' and 'Ms' map to "Miss", 'Mme' maps to 'Mrs', and any other title
    is returned stripped but otherwise unchanged.
    """
    title = cols.strip()
    rare_titles = {'Dr', 'Rev', 'Major', 'Col', 'Capt', 'Don', 'Jonkheer',
                   'Sir', 'Lady', 'the Countess', 'Dona'}
    regrouped = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
    if title in rare_titles:
        return "Other"
    return regrouped.get(title, title)

In [None]:
for data in train_test_dataset:
    data['Title'] = data['Title'].apply(name2)

In [None]:
train['Title'].value_counts()

In [None]:
train.groupby('Title')['Survived'].mean()*100

Now lets convert the categorical values into numerical according to the rate of survival in descending order

In [None]:
# Ordinal codes 1-5 for the title groups; any title missing from the map becomes 0.
title_map = dict(zip(['Mrs', 'Miss', 'Master', 'Other', 'Mr'], range(1, 6)))
for data in train_test_dataset:
    data['Title'] = data['Title'].map(title_map).fillna(0)

In [None]:
train['Title']

So our categorical variables have been successfully mapped

### Sex feature

In [None]:
sex = pd.get_dummies(train['Sex'],drop_first=True)

In [None]:
# Add the dummy column(s) to the existing frame instead of rebinding `train` to a
# pd.concat copy: `train_test_dataset` keeps pointing at this same object, and
# re-running the cell overwrites the column rather than appending a duplicate.
for dummy_col in sex.columns:
    train[dummy_col] = sex[dummy_col]

In [None]:
train.head()

In [None]:
sex = pd.get_dummies(test['Sex'],drop_first=True)
# Assign in place rather than rebinding `test` to a pd.concat copy, so the frame
# held in `train_test_dataset` is the one that receives the column, and re-running
# the cell does not append a duplicate column.
for dummy_col in sex.columns:
    test[dummy_col] = sex[dummy_col]

In [None]:
test.head()

### Embarked Feature

In [None]:
for data in train_test_dataset:
    print(data['Embarked'].isnull().sum())

As there are 2 null values in Embarked we need to fill them before getting the dummies

In [None]:
train['Embarked'].mode()

As 'S' is the most embarked port we will fill nan with 'S'

In [None]:
# Assign the filled Series back to the column. fillna(inplace=True) on a column
# selection is chained assignment: newer pandas warns about it, and under
# Copy-on-Write it leaves `train` unchanged.
train['Embarked'] = train['Embarked'].fillna('S')

In [None]:
train['Embarked'].isnull().sum()

In [None]:
embarked = pd.get_dummies(train['Embarked'],drop_first=True)
# Assign in place rather than rebinding `train` to a pd.concat copy, so the frame
# held in `train_test_dataset` is the one that receives the columns, and re-running
# the cell does not append duplicate columns.
for dummy_col in embarked.columns:
    train[dummy_col] = embarked[dummy_col]
train.head()

In [None]:
embarked = pd.get_dummies(test['Embarked'],drop_first=True)
# Assign in place rather than rebinding `test` to a pd.concat copy; re-running the
# cell then overwrites the columns instead of duplicating them.
for dummy_col in embarked.columns:
    test[dummy_col] = embarked[dummy_col]
# Rebuild the shared list so the loops below operate on the current `train` and
# `test` objects even if either name was rebound to a new frame since the list
# was first created.
train_test_dataset = [train, test]
test.head()

### Age Feature

In [None]:
sns.boxplot(x=train['Pclass'],y=train['Age'])

### From the above box plot it can be observed that
1. For Pclass 1 median age is 37.
2. For Pclass 2 median age is 29.
3. For Pclass 3 median age is 24.

In [None]:
def age_impute(cols):
    """Fill a missing age with a fixed per-class median age.

    Parameters
    ----------
    cols : sequence or pandas.Series
        Two values in the order (Pclass, Age), e.g. a row of
        ``df[['Pclass', 'Age']]`` as passed by ``DataFrame.apply(axis=1)``.

    Returns
    -------
    The original age when it is not null; otherwise 37, 29 or 24 for
    class 1, 2 or 3. A null age with any other class value is returned
    unchanged.
    """
    # Read by position explicitly. Plain cols[0] on a Series with string labels
    # relies on pandas' deprecated integer-as-position fallback.
    values = cols.iloc if hasattr(cols, 'iloc') else cols
    pclass, age = values[0], values[1]
    if not pd.isnull(age):
        return age
    class_median_age = {1: 37, 2: 29, 3: 24}
    return class_median_age.get(pclass, age)
        

In [None]:
sns.heatmap(train.isnull(),cbar=False,yticklabels=False)

In [None]:
train['Age'] = train[['Pclass','Age']].apply(age_impute,axis=1)

In [None]:
test['Age'] = test[['Pclass','Age']].apply(age_impute,axis=1)

In [None]:
for data in train_test_dataset:
    print(data['Age'].isnull().sum())

So here we can observe that we have completely removed all the nan values

In [None]:
train['AgeBand']=pd.cut(train['Age'],5)
test['AgeBand']=pd.cut(test['Age'],5)

In [None]:
train.head()

Now using the pandas cut function we have created a new column AgeBand that divides age into 5 ranges

In [None]:
train.groupby('AgeBand')['Survived'].mean()*100

Now we have to map age according to ageband

In [None]:
# Replace Age with an ordinal band code using right-closed bands:
# <=16 -> 0, (16, 32] -> 1, (32, 48] -> 2, (48, 64] -> 3, >64 -> 4.
age_edges = [-np.inf, 16, 32, 48, 64, np.inf]
for dataset in train_test_dataset:
    dataset['Age'] = pd.cut(dataset['Age'], bins=age_edges, labels=False).astype(float)

Here we have labeled age by age band in ascending order of age (0 = youngest band, 4 = oldest band); the codes follow the band boundaries, not the survival rate.

In [None]:
train.head()

## Fare Feature

First lets replace nan values

In [None]:
for data in train_test_dataset:
    print(data['Fare'].isnull().sum())
    # Fill each frame's missing fares with that frame's own median. The fill must
    # use the loop variable `data` (not a variable left over from an earlier loop)
    # and is assigned back instead of using chained inplace fillna.
    data['Fare'] = data['Fare'].fillna(data['Fare'].median())

In [None]:
train['FareBand'] = pd.qcut(train['Fare'], 4)
train.groupby('FareBand')['Survived'].mean()*100

It can be noted that the survival chance increases with a higher fare

In [None]:
# Replace Fare with an ordinal band code using right-closed bands:
# <=7.91 -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2, >31 -> 3.
fare_edges = [-np.inf, 7.91, 14.454, 31, np.inf]
for dataset in train_test_dataset:
    dataset['Fare'] = pd.cut(dataset['Fare'], bins=fare_edges, labels=False).astype(int)

In [None]:
train.head()

### SibSp and Parch feature

creating a feature FamilySize

In [None]:
for dataset in train_test_dataset:
    dataset['FamilySize'] = dataset['SibSp'] +  dataset['Parch'] + 1

In [None]:
train.groupby('FamilySize')['Survived'].mean()*100

### From above we can observe that:
1. FamilySize 2-4 have the highest chances of survival
2. And extreme family sizes like 8-11 have nearly no chance of survival

Now lets create another feature to check if a passenger was travelling alone

In [None]:
# Alone is 1 for passengers whose FamilySize is exactly 1, otherwise 0.
for data in train_test_dataset:
    data['Alone'] = (data['FamilySize'] == 1).astype('int64')
train.groupby('Alone')['Survived'].mean()*100

So passengers travelling alone have only 30% of survival chances

In [None]:
train.head()

## Feature Selection

In [None]:
feature_drop = ['Name','Sex','AgeBand','Ticket','Cabin','Embarked','SibSp','Parch']
for data in train_test_dataset:
    data.drop(feature_drop,axis=1,inplace=True)

In [None]:
train.drop('FareBand',axis=1,inplace=True)

In [None]:
test.head()

In [None]:
train.head()

## Now we are done with feature engineering and feature selection

## Training and Selecting Model for classification

In [None]:
# Features exclude the identifier and FamilySize; the target is Survived.
target_col = 'Survived'
non_feature_cols = ['PassengerId', 'FamilySize']

X = train.drop(columns=non_feature_cols + [target_col])
y = train[target_col]
X_TEST = test.drop(columns=non_feature_cols)

In [None]:
X

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=101)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV

GridSearchCV will be used for hyper parameter optimization


In [None]:
# Candidate estimators and the hyper-parameter grid searched for each one.
model_params = dict(
    RandomForestClassifier=dict(
        model=RandomForestClassifier(),
        parameters=dict(n_estimators=[1, 5, 10, 30, 50, 100, 200, 300]),
    ),
    LogisticRegression=dict(
        model=LogisticRegression(),
        parameters=dict(solver=['liblinear', 'sag', 'saga']),
    ),
    SVC=dict(
        model=SVC(gamma='auto'),
        parameters=dict(kernel=['rbf'], C=[1, 2, 5, 10, 20, 50, 80, 100]),
    ),
    DecisionTreeClassifier=dict(
        model=DecisionTreeClassifier(),
        parameters=dict(criterion=['gini', 'entropy']),
    ),
    KNN=dict(
        model=KNeighborsClassifier(),
        parameters=dict(n_neighbors=[1, 5, 10, 20, 30, 50, 100]),
    ),
)

First lets check Logistic regression before optimizing the other classifiers

In [None]:
lr = LogisticRegression()

In [None]:
lr.fit(X_train,y_train)

In [None]:
pred_lr = lr.predict(X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
print(classification_report(y_test,pred_lr))

So it can be observed that we get 79% accuracy using Logistic Regression

Now moving on to hyper parameter optimization for model_params

### DecisionTreeClassifier

In [None]:
# 5-fold grid search over the decision-tree grid, then predict the hold-out split.
dt_param_grid = model_params['DecisionTreeClassifier']['parameters']
grid = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=dt_param_grid,
    cv=5,
    return_train_score=False,
)
grid.fit(X_train, y_train)
pred_dc = grid.predict(X_test)

In [None]:
print(classification_report(y_test,pred_dc))

RandomForestClassifier

In [None]:
# 5-fold grid search over the random-forest grid, then predict the hold-out split.
rf_param_grid = model_params['RandomForestClassifier']['parameters']
grid = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=rf_param_grid,
    cv=5,
    return_train_score=False,
)
grid.fit(X_train, y_train)
pred_rfc = grid.predict(X_test)

In [None]:
print(classification_report(y_test,pred_rfc))

Support-Vector-Machine

In [None]:
# 5-fold grid search over the SVC grid, then predict the hold-out split.
svc_param_grid = model_params['SVC']['parameters']
grid = GridSearchCV(
    estimator=SVC(),
    param_grid=svc_param_grid,
    cv=5,
    return_train_score=False,
)
grid.fit(X_train, y_train)
pred_svm = grid.predict(X_test)

In [None]:
print(classification_report(y_test,pred_svm))

MultinomialNB

In [None]:
mn = MultinomialNB()
mn.fit(X_train,y_train)
print(classification_report(y_test,mn.predict(X_test)))

In [None]:
# 5-fold grid search over the KNN grid, then report on the hold-out split.
knn_param_grid = model_params['KNN']['parameters']
grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_param_grid,
    cv=5,
    return_train_score=False,
)
grid.fit(X_train, y_train)
pred_knn = grid.predict(X_test)
print(classification_report(y_test, pred_knn))

### From above it can be observed that SVM gave the best prediction results

## Fitting training dataset

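Note: the cell below must operate on the SVM grid search. The KNN cell above reuses the name `grid`, so make sure `grid` refers to the SVC search (re-run the SVM cell or rebuild the search) before fitting on the full training set.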

In [None]:
# Rebuild the SVC search explicitly: `grid` was last assigned by the KNN cell, so
# refitting it as-is would train KNN while the predictions are labelled as SVM.
grid = GridSearchCV(SVC(),model_params['SVC']['parameters'],cv=5,return_train_score=False)
grid.fit(X,y)
grid.best_params_

In [None]:
pred_svm_test = grid.predict(X_TEST)

In [None]:
df = pd.DataFrame()

In [None]:
df['PassengerId']=test['PassengerId']
df['Survived']=pred_svm_test

In [None]:
df.set_index('PassengerId',inplace=True)

In [None]:
#df.to_csv('submission1.csv')

## Deep Learning Approach

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Activation
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
early_stop = EarlyStopping(mode='min',monitor='val_loss',verbose=1,patience=25)

In [None]:
# Three 8-unit sigmoid hidden layers feeding a single sigmoid output unit.
hidden_layers = [Dense(units=8, activation='sigmoid') for _ in range(3)]
output_layer = Dense(1, activation='sigmoid')

model = Sequential(hidden_layers + [output_layer])
model.compile(optimizer='adam', loss='binary_crossentropy')

In [None]:
# Keras documents `callbacks` as a list of callbacks, so wrap the single callback.
# Features are cast to float32 so that a frame mixing bool/int/float columns does
# not reach Keras as an object-dtype array.
model.fit(x=X_train.values.astype('float32'),y=y_train.values,
          validation_data=(X_test.values.astype('float32'),y_test.values),
          callbacks=[early_stop],epochs=700,verbose=1)

In [None]:
pd.DataFrame(model.history.history).plot()

In [None]:
# No validation data is passed in this fit, so `val_loss` is never reported and a
# callback monitoring it cannot trigger; monitor the training loss instead.
# NOTE(review): this continues training on all of X, which includes the X_test
# split, so metrics computed on X_test after this cell are optimistic.
final_early_stop = EarlyStopping(mode='min',monitor='loss',verbose=1,patience=25)
model.fit(x=X.values.astype('float32'),y=y.values,epochs=400,callbacks=[final_early_stop],verbose=1)

In [None]:
# Sequential.predict_classes was removed from tf.keras; threshold the sigmoid
# output at 0.5 instead and flatten it to a 1-D label vector.
pred = (model.predict(X_test.values.astype('float32')) > 0.5).astype('int32').ravel()

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
print(classification_report(y_test,pred))

In [None]:
confusion_matrix(y_test,pred)

In [None]:
# Sequential.predict_classes was removed from tf.keras; threshold the sigmoid
# output at 0.5 instead and flatten it to a 1-D label vector.
pred = (model.predict(X_TEST.values.astype('float32')) > 0.5).astype('int32').ravel()

In [None]:
df = pd.DataFrame()

In [None]:
df['PassengerId']=test['PassengerId']
df['Survived']=pred

In [None]:
df.set_index('PassengerId',inplace=True)

In [None]:
df.to_csv('submission13.csv')