In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# ANN
from keras.models import Sequential
from keras.layers import Dense
from keras.initializers import glorot_uniform
from keras.layers import LeakyReLU
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Read the two Kaggle Titanic splits.
df_train = pd.read_csv('/kaggle/input/titanic/train.csv')
df_test = pd.read_csv('/kaggle/input/titanic/test.csv')

# Stack the training rows on top of the test rows under a fresh 0..n-1 index,
# so every cleaning / feature-engineering step below is applied to both
# splits in exactly the same way.
df = pd.concat((df_train, df_test), ignore_index=True)

In [None]:
# Shape of the training split as (rows, columns).
df_train.shape

In [None]:
# Shape of the test split as (rows, columns).
df_test.shape

In [None]:
# Column labels of the training split.
df_train.columns

In [None]:
# Preview the combined train + test frame.
df

In [None]:
# Visualise missing values in the raw training split: one row per passenger,
# one column per feature, highlighted cells are NaN.
missing_mask = df_train.isnull()
sns.heatmap(missing_mask, yticklabels=False, cbar=False, cmap='viridis')

## **Data Preprocessing**

Here we perform data cleaning and finding relation of these parameters with "survival"

### **Pclass (Passenger Class)**

Ticket class

1st = Upper<br>
2nd = Middle<br>
3rd = Lower<br>

In [None]:
# Number of missing (NaN) entries in 'Pclass'; 0 means the column is clean.
df['Pclass'].isna().sum()

In [None]:
# Rows per Pclass that have a known 'Survived' value (count() skips NaN).
df.groupby('Pclass', as_index=False)['Survived'].count()

In [None]:
# Number of survivors (Survived == 1) in each Pclass.
survivors = df[df['Survived'] == 1]
survivors.groupby('Pclass', as_index=False)['Survived'].count()

In [None]:
# Mean of 'Survived' (i.e. the survival rate) for each Pclass value.
# No as_index=False here, so Pclass becomes the index of the result.
df.groupby('Pclass')[['Survived']].mean()

**`df[['Pclass', 'Survived']]`** - selects only the "Pclass" and "Survived" columns from the DataFrame **`df`**.<br>
**`groupby(['Pclass'], as_index=True)`** - groups the selected columns by the unique values of "Pclass". With `as_index=True` (used in the cell above) the Pclass values become the index of the result; with `as_index=False` (used in the earlier count cells) the result keeps a flat structure in which Pclass stays a regular column and a separate default index is used.<br>
**`.mean()`** - calculates the mean value of the "Survived" column for each group.<br>

**NOTE -** The survival of a passenger depends on the class booked. Most passengers booked Pclass=3, yet that class has the lowest mean survival; the mean survival is highest for Pclass=1.

**We can see that a higher class (lower value) has a higher survival rate. This should be a very useful feature.**

### **Name**

In [None]:
# Raw passenger names.
df.Name

**Each name has a title, which contains information of gender or status**

In [None]:
def _extract_title(full_name):
    """Return the text between the first comma and the following period, stripped."""
    after_comma = full_name.split(',')[1]
    return after_comma.split('.')[0].strip()


# New 'Title' column derived from each passenger's name.
df['Title'] = df['Name'].map(_extract_title)

The 'Title' column is derived from the 'Name' column using string manipulation techniques.

**`.map( lambda x: x.split(',')[1].split( '.' )[0].strip())`** - applies a lambda function to each value in the 'Name' column, extracting the title information and assigning it to the corresponding row in the 'Title' column.

**`x.split(',')`** - splits the 'Name' value by comma, resulting in a list of two parts.<br>
**`x.split(',')[1]`** - selects the second part of the list, which contains the First name and title.<br>
**`.split('.')`** - further splits the selected part by period, resulting in a list of parts.<br>
**`.split('.')[0]`** - selects the first part of the list, which represents the title.<br>
**`.strip()`** - removes any leading or trailing whitespaces from the extracted title.<br>



In [None]:
# Number of rows per extracted title.
df['Title'].value_counts()

**We can observe that most of the titles are "Mr", "Miss", "Mrs", "Master", "Rev", "Dr" and rest shall be considered as "Others".**

In [None]:
# Every title outside this list is merged into the catch-all 'Others'.
common_titles = ['Master', 'Mr', 'Miss', 'Mrs', 'Rev', 'Dr']
is_rare_title = ~df['Title'].isin(common_titles)
df.loc[is_rare_title, 'Title'] = 'Others'

In [None]:
# Survival rate per title.
df.groupby('Title', as_index=False)['Survived'].mean()

NOTE - We can remove "Rev"

In [None]:
# Same merge as before, but 'Rev' is no longer kept as its own category.
kept_titles = ['Master', 'Mr', 'Miss', 'Mrs', 'Dr']
is_merged_title = ~df['Title'].isin(kept_titles)
df.loc[is_merged_title, 'Title'] = 'Others'

In [None]:
# Survival rate per title after folding 'Rev' into 'Others'.
df.groupby('Title', as_index=False)['Survived'].mean()

In [None]:
# Title frequencies after merging the rare titles into 'Others'.
df['Title'].value_counts()

In [None]:
# One indicator column per Title value, appended column-wise; the raw 'Name'
# column is dropped afterwards because the title is all that was needed from it.
title_dummies = pd.get_dummies(df['Title'])
df = pd.concat([df, title_dummies], axis=1)
df = df.drop(columns=['Name'])

This cell performs two operations on the DataFrame df: it uses pd.get_dummies() together with pd.concat() to add dummy variables for the 'Title' column, and then drops the 'Name' column from the DataFrame.

**`pd.get_dummies(df['Title'])`** - creates dummy variables for the 'Title' column, which converts categorical values into binary columns.<br>
**`pd.concat([df, pd.get_dummies(df['Title'])], axis=1)`** - concatenates the original DataFrame df with the dummy variables DataFrame along the columns (axis=1).<br>
**`.drop(labels=['Name'], axis=1)`** - drops the 'Name' column from the concatenated DataFrame.

**Why do we need dummy variables?**

Data will sometimes come in string format and this information has to somehow be incorporated into the model training. As, ML models only understand numerical data so we need to convert those string values to numerical data.
Dummy variables are numerical variables that represent the actual data. For instance, given male and female gender, you could use 0 to represent the males and 1 for the females. This gives an actual representation of the information, and the information is in a numerical format which can be integrated into the machine learning model.

In [None]:
# Inspect the frame after adding the Title dummy columns and dropping 'Name'.
df

**Dummy Trap** - The Dummy Variable Trap occurs when two or more dummy variables created by one-hot encoding are highly correlated (multi-collinear). This means that one variable can be predicted from the others, making it difficult to interpret predicted coefficient variables in regression models.

**SOLUTION** - To avoid the dummy variable trap we should always keep one fewer (n-1) dummy variables (e.g. via `.drop`) than the total number of categories present in the categorical data (n), because the nth dummy variable is redundant as it carries no new information.

In [None]:
# df = df.drop(['Title','Dr'] , axis = "columns")
# The line above would drop one dummy column ('Dr') to avoid the dummy variable trap.
# It is intentionally left disabled for now: dropping these columns at this point
# would hinder studying 'Title' together with the other features below.

In [None]:
# Inspect the current state of the frame.
df

### **Sex**

In [None]:
# Number of missing values in 'Sex'.
df['Sex'].isna().sum()

In [None]:
# Survival rate per gender.
df.groupby('Sex', as_index=False)['Survived'].mean()

In [None]:
# Encode gender numerically: 'male' -> 0, 'female' -> 1.
sex_codes = {'male': 0, 'female': 1}
df['Sex'] = df['Sex'].map(sex_codes)

In [None]:
# Inspect the frame after encoding 'Sex' numerically.
df

### **Family Relations (SibSp and Parch)**

In [None]:
# Missing-value counts for (SibSp, Parch).
df['SibSp'].isna().sum(), df['Parch'].isna().sum()

In [None]:
# Survival rate per number of siblings/spouses aboard.
df.groupby('SibSp', as_index=False)['Survived'].mean()

In [None]:
# Number of rows per SibSp value.
df['SibSp'].value_counts()

**The average survival chance is higher for passengers with SibSp <= 2. For SibSp = 3 and 4 there are some survivors, but these groups are too small to generalise from, as the number of passengers with SibSp = 2 is double that of SibSp = 3 and 4.<br>
So, in order to maintain generality, we will merge every SibSp > 2 into a single category, 3.**

In [None]:
# Cap SibSp at 3: every value greater than 2 is merged into the single
# category 3, while the values 0, 1 and 2 are left unchanged.
# clip() is the vectorised equivalent of mapping `3 if x > 2 else x`.
df['SibSp'] = df['SibSp'].clip(upper=3)

In [None]:
# Survival rate per SibSp category after capping at 3.
df.groupby('SibSp', as_index=False)['Survived'].mean()

 **Hence, we can conclude that people having a small group of siblings and spouses have a higher chance of survival rather than those who have a larger family.** 

In [None]:
# Survival rate per number of parents/children aboard.
df.groupby('Parch', as_index=False)['Survived'].mean()

In [None]:
# Number of rows per Parch value.
df['Parch'].value_counts()

**As with SibSp, we get the same pattern for Parch: passengers with a group of 2 family members have a higher chance of survival than those with more than 2.**

In [None]:
# Merge every Parch value above 2 into the single category 3.
df['Parch'] = df['Parch'].clip(upper=3)

In [None]:
# Survival rate per Parch category after capping at 3.
df.groupby('Parch', as_index=False)['Survived'].mean()

### **Ticket**

In [None]:
# Number of missing values in 'Ticket'.
df['Ticket'].isna().sum()

In [None]:
# Work on a separate 'tic' column so the original 'Ticket' values stay available.
df['tic'] = df['Ticket']

In [None]:
# Keep only the first character of each ticket as a coarse ticket category.
df['tic'] = df['tic'].map(lambda ticket: ticket[0])

# A cell renders only its last bare expression, so both summaries are sent to
# display() explicitly (display is provided by the IPython kernel); otherwise
# the survival table would be computed and silently discarded.

# Mean survival rate per ticket prefix.
display(df[['tic', 'Survived']].groupby(['tic'], as_index=False).mean())

# Mean fare per ticket prefix.
display(df[['tic', 'Fare']].groupby(['tic'], as_index=False).mean())

In [None]:
# Raw ticket strings.
df.Ticket

In [None]:
# First 30 ticket strings.
df.Ticket.head(30)

**`From above two cell we have two types of tickets 1.) Numbers
                                                 2.) Alphabets + Numbers`**

In [None]:
import re


def _ticket_code(ticket):
    """Reduce a ticket to a short code.

    Returns the last run of ASCII letters found in the ticket string; when the
    ticket contains no letters (or is not a string), returns its first element.
    """
    # Run the regex once per ticket instead of once for the test and once for the value.
    letter_runs = re.findall(r'[A-Za-z]+', ticket) if isinstance(ticket, str) else []
    return letter_runs[-1] if letter_runs else ticket[0]


df['Ticket'] = df['Ticket'].map(_ticket_code)

In [None]:
# Survival rate per ticket code.
df.groupby('Ticket', as_index=False)['Survived'].mean()

In [None]:
# Mean fare per ticket code.
df.groupby('Ticket', as_index=False)['Fare'].mean()

In [None]:
# Number of rows per extracted ticket code.
df['Ticket'].value_counts()

**As we can see most of the tickets bought were 1, 2, 3, PC, A, C rest can be set to "4"**

In [None]:
# Keep the frequent ticket codes and collapse every other code into the
# catch-all category '4'. Selecting by the codes to KEEP (rather than
# enumerating each rare code) guarantees that an unlisted code cannot survive
# as its own category and later turn into an extra dummy column, which would
# change the number of model input features.
frequent_ticket_codes = ['1', '2', '3', 'PC', 'A', 'C']
df.loc[~df['Ticket'].isin(frequent_ticket_codes), 'Ticket'] = '4'

In [None]:
# Survival rate per ticket code after merging the rare codes into '4'.
df.groupby('Ticket', as_index=False)['Survived'].mean()

In [None]:
# Replace 'Ticket' by one indicator column per code, named 'Ticket_<code>'.
df = pd.get_dummies(data=df, columns=['Ticket'], prefix='Ticket')

In [None]:
#df = df.drop(['Ticket_4'] , axis = "columns")
# Left disabled: per the original note, dropping this dummy column right now
# would create an error in the "Fare" section.

In [None]:
# Inspect the frame after one-hot encoding 'Ticket'.
df

### **Fare**

In [None]:
# Number of missing values in 'Fare'.
df['Fare'].isna().sum()

*It is clearly visible that there is a single null value. So, we need to replace it with an appropriate value.*<br>
We need to find appropriate values for those null values for each corresponding category, fare mainly depends upon the type of ticket, cabin, Pclass, Embarkment.

**Locating the Fare NaN value for each Column**

*It is clearly visible that there is a single null value. So, we need to replace it with an appropriate value.*<br>
We need to find appropriate values for those null values for each corresponding category, fare mainly depends upon the type of ticket, cabin, Pclass, Embarkment.

In [None]:
# Ticket prefix ('tic', the first character of the ticket) of the row(s) whose 'Fare' is missing.
df.tic[df.Fare.isnull()]
# The output lists the index label of each such row followed by its 'tic' value,
# which identifies the passenger(s) with a null 'Fare'.

In [None]:
# Passenger class of the row(s) with a missing Fare.
df.loc[df['Fare'].isnull(), 'Pclass']

In [None]:
# Cabin of the row(s) with a missing Fare.
df.loc[df['Fare'].isnull(), 'Cabin']

**We shall ignore the "Cabin" because more than 50% data is missing**

In [None]:
# Port of embarkation of the row(s) with a missing Fare.
df.loc[df['Fare'].isnull(), 'Embarked']

In [None]:
# Fares of passengers comparable to the one with the missing Fare:
# ticket prefix '3', third class, embarked at 'S'.
comparable = (df['tic'] == '3') & (df['Pclass'] == 3) & (df['Embarked'] == 'S')
guess_Fare = df.loc[comparable, 'Fare']
guess_Fare

**We shall consider the median of these values**

In [None]:
# Impute the missing Fare with the median fare of comparable passengers
# (ticket prefix '3', third class, embarked at 'S'); median() skips NaN.
comparable_passengers = (df['tic'] == '3') & (df['Pclass'] == 3) & (df['Embarked'] == 'S')
guess_Fare = df.loc[comparable_passengers, 'Fare'].median()

# Assign the result back to the column instead of calling
# fillna(inplace=True) on df.Fare: the in-place call acts on an intermediate
# Series, which pandas warns about and which does not update df under
# Copy-on-Write.
df['Fare'] = df['Fare'].fillna(guess_Fare)

In [None]:
# Mean fare paid by passengers who died (0) versus survived (1).
df.groupby('Survived', as_index=False)['Fare'].mean()

In [None]:
# Overlaid Fare histograms (bins of width 5 starting at 0), one colour per 'Survived' value.
fare_bins = range(0, 200, 5)
grid = sns.FacetGrid(data=df, hue='Survived', height=4, aspect=1.5)
grid.map(plt.hist, 'Fare', alpha=0.5, bins=fare_bins)
grid.add_legend()
plt.show()


**We can see that as the fare increases the number of people surviving also increases**

In [None]:
# Bin Fare into ten quantile-based intervals (deciles), labelled 1 to 10,
# and store the label as an integer in 'Fare-bin'.
fare_deciles = pd.qcut(df['Fare'], q=10, labels=list(range(1, 11)))
df['Fare-bin'] = fare_deciles.astype(int)
df

In [None]:
# Survival rate per Fare-bin.
df.groupby('Fare-bin', as_index=False)['Survived'].mean()

### **Embarked**

In [None]:
# Number of missing values in 'Embarked'.
df['Embarked'].isna().sum()

In [None]:
# Ticket prefix of the rows with a missing 'Embarked'.
df.loc[df['Embarked'].isnull(), 'tic']

In [None]:
# Fare of the rows with a missing 'Embarked'.
df.loc[df['Embarked'].isnull(), 'Fare']

In [None]:
# Passenger class of the rows with a missing 'Embarked'.
df.loc[df['Embarked'].isnull(), 'Pclass']

In [None]:
# 'Embarked' of passengers matching on ticket prefix, class AND exact fare.
same_profile = (df['tic'] == '1') & (df['Pclass'] == 1) & (df['Fare'] == 80.0)
guess_embark = df.loc[same_profile, 'Embarked']
guess_embark

**`Since only these two missing values have these three parameter value so we will try to remove "Fare" parameter`**

In [None]:
# Relaxed match: same ticket prefix and class, any fare.
same_prefix_and_class = (df['tic'] == '1') & (df['Pclass'] == 1)
guess_embark = df.loc[same_prefix_and_class, 'Embarked']
guess_embark

In [None]:
# Most frequent 'Embarked' value among first-class passengers with ticket
# prefix '1'. mode() returns a Series (ties are possible); take its first entry.
first_class_prefix_1 = (df['tic'] == '1') & (df['Pclass'] == 1)
guess_embark = df.loc[first_class_prefix_1, 'Embarked'].mode().iloc[0]
guess_embark

**Note: It is more common to use the mode instead of the median when dealing with categorical variables like 'Embarked'.**

In [None]:
# Fill the missing 'Embarked' values with the guessed port. The result is
# assigned back to the column instead of using fillna(inplace=True) on
# df.Embarked: the in-place call acts on an intermediate Series, which pandas
# warns about and which does not update df under Copy-on-Write.
df['Embarked'] = df['Embarked'].fillna(guess_embark)

In [None]:
# Re-count missing 'Embarked' values after imputation.
df['Embarked'].isna().sum()

In [None]:
# Survival rate per port of embarkation.
df.groupby('Embarked', as_index=False)['Survived'].mean()

**We can clearly observe that people have higher chance of survival if they belong to "C" <br> The survival rate does change between different Embarked values. However, it is due to the changes of other features. For example, people from Embarked = C are more likely to survive because they are generally richer (Pclass, Fare). People from Embarked = S has the lowest survival rate because it has the lowest fraction of female passengers, even though they are a bit richer than people from Embarked = Q.<br>DROP THIS FEATURE.**

In [None]:
# 'Embarked' is not used as a model feature.
df = df.drop(columns='Embarked')

### **Age**

In [None]:
# Number of missing values in 'Age'.
df['Age'].isna().sum()

In [None]:
# Age histograms (5-year bins), one panel per Title, each with its own y-axis.
age_bins = range(0, 105, 5)
grid = sns.FacetGrid(data=df, col='Title', aspect=0.8, sharey=False)
grid.map(plt.hist, 'Age', alpha=0.5, bins=age_bins)
plt.show()

In [None]:
# Mean Age per Title.
df.groupby('Title')[['Age']].mean()

In [None]:
# Age histograms (5-year bins), one panel per Fare-bin, each with its own y-axis.
age_bins = range(0, 105, 5)
grid = sns.FacetGrid(data=df, col='Fare-bin', aspect=0.8, sharey=False)
grid.map(plt.hist, 'Age', alpha=0.5, bins=age_bins)
plt.show()

In [None]:
# Mean Age per Fare-bin.
df.groupby('Fare-bin')[['Age']].mean()

In [None]:
# Age histograms (5-year bins), one panel per SibSp value, wrapped at 4 panels per row.
age_bins = range(0, 105, 5)
grid = sns.FacetGrid(data=df, col='SibSp', col_wrap=4, aspect=0.8, sharey=False)
grid.map(plt.hist, 'Age', alpha=0.5, bins=age_bins)
plt.show()

In [None]:
# Mean Age per SibSp value.
df.groupby('SibSp')[['Age']].mean()

In [None]:
# Age histograms (5-year bins), one panel per Parch value, wrapped at 4 panels per row.
age_bins = range(0, 105, 5)
grid = sns.FacetGrid(data=df, col='Parch', col_wrap=4, aspect=0.8, sharey=False)
grid.map(plt.hist, 'Age', alpha=0.5, bins=age_bins)
plt.show()

In [None]:
# visualize the correlation between Parch and Age
# NOTE(review): this cell repeats the preceding Parch/Age plot verbatim;
# one of the two can probably be removed.
grid = sns.FacetGrid(df, col='Parch', col_wrap=4, aspect=0.8, sharey=False)
grid.map(plt.hist, 'Age', alpha=.5, bins=range(0,105,5))
plt.show()

In [None]:
# Mean Age per Parch value.
df.groupby('Parch')[['Age']].mean()

**The change of Age as a function of Title, Fare-bin, or SibSp is quite significant, so I'll use them to guess the missing values. I use a random forest regressor to do this.**

##### **Random Forest Regressor for Age**

In [None]:
# Impute missing Age values with a random forest trained on the rows where
# Age is known. The Title dummy columns are used instead of the string-valued
# 'Title' column.
df_sub = df[['Age','Master','Miss','Mr','Mrs','Others','Fare-bin','SibSp']]

# Features and target are taken from the SAME filtered frame, so their rows
# are guaranteed to line up even if another column ever contains NaN.
age_known = df_sub.dropna()
X_train = age_known.drop('Age', axis=1)
y_train = age_known['Age']

# Rows whose Age has to be predicted.
age_missing = np.isnan(df['Age'])
X_test = df_sub.loc[age_missing].drop('Age', axis=1)

# Fixed random_state so that Restart & Run All reproduces the same imputed ages.
regressor = RandomForestRegressor(n_estimators=300, random_state=1)
regressor.fit(X_train, y_train)
y_pred = np.round(regressor.predict(X_test), 1)

# One .loc call on the frame itself: `df.Age.loc[mask] = ...` is chained
# assignment and may write to a temporary object instead of df.
df.loc[age_missing, 'Age'] = y_pred

The updated code uses the dummy variables corresponding to the **`Title`** feature instead of the original **`Title`** column. It selects the relevant columns **`('Age', 'Master', 'Miss', 'Mr', 'Mrs', 'Others', 'Fare-bin', 'SibSp')`** from the DataFrame df and assigns it to **`df_sub`**.

Next, it splits the data into features and target. **`X_train`** is obtained by dropping the rows with missing values from **`df_sub`** (using **`dropna()`**) and then removing the 'Age' column, while **`y_train`** contains the non-null values of 'Age'. The rows for which 'Age' is null (identified with `np.isnan(df.Age)`) are stored in **`X_test`**, again without the 'Age' column (using `drop('Age', axis=1)`).

A **RandomForestRegressor** model is then instantiated with 300 estimators and trained on the training data (X_train and y_train). The model is used to predict the missing 'Age' values in X_test, which are rounded to one decimal place and assigned to y_pred.

Finally, the missing 'Age' values in the original DataFrame df are replaced with the predicted values (y_pred). The code then checks for any remaining missing values in 'Age' using the isnull().sum() function, which returns the count of null values in the 'Age' column.

In [None]:
# Re-count missing 'Age' values after imputation; 0 means none are left.
df['Age'].isna().sum()

**Since the age similar to Fare has multiple value we will create bins in order to feed the ML model**

In [None]:
# Age bin edges in years, 15 years per bin; the choice is somewhat arbitrary.
bins = [0, 15, 30, 45, 60, 75, 90, 105]
# Integer labels 1 to 7, one per bin.
age_index = (1, 2, 3, 4, 5, 6, 7)
age_binned = pd.cut(df['Age'], bins=bins, labels=age_index)
df['Age-bin'] = age_binned.astype(int)

In [None]:
# Survival rate per Age-bin.
df.groupby('Age-bin', as_index=False)['Survived'].mean()

## **Model and Prediction**

In [None]:
# Inspect the frame before selecting the model features.
df

In [None]:
# Column labels currently present in the frame.
df.columns

In [None]:
# Remove the raw columns that have been replaced by engineered features
# (Age -> Age-bin, Fare -> Fare-bin, Title -> dummies) or are not used (Cabin, tic).
columns_to_remove = ['Age', 'Fare', 'Cabin', 'tic', 'Title']
df = df.drop(columns=columns_to_remove)

In [None]:
# Inspect the final feature frame.
df

In [None]:
# Re-check missing values on the PROCESSED frame. Plotting the untouched
# df_train again would only reproduce the very first heatmap and could not
# show that the cleaning worked.
# NOTE(review): 'Survived' is expected to remain empty for the rows that came
# from the test split — confirm that this is the only gap left.
sns.heatmap(df.isnull(), yticklabels=False, cbar=False, cmap='viridis')

**Now, our data is clean and preprocessed, we now can set back to dividing the dataset into training and testing dataset**

In [None]:
# Split the combined frame back into its training and test parts. df was built
# as concat([df_train, df_test], ignore_index=True), so the first
# len(df_train) rows are the training rows.
n_train = len(df_train)
feature_frame = df.drop(['Survived', 'PassengerId'], axis=1)

y_train = df['Survived'].iloc[:n_train].values
# Cast to a single numeric dtype: a frame mixing bool dummy columns with
# integer columns yields an object array from .values, which Keras cannot
# convert to a tensor.
X_train = feature_frame.iloc[:n_train].values.astype('float32')
X_test = feature_frame.iloc[n_train:].values.astype('float32')

In [None]:
# Fully connected binary classifier: four hidden Dense layers (11, 9, 7, 5
# units), each followed by LeakyReLU, and a single sigmoid output unit.
model = Sequential()

# The input width is read from the feature matrix instead of being hard-coded,
# so the network keeps matching the data if the set of feature columns changes.
n_features = X_train.shape[1]

# layers
model.add(Dense(11, kernel_initializer=glorot_uniform(seed=1), input_dim=n_features))
model.add(LeakyReLU(alpha=0.1))
model.add(Dense(9, kernel_initializer=glorot_uniform(seed=1)))
model.add(LeakyReLU(alpha=0.1))
model.add(Dense(7, kernel_initializer=glorot_uniform(seed=1)))
model.add(LeakyReLU(alpha=0.1))
model.add(Dense(5, kernel_initializer=glorot_uniform(seed=1)))
model.add(LeakyReLU(alpha=0.1))
model.add(Dense(1, kernel_initializer=glorot_uniform(seed=1), activation='sigmoid'))

In [None]:
# Print the layer-by-layer architecture of the model.
model.summary()

In [None]:
# Training callbacks. All three track 'val_loss', so they only take effect
# when fit() is given validation data.

# Save the model to disk each time the monitored value improves.
checkpointer = ModelCheckpoint(
    filepath='weights.best.model.hdf5',
    monitor='val_loss',
    verbose=2,
    save_best_only=True,
)

# Stop training after 5 epochs without improvement.
early_stop = EarlyStopping(patience=5, monitor='val_loss')

# Multiply the learning rate by 0.2 after 3 epochs without improvement.
lr_reduction = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=3,
    verbose=2,
)

In [None]:
# Compile the network for binary classification.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the network. A fraction of the training rows is held out as validation
# data so that 'val_loss' is actually computed: the checkpoint, early-stopping
# and learning-rate callbacks all monitor 'val_loss' and would otherwise never
# trigger.
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=200,
    validation_split=0.2,
    callbacks=[checkpointer, lr_reduction, early_stop],
)

In [None]:
# Predicted survival probabilities for the test rows, one per row.
y_pred = model.predict(X_test)

# Threshold at 0.5 and flatten to a 1-D array of 0/1 labels.
is_survivor = y_pred > 0.5
y_final = is_survivor.astype(int).reshape(X_test.shape[0])

# Write the submission file: PassengerId from the original test split plus the predicted label.
submission_columns = {'PassengerId': df_test['PassengerId'], 'Survived': y_final}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)