# Inputting & Importing

In [None]:
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import matplotlib
import re
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from IPython.display import Markdown, display
import random

In [None]:
%config Completer.use_jedi = False
sns.set(rc={'figure.figsize':(18,10)})
sns.set_style({'axes.facecolor':'white', 'grid.color': '.8', 'font.family':'Times New Roman'})

# Colors
# Named hex colour constants. Individual plots reference them directly
# (e.g. `cyan`, `red`) and `color_list` collects them for the default cycle.
cyan = '#00FFD1'
red = '#FF007D'
prussian = '#0075FF'
green = '#EEF622'
yellow = '#FFF338'
violet = '#9B65FF'
orange = '#FFA500'
blue = '#00EBFF'
vermillion = '#FF6900'
red2 = '#FF2626'
seagreen = '#28FFBF'
green2 = '#FAFF00'
navyblue = '#04009A'
darkgreen = '#206A5D'
lightgreen = '#CCF6C8'
pink = '#F35588'
mauve = '#BAABDA'
lightblue = '#1CC5DC'
mustard = '#FDB827'
deeppurple = '#723881'

# Order here is the order matplotlib cycles through when a plot does not set a colour.
color_list = [cyan,red,prussian,green,violet,orange,yellow,blue,vermillion,red2,seagreen,green2,navyblue,darkgreen,lightgreen,pink,mauve,lightblue,mustard,deeppurple]
# Install the palette as matplotlib's default colour cycle for every axes created afterwards.
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=color_list)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def printmd(string):
    """Render ``string`` as Markdown in the cell output instead of plain text."""
    rendered = Markdown(string)
    display(rendered)

In [None]:
# Load the labelled training split and the unlabelled test split of the
# Titanic dataset from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')

In [None]:
train.head()

In [None]:
test.head()

In [None]:
train.shape, test.shape

In [None]:
# Split the training rows by outcome so the two groups can be compared in plots.
# These are snapshots of `train` as it is at this point in the notebook.
survivor_mask = train['Survived'].eq(1)
deceased_mask = train['Survived'].eq(0)
survived_df = train.loc[survivor_mask]
deceased_df = train.loc[deceased_mask]

# Data Preprocessing

## Missing Values

In [None]:
train.isnull().sum()

In [None]:
test.isnull().sum()

- 20% **Age** values are missing in *Train Data*, 20% in *Test Data* as well
- 0.2% **Embarked** values are missing 
- 77% **Cabin** values are missing in *Train Data* -> ***might delete later but keeping for EDA insight***
- **Fare** has just 1 missing value in *Test Data*

### Cabin

In [None]:
# train = train.drop(['Cabin'],axis=1)
# test = test.drop(['Cabin'],axis=1)

### Age

In [None]:
# 20% Age values are missing
train['Age'].isnull().sum() 

In [None]:
sns.displot(data=train['Age'],kde=True,height=6.5,color=random.choice(color_list));

In [None]:
# Overlay the age densities of survivors and non-survivors.
# `fill=` replaces the deprecated `shade=` keyword, and the explicit legend is
# needed because the `label=` values are otherwise never shown.
fig, ax = plt.subplots(figsize=(12,7))
sns.kdeplot(survived_df['Age'], label='Survived', fill=True, color=cyan, ax=ax)
sns.kdeplot(deceased_df['Age'], label='Deceased', fill=True, color=red, ax=ax)
ax.set_title('Age')
ax.set_xlabel('Age of Passengers')
ax.legend();

In [None]:
# Impute missing ages in both splits with the mean of the observed training ages.
# The statistic is computed once, before `train` is modified, so the test split
# receives exactly the value derived from real observations (no test information is used).
age_mean = train['Age'].mean()
train['Age'] = train['Age'].fillna(age_mean)
test['Age'] = test['Age'].fillna(age_mean)

### Embarked

In [None]:
# 0.2% Embarked values are missing
train['Embarked'].isnull().sum()

In [None]:
train['Embarked'].value_counts()

* **S** or *Southampton* is the **Mode**

In [None]:
# Fill the missing ports with 'S', which the value counts above show to be the mode.
train['Embarked'] = train['Embarked'].fillna('S')

### Fare

In [None]:
train.describe()

In [None]:
sns.displot(train['Fare'],bins=20,color=random.choice(color_list));

In [None]:
# Overlay the fare densities of survivors and non-survivors.
# `fill=` replaces the deprecated `shade=` keyword, and the explicit legend is
# needed because the `label=` values are otherwise never shown.
fig, ax = plt.subplots(figsize=(12,7))
sns.kdeplot(survived_df['Fare'], label='Survived', fill=True, color=cyan, ax=ax)
sns.kdeplot(deceased_df['Fare'], label='Deceased', fill=True, color=red, ax=ax)
ax.set_title('Fare')
ax.set_xlabel('Fare of Passengers')
ax.legend();

In [None]:
# Fill missing test fares with the most frequent training fare (the first mode).
# NOTE(review): for a continuous, skewed column the median is the more usual
# choice -- confirm that the mode is intended here.
test['Fare'] = test['Fare'].fillna(train['Fare'].mode()[0])

In [None]:
train.isnull().sum()

In [None]:
test.isnull().sum()

### Cabin

In [None]:
train['Cabin'].isnull().sum()

In [None]:
nonNullCabin = train[~train['Cabin'].isnull()] # non null Cabin values

In [None]:
 # Non null cabin values that Survived
len(nonNullCabin[nonNullCabin['Survived']==1])

In [None]:
 # Number of passengers with a non-null Cabin value who did not survive
len(nonNullCabin[nonNullCabin['Survived']==0])

In [None]:
survived_cabins = list(nonNullCabin[nonNullCabin['Survived']==1]['Cabin'].value_counts().index)
deceased_cabins = list(nonNullCabin[nonNullCabin['Survived']==0]['Cabin'].value_counts().index)

In [None]:
# Common Cabins
# Count the cabin labels that appear both among survivors and among the deceased.
# Both lists come from value_counts() indexes, so each label occurs at most once.
shared_cabins = set(survived_cabins).intersection(deceased_cabins)
c = len(shared_cabins)
print(c)

In [None]:
print(survived_cabins)

In [None]:
print(deceased_cabins)

### SibSp and Parch

In [None]:
# Family = siblings/spouses aboard + parents/children aboard, added to both splits.
for frame in (train, test):
    frame['Family'] = frame['SibSp'].add(frame['Parch'])
# train=train.drop(['SibSp','Parch'],axis=1)
# test=test.drop(['SibSp','Parch'],axis=1)

In [None]:
train.head()

# **Exploratory Data Analysis**

## **Ticket**

Here, I'll create new DataFrames (one each for Train and Test) to find the most frequently occurring tickets, and then analyse each of those ticket groups individually.

In [None]:
Ticket_temp_train = train['Ticket'].value_counts()
Ticket_temp_test = test['Ticket'].value_counts()

In [None]:
# Turn each ticket -> count Series into a two-column frame ('ticket', 'freq'),
# keeping the most-frequent-first order produced by value_counts().
Ticket_temp_train_df = Ticket_temp_train.rename_axis('ticket').reset_index(name='freq')
Ticket_temp_test_df = Ticket_temp_test.rename_axis('ticket').reset_index(name='freq')

In [None]:
Ticket_temp_train_df.head(8)

In [None]:
train.set_index('PassengerId',inplace=True)

In [None]:
train.columns

In [None]:
# Select and reorder the training columns. Any positional (`iloc`) column
# access on `train` further down depends on this exact order.
train = train[['Survived','Name','Sex','Ticket','Age','Fare','Pclass','Embarked','Cabin','Family','SibSp', 'Parch']]

In [None]:
# Walks through the tickets of a DataFrame from the most to the least frequent
# and displays one table per ticket, so groups sharing a ticket can be inspected.


def analyse_tickets(freq_to_stop_at,dataframe):
    """Display the rows of ``dataframe`` grouped by shared ticket.

    Tickets are visited in descending order of how often they occur in
    ``dataframe['Ticket']``. A heading is emitted each time the frequency
    changes, followed by one table per ticket.

    Parameters
    ----------
    freq_to_stop_at : int
        Smallest ticket frequency to display. Tickets occurring fewer times
        than this are skipped (e.g. ``2`` shows every ticket shared by at
        least two passengers).
    dataframe : pandas.DataFrame
        Frame containing a ``'Ticket'`` column. The frequencies are computed
        from this frame, so the function works for any split passed in.

    Returns
    -------
    None
        Output is rendered through ``printmd`` / ``display``.
    """
    # value_counts() sorts most-frequent-first, so the loop can stop at the
    # first ticket below the threshold.
    ticket_counts = dataframe['Ticket'].value_counts()
    current_freq = None

    for ticket_name, ticket_freq in ticket_counts.items():
        # Threshold comparison instead of equality: still stops correctly when
        # no ticket has exactly `freq_to_stop_at - 1` occurrences.
        if ticket_freq < freq_to_stop_at:
            break

        if ticket_freq != current_freq:
            # New frequency group: emit its heading once.
            current_freq = ticket_freq
            printmd('---')
            printmd('### **Ticket frequency:** **%d**'%(ticket_freq))
            print('\n')

        printmd(' #### *Ticket Name:* **%s**'%(ticket_name))
        display(dataframe.loc[dataframe['Ticket']==ticket_name])
        print('\n\n')# End of one group

    # Closing separator, emitted when the function runs rather than when it is defined.
    print('\n')
    printmd('---')

#### **In the table groups below, please pay attention to:**
* How many **survived** in a group?
* Whether they're in the same family, judging by **Family** and **Name**? (For details on family demographics, see **SibSp** and **Parch**)
* The port they **Embarked** from
* Their **Age** demographic
* Whether they're in the same **cabin**. 

In [None]:
analyse_tickets(2,train) ## Enter frequency to stop at and dataframe to work with. For ex: (6,train)

#### **Observations**
* Some people have more than one **cabin**. Almost all of these people belong to the 1st **Class**.
* Some people not from the same **family** are in the same **cabin**.
* Should I **age** categorize?
* There are hardly any **cabin** names for both 2nd and 3rd **Class** passengers.
* 3rd **class** passengers usually have **cabins** in F and G (for the data that is present).
* Passengers on the same **ticket** are mostly in the same **cabin** and belong to the same **class**.
* Among couples (in the same cabin) from all **classes**, it was common to see **only women** surviving in a lot of cases.

An interesting side read: berth numbers were given for some passengers, odd for lower berths and even for upper berths. [source](https://www.encyclopedia-titanica.org/cabins.html)

#### Getting Ticket prefix values

In [None]:
train.loc[1,'Ticket']

In [None]:
train

In [None]:
# c = -1
# tick_1 = {}
# for i in range(0,len(train['Ticket'])):
#     c=c+1
#     match = re.search('^[a-zA-Z]+',train.iloc[i,3])
#     if (match):
#         tick_1[c] = match.group()

In [None]:
# tick1_s = pd.Series(tick_1)
# tick1_s.head()

In [None]:
# Leading run of letters in a ticket string (e.g. 'PC' in 'PC 17599').
# Tickets that do not start with a letter get the placeholder 'Null'.
# Columns are addressed by name, so the result does not depend on column order.
ticket_prefix_pattern = r'^([a-zA-Z]+)'

tick_prefix_train = (
    train['Ticket']
    .str.extract(ticket_prefix_pattern, expand=False)
    .fillna('Null')
    .tolist()
)

tick_prefix_test = (
    test['Ticket']
    .str.extract(ticket_prefix_pattern, expand=False)
    .fillna('Null')
    .tolist()
)

In [None]:
train['Ticket_prefix'] = tick_prefix_train
test['Ticket_prefix'] = tick_prefix_test

In [None]:
train.head()

In [None]:
# Frequency table of ticket prefixes in the training split, most frequent first.
prefix_counts = train['Ticket_prefix'].value_counts()
Ticket_pre_df = prefix_counts.rename_axis('prefix').reset_index(name='freq')
Ticket_pre_df.head(15)

In [None]:
def analyse_prefix(freq_to_stop_at,dataframe,include_null=False):
    """Display the rows of ``dataframe`` grouped by ticket prefix.

    Prefixes are visited in descending order of how often they occur in
    ``dataframe['Ticket_prefix']``. A heading is emitted each time the
    frequency changes, followed by one table per prefix.

    Parameters
    ----------
    freq_to_stop_at : int
        Smallest prefix frequency to display; prefixes occurring fewer times
        than this are skipped.
    dataframe : pandas.DataFrame
        Frame containing a ``'Ticket_prefix'`` column.
    include_null : bool, default False
        Whether to also show the ``'Null'`` placeholder group (tickets with
        no letter prefix). It is skipped by default.

    Returns
    -------
    None
        Output is rendered through ``printmd`` / ``display``.
    """
    # value_counts() sorts most-frequent-first, so the loop can stop at the
    # first prefix below the threshold.
    prefix_counts = dataframe['Ticket_prefix'].value_counts()
    current_freq = None

    for ticket_name, ticket_freq in prefix_counts.items():
        # Threshold comparison instead of equality, so the first argument takes
        # effect even when no prefix has exactly `freq_to_stop_at - 1` rows.
        if ticket_freq < freq_to_stop_at:
            break

        # Skip the placeholder group by name rather than by position.
        if ticket_name == 'Null' and not include_null:
            continue

        if ticket_freq != current_freq:
            # New frequency group: emit its heading once.
            current_freq = ticket_freq
            printmd('---')
            printmd('### Ticket frequency: **%d**'%(int(ticket_freq)))

        printmd(' #### *Ticket Name:* **%s**'%(ticket_name))
        display(dataframe.loc[dataframe['Ticket_prefix']==ticket_name])
        print('\n\n')# End of one number

In [None]:
analyse_prefix(11,train) # NOTE(author): the first argument reportedly does not take effect here -- the stop condition in analyse_prefix needs changing

**Grouping all unique tickets to a common value**

In [None]:
# Collapse every ticket that occurs exactly once within its own split into a
# single shared label. One vectorised mask per split replaces the former
# per-ticket `replace` calls, each of which scanned the whole column.
for frame, counts_df in ((train, Ticket_temp_train_df), (test, Ticket_temp_test_df)):
    singleton_tickets = counts_df.loc[counts_df['freq'] == 1, 'ticket']
    is_singleton = frame['Ticket'].isin(singleton_tickets)
    frame['Ticket'] = frame['Ticket'].mask(is_singleton, 'UniqueTicketPrefix')

In [None]:
train.head()

In [None]:
train['Ticket'].value_counts()

### Name

In [None]:
# A title is everything between the comma after the surname and the first
# period, period included: 'Braund, Mr. Owen Harris' -> 'Mr.'.
# Capturing up to the period (instead of the first space) keeps multi-word
# titles intact, e.g. 'Rothes, the Countess. of ...' -> 'the Countess.' rather
# than 'the'. The Name column is addressed by name, not by position.
title_pattern = r',\s*([^.]+\.)'

name_titles_train = train['Name'].str.extract(title_pattern, expand=False).tolist()

name_titles_test = test['Name'].str.extract(title_pattern, expand=False).tolist()

In [None]:
train['Title'] = name_titles_train
test['Title'] = name_titles_test

In [None]:
train = train.drop(['Name'],axis=1)
test = test.drop(['Name'],axis=1)

In [None]:
train.head()

In [None]:
train['Title'].value_counts()

In [None]:
plt.figure(figsize=(25,17))
sns.countplot(x='Title',hue='Survived',data=train)

# Categorical Encoding

In [None]:
train.head(7)

In [None]:
# Categories
# Show the value counts of each categorical column. Columns are listed by
# name so the cell keeps working if the column order of `train` changes.
categorical_columns = ['Sex','Ticket','Pclass','Embarked','Cabin','Ticket_prefix','Title']

for column_name in categorical_columns:
    printmd('### %s'%(column_name))
    display(train[column_name].value_counts())
    print(' ')

## Mean Encoding for **Ticket**, **Ticket_prefix** and **Title** columns

In [None]:
def Mean_Encoding(column_name, weight=100, target='Survived'):
    """Add a smoothed mean (target) encoding of ``column_name`` to train and test.

    For each category the encoded value is

        (count * category_mean + weight * global_mean) / (count + weight)

    so rare categories are pulled towards the global mean of ``target``.
    The statistics are computed on the global ``train`` frame only and then
    mapped onto both ``train`` and ``test`` in a new column named
    ``<column_name>_smean_encod``.

    Parameters
    ----------
    column_name : str
        Categorical column present in both ``train`` and ``test``.
    weight : float, default 100
        Smoothing strength; larger values shrink category means harder
        towards the global mean.
    target : str, default 'Survived'
        Column of ``train`` whose mean is encoded.

    Notes
    -----
    Categories that occur in ``test`` but not in ``train`` have no entry in
    the mapping and are left as NaN; the caller decides how to fill them.
    """
    new_smooth_name = column_name+'_smean_encod'

    global_mean = train[target].mean()
    per_category = train.groupby(column_name)[target].agg(['count','mean'])
    counts = per_category['count']
    means = per_category['mean']
    smooth = (counts*means + weight*global_mean)/(counts+weight)

    train.loc[:,new_smooth_name] = train[column_name].map(smooth)
    test.loc[:,new_smooth_name] = test[column_name].map(smooth)
    

In [None]:
Mean_Encoding('Ticket')

In [None]:
Mean_Encoding('Ticket_prefix')

In [None]:
Mean_Encoding('Title')

In [None]:
test.isnull().sum()

This means there are new unique values in the test dataset that have no entry in the smoothed mapping computed from the training data, so they were left unmapped (NaN).

### Missing values after mean Encoding

In [None]:
sns.displot(data=train['Ticket_smean_encod'],kde=True,height=6.5,color=random.choice(color_list));

In [None]:
sns.displot(data=train['Ticket_prefix_smean_encod'],kde=True,height=6.5,color=random.choice(color_list));

In [None]:
sns.displot(data=train['Title_smean_encod'],kde=True,height=6.5,color=random.choice(color_list));

In [None]:
# Test categories never seen in train were left as NaN by the encoding;
# replace them with the mean of the corresponding encoded training column.
for encoded_col in ('Ticket_smean_encod', 'Ticket_prefix_smean_encod', 'Title_smean_encod'):
    train_average = train[encoded_col].mean()
    test[encoded_col] = test[encoded_col].fillna(train_average)

In [None]:
test.isnull().sum()

## One Hot Encoding for **Sex**, **Embarked** and **Pclass** columns

In [None]:
# Sex

# One-hot encode Sex in both splits. The dummies are built once per split and
# re-indexed to the expected levels, so a level absent from one split yields
# an all-zero column instead of a KeyError.
sex_levels = ['Sex_female', 'Sex_male']
for frame in (train, test):
    sex_dummies = pd.get_dummies(frame['Sex'], prefix='Sex').reindex(columns=sex_levels, fill_value=0)
    for level in sex_levels:
        frame[level] = sex_dummies[level]

In [None]:
# Pclass

# One-hot encode Pclass in both splits. The dummies are built once per split
# and re-indexed to the expected levels, so a level absent from one split
# yields an all-zero column instead of a KeyError.
pclass_levels = ['Pclass_1', 'Pclass_2', 'Pclass_3']
for frame in (train, test):
    pclass_dummies = pd.get_dummies(frame['Pclass'], prefix='Pclass').reindex(columns=pclass_levels, fill_value=0)
    for level in pclass_levels:
        frame[level] = pclass_dummies[level]

In [None]:
# Embarked

# One-hot encode Embarked in both splits. The dummies are built once per split
# and re-indexed to the expected levels, so a port absent from one split
# yields an all-zero column instead of a KeyError.
embarked_levels = ['Embarked_C', 'Embarked_Q', 'Embarked_S']
for frame in (train, test):
    embarked_dummies = pd.get_dummies(frame['Embarked'], prefix='Embarked').reindex(columns=embarked_levels, fill_value=0)
    for level in embarked_levels:
        frame[level] = embarked_dummies[level]

In [None]:
train.columns

In [None]:
# Model inputs shared by both splits. One dummy per one-hot group is left out
# (Sex_male, Pclass_3, Embarked_S) because it is implied by the others.
feature_columns = ['Age','Fare','Ticket_prefix_smean_encod','Sex_female',
                   'Pclass_1','Pclass_2','Embarked_C','Embarked_Q']

df_train = train[feature_columns + ['Survived']]  # features plus the label
df_test = test[feature_columns]

# Correlation

In [None]:
# Lower-triangle correlation heatmap of the model frame.
fig, ax = plt.subplots(figsize=(18,16))
my_c = sns.diverging_palette(20, 220, as_cmap=True)  # not passed to the heatmap below, which uses 'BrBG'
corr_matrix = df_train.corr()  # computed once and reused for the mask shape and the plot
# Build the mask from the matrix shape, not from its values: masking with
# np.triu(corr) would leave any exactly-zero correlation in the upper triangle visible.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(corr_matrix,cmap='BrBG',linewidths=1.5,ax=ax,annot=True,center=0,square=True,mask=mask)
plt.title('Correlation',fontsize=30);

In [None]:
train.head()