In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials , space_eval
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'hyperopt'

# About the dataset :

In [None]:
#Read data
data = pd.read_csv('train.csv')

#Printing first 5 rows
data[:5]

In [None]:
data.info()

About the features : 
- **PassengerId** : Id of each passenger (in int)
- **Survived** : If survived or no (0 = No, 1 = Yes)
- **Pclass** : Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)
- **Name** : name (string)
- **Sex** : sex (string : 'male' 'female')
- **Age** : age in years
- **SibSp** : # of siblings / spouses aboard the Titanic
- **Parch** : # of parents / children aboard the Titanic
- **Ticket** : Ticket number (string)
- **Fare** : Passenger fare
- **Cabin** : Cabin number (string)
- **Embarked** : Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton) 

## Is the data imbalanced?

In [None]:
data['Survived'].value_counts()

In [None]:
sns.set()
sns.set(rc = {'figure.figsize':(9,5)})
sns.countplot(x="Survived",data=data)

- **Data is clearly imbalanced**: the count of people who survived **is roughly half** the count of people who didn't survive

## Correlation with target variables

In [None]:
# Correlate only the numeric columns with the target: the frame still holds string columns
# (Name, Sex, Ticket, ...) that cannot be correlated and make corrwith raise on pandas >= 2.0.
data.select_dtypes(include='number').corrwith(data["Survived"])

In [None]:
# Compute the numeric-column correlations with the target once, then plot one bar per feature.
target_corr = data.select_dtypes(include='number').corrwith(data["Survived"])
plt.bar(target_corr.index, target_corr.values)

# Missing values?

In [None]:
#Report, for every column of the frame, whether it contains at least one NaN
print('Nan values exist in')
for column_name in data.columns:
    has_missing = any(data[column_name].isna())
    print(column_name,':', has_missing)

- 'Age', 'Cabin' and 'Embarked' have NaN values. Let's check them further.

## Age 

In [None]:
data['Age'] = data['Age'].fillna('nan')

print(data[data['Age'] == 'nan']['Age'])

- There are 177 Nan values in the Age column

- Now, we impute the missing values with a central tendency measure (median performed the best in this case)

In [None]:
def impute_missing_data(test, col, median, mode):
    """Fill the missing entries of ``test[col]`` in place with the column's median or mode.

    An entry counts as missing when it is NaN/None or the string placeholder ``'nan'``
    (other cells of this notebook replace NaN with that placeholder before calling this).

    Parameters
    ----------
    test : pandas.DataFrame
        Frame to modify; the same object is returned.
    col : str
        Column to impute. If the frame has no such column it is returned untouched.
    median : bool
        If truthy, fill with the median of the observed values and convert the column
        back to a numeric dtype.
    mode : bool
        Used only when ``median`` is falsy: fill with the most frequent observed value.

    Returns
    -------
    pandas.DataFrame
        ``test`` itself. It is left unchanged when neither flag is set, when nothing is
        missing, or when there are no observed values to derive a fill value from.
    """
    if col not in test:
        return test

    missing = test[col].isna() | (test[col] == 'nan')
    observed = test.loc[~missing, col]
    if not missing.any() or observed.empty:
        return test

    # The fill value is computed once, not once per missing row.
    if median:
        fill_value = pd.to_numeric(observed).median()
    elif mode:
        fill_value = observed.value_counts().index[0]
    else:
        return test

    # Label-based boolean assignment: correct for any index (the earlier version passed index
    # labels to .iloc) and written straight into the frame instead of a chained selection.
    test.loc[missing, col] = fill_value

    if median:
        # A column that carried 'nan' placeholders is object-typed; make it numeric again.
        test[col] = pd.to_numeric(test[col])

    return test

In [None]:
#Imputing the median of the observed ages for the missing Age values (median=True selects the median, not the mean)
data = impute_missing_data(data, 'Age', median =True, mode=False)

In [None]:
any(data['Age'].isna())

In [None]:
data['Age'] = data['Age'].map(lambda x: int(x))

In [None]:
'''data = data.drop(data[data['Age']== 'nan'].index, axis= 0)
data.info()'''

In [None]:
# Correlation of the numeric columns with the target after Age imputation
# (string columns are excluded because corrwith raises on them in pandas >= 2.0).
data.select_dtypes(include='number').corrwith(data["Survived"])

## Cabin

In [None]:
data['Cabin'] = data['Cabin'].fillna('nan')

print(data[data['Cabin'] == 'nan']['Cabin'])

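- Since 'Cabin' has around 529 nan values, which is more than 50 percent of the total data, we do not use the raw column as a feature (it is dropped before modelling); instead we first record whether the cabin is missing in an indicator column, `cabin_nan`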

In [None]:
# 1 where the cabin is the 'nan' placeholder (i.e. unknown), 0 where a cabin is recorded
cabin_is_missing = data['Cabin'] == 'nan'
data['cabin_nan'] = cabin_is_missing.astype(int)


## Embarked

In [None]:
data['Embarked'] = data['Embarked'].fillna('nan')

print(data[data['Embarked'] == 'nan']['Embarked'])

- Only two nan values, so we impute them with the most frequently occurring category of the feature (the mode)

In [None]:
#Imputing the mode values for missing data
data = impute_missing_data(data, 'Embarked', median = False, mode = True)

In [None]:
any(data['Embarked'].isna())

In [None]:
data

In [None]:
# Correlation of the numeric columns with the target once all imputations are done
# (string columns are excluded because corrwith raises on them in pandas >= 2.0).
data.select_dtypes(include='number').corrwith(data["Survived"])

In [None]:
# Compute the numeric-column correlations with the target once, then plot one bar per feature.
target_corr = data.select_dtypes(include='number').corrwith(data["Survived"])
plt.bar(target_corr.index, target_corr.values)

# EDA

## Age of the people who survived?

Three visualizations :
1. Boxplots : To tell us about the distribution of the feature for each output class and give us an idea about the outliers in the data
2. Density plots : To tell us about the distribution of the feature and, in particular, its shape (whether it resembles a normal or some other distribution)
3. Stacked countplots : Tells us more about count of each category of the feature with information about the percentage belonging to which output class

In [None]:
sns.set(rc = {'figure.figsize':(16,6)})
# Create the two side-by-side panels explicitly. The previous plt.plot(figure_size=...) call
# drew nothing and only created an extra, empty axes on the figure.
fig, (bp_ax, dp_ax) = plt.subplots(1, 2)

#Boxplot: spread of Age within each Survived class
sns.boxplot(x = "Survived", y = "Age", data = data, ax = bp_ax)
bp_ax.set_title('Box plot', fontsize = 20)

#Density plot: shape of the Age distribution within each Survived class
sns.kdeplot(x = 'Age', hue = 'Survived', data = data, ax = dp_ax)
dp_ax.set_title('Density plot', fontsize = 20)

- The age distribution of people who didn't survive is slightly right-skewed, i.e. its mean is greater than its median (around ~27), so the **mean age of the people who did not survive is higher than 27**
- Ages above ~67 are outliers in the distribution of people who did not survive, because **there are very few people above the age of ~67**; ages above ~60 are outliers in the distribution of people who did survive, i.e. **very few people above 60 actually survived**

In [None]:
def plot_stacked_plot_percentages(class_list, df_p, add_x_val):
    
    '''This function adds the percentage in each stacked plot
    class_list : contains three params, count of each rect/class, total for that rect/class, ax object of that rect/class
    df_p : df.axes.patches object for each rectange
    add_x_val : x axis value to center the text
    '''
    
    for class_val in class_list:
        
        percentage = (class_val[0]/class_val[1])*100
        height = class_val[2].get_height() 
        df_p.axes.text(class_val[2].get_x()+add_x_val, height,"{:1.1f}%".format(percentage))

In [None]:
def calculate_stacked_plot_percentages(df_plot, df_p):
    '''Pair every bar rectangle of a stacked plot with a count and its row total.

    df_plot : count table (one row per category, one column per class)
    df_p : plot object drawn from df_plot; its ``axes.patches`` are the rectangles

    Returns a zip of (count, row_total, rectangle) triples. The counts are the table's
    values taken column by column and rotated by one column length; the row totals are
    repeated once per column of a two-column table.
    '''
    row_totals = df_plot.sum(axis=1)
    # Row totals laid out as [t0..tn-1, t0..tn-1], one run per class column.
    totals_per_patch = np.tile(row_totals.to_numpy(), 2)
    # Column-major counts, rotated by the number of rows.
    counts_per_patch = np.roll(df_plot.to_numpy().T.flatten(), len(df_plot))
    rectangles = list(df_p.axes.patches)

    class_list = zip(counts_per_patch, totals_per_patch, rectangles)
    return class_list

In [None]:
#Stacked countplot : https://stackoverflow.com/questions/50319614/count-plot-with-stacked-bars-per-hue
#Add percentages on stacked countplots : https://stackoverflow.com/questions/31749448/how-to-add-percentages-on-top-of-bars-in-seaborn
sns.set(rc = {'figure.figsize':(15,8)})

#Split Age into 3 equal-width bins and store the interval of each row in a new column
data['age_bin'] = pd.cut(data['Age'], 3, precision = 0)

#Count the rows for every (age bin, Survived) pair: one row per age bin, one column per Survived value
df_plot = data.groupby([ 'Survived', 'age_bin']).size().reset_index().pivot(columns='Survived', index='age_bin', values=0)
print(df_plot)

#Plot the stacked bar plot
df_p = df_plot.plot(kind='bar', stacked=True)

#Set labels
df_p.set_xlabel("Age groups", fontsize = 20)
df_p.set_ylabel("Count", fontsize = 20)

#Pair each bar rectangle with a count and its age-bin total
class_list = calculate_stacked_plot_percentages(df_plot, df_p)

#Write the percentage labels, shifted 0.12 to the right of each rectangle's left edge
plot_stacked_plot_percentages(class_list, df_p, 0.12)

- In the above stacked plots, the age has been binned into 3 age bins: Bin 1, the younger people in age group 0-27 years; Bin 2, the middle-aged in age group 27-53 years; and Bin 3, the older people in age group 53-80 years
- Majority of people fall in age bin Bin 1 [0-27], and Bin 2 [27 - 53] while a minority fall in Bin3 [53-80]
- The percentage of people who survived is **slightly lower in Bin 1 and Bin 2** (40.8 and 37.5 % respectively), while the **percentage of people who survived is much lower in Bin 3 (32%)**

Note : Countplots vs box plots

In [None]:
#how to drop outliers from data after analysing with boxplot : https://datascience.stackexchange.com/questions/54808/how-to-remove-outliers-using-box-plot
def drop_outliers(data , col):
    """Return the rows of ``data`` whose ``col`` value lies within the box-plot whiskers.

    A value is kept when it is inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR], where Q1 and Q3 are
    the 25th and 75th percentiles of ``data[col]`` and IQR = Q3 - Q1.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    col : str
        Numeric column whose outliers decide which rows are removed. (The earlier version
        ignored this argument and always filtered on 'Age'.)

    Returns
    -------
    pandas.DataFrame
        The subset of rows that are not outliers in ``col``.
    """
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    iqr = q3 - q1    #interquartile range

    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr
    within_fences = (data[col] >= lower_fence) & (data[col] <= upper_fence)
    return data.loc[within_fences]

In [None]:
#data = drop_outliers(data , 'Age')
#data

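- With the outlier filter applied we would have 825 rows (66 rows containing age outliers removed). Note that the call above is currently commented out, so `data` still contains all rows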

## Sex of the survivors?

In [None]:
#Stacked countplot : https://stackoverflow.com/questions/50319614/count-plot-with-stacked-bars-per-hue
#Add percentages on stacked countplots : https://stackoverflow.com/questions/31749448/how-to-add-percentages-on-top-of-bars-in-seaborn
sns.set(rc = {'figure.figsize':(15,8)})

#Count the rows for every (Sex, Survived) pair: one row per sex, one column per Survived value
df_plot = data.groupby([ 'Survived', 'Sex']).size().reset_index().pivot(columns='Survived', index='Sex', values=0)
print(df_plot)

#Plot the stacked bar plot
df_p = df_plot.plot(kind='bar', stacked=True)

#Set labels
df_p.set_xlabel("Sex", fontsize = 20)
df_p.set_ylabel("Count", fontsize = 20)

#Pair each bar rectangle with a count and its per-sex total
class_list = calculate_stacked_plot_percentages(df_plot, df_p)

#Write the percentage labels, shifted 0.2 to the right of each rectangle's left edge
plot_stacked_plot_percentages(class_list, df_p, 0.2)

- From the above stacked plots, it is clear that **a higher percentage of female passengers survived**, i.e. 74.2 %, while only 18.9 % of male passengers survived, even though more male passengers were aboard

In [None]:
#data = drop_outliers(data , 'Sex')
#data

## Fares of the survivors?

In [None]:
sns.set(rc = {'figure.figsize':(16,6)})
# Create the two side-by-side panels explicitly. The previous plt.plot(figure_size=...) call
# drew nothing and only created an extra, empty axes on the figure.
fig, (bp_ax, dp_ax) = plt.subplots(1, 2)

#Boxplot: spread of Fare within each Survived class
sns.boxplot(x = "Survived", y = "Fare", data = data, ax = bp_ax)
bp_ax.set_title('Box plot', fontsize = 20)

#Density plot: shape of the Fare distribution within each Survived class
sns.kdeplot(x = 'Fare', hue = 'Survived', data = data, ax = dp_ax)
dp_ax.set_title('Density plot', fontsize = 20)

- Distribution of fares of survivors clearly has a higher median value, hence **passengers who survived had higher average fares**
- Lots of outliers in the distribution of both
- A huge spike in the pdf of the people who didn't survive, or the probability that the person who didn't survive had a fare of ~10 units of currency is very high

In [None]:
#data = drop_outliers(data , 'Fare')
#data

### Converting fares into categorical variables

In [None]:
def fares_categorical(data):
    """Add a ``fare_bin`` column that splits ``Fare`` into 3 equal-width intervals.

    The column holds the pandas Interval each fare falls into. ``data`` is modified in
    place and returned.
    """
    data['fare_bin'] = pd.cut(data['Fare'], 3, precision = 0)
    return data

data = fares_categorical(data)
# Relabel the three intervals 0, 1, 2 from the lowest to the highest fares. Renaming the
# categories does not depend on the interval edges; the earlier hard-coded
# {(-1.0, 171.0): 0, ...} lookup raised KeyError for any other fare range.
data['fare_bin'] = data['fare_bin'].cat.rename_categories([0, 1, 2])

In [None]:
sns.set(rc = {'figure.figsize':(16,6)})
# Create the figure explicitly. The previous plt.plot(figure_size=...) call drew nothing and
# only created an extra, empty axes on the figure.
fig = plt.figure()

#Boxplot of the binned fare per Survived class, drawn in the left half of the figure
bp_ax = fig.add_subplot(1, 2, 1)
sns.boxplot(x = "Survived", y = "fare_bin", data = data, ax = bp_ax)
bp_ax.set_title('Box plot', fontsize = 20)

#The density-plot panel is disabled for this feature, so the right half stays empty

## Port embarked of survivors?

In [None]:
sns.set(rc = {'figure.figsize':(15,8)})

#Count the rows for every (Embarked, Survived) pair: one row per port, one column per Survived value
df_plot = data.groupby([ 'Survived', 'Embarked']).size().reset_index().pivot(columns='Survived', index='Embarked', values=0)
print(df_plot)

#Plot the stacked bar plot
df_p = df_plot.plot(kind='bar', stacked=True)

#Set labels
df_p.set_xlabel("Embarked", fontsize = 20)
df_p.set_ylabel("Count", fontsize = 20)

#Pair each bar rectangle with a count and its per-port total
class_list = calculate_stacked_plot_percentages(df_plot, df_p)

#Write the percentage labels, shifted 0.2 to the right of each rectangle's left edge
plot_stacked_plot_percentages(class_list, df_p, 0.2)

## Name of the survivors

### Name title

In [None]:
data['Name'].value_counts()

In [None]:
data_name_title_1 = data[data['Name'].str.contains('Miss')]['Name']
data_name_title_2 = data[data['Name'].str.contains('Mrs')]['Name']
data_name_title_3 = data[data['Name'].str.contains('Master')]['Name']
data_name_title_4 = data[data['Name'].str.contains('Mr')]['Name']
data_name_title_5 = data[~ (data['Name'].str.contains('Miss') | data['Name'].str.contains('Mrs') | data['Name'].str.contains('Mr') | data['Name'].str.contains('Master'))]['Name']

In [None]:
data_name_title_5.value_counts()

In [None]:
def name_encoding(data):
    """Add a ``name_title`` column that encodes the title found in each passenger name.

    Codes (stored as floats): 0 = 'Miss', 1 = 'Mrs', 2 = 'Master', 3 = 'Mr',
    4 = none of these. The first matching title in that order wins. Names that are
    missing are given code 4.

    ``data`` is modified in place and returned.
    """
    names = data['Name']
    # Order matters: 'Mr' is a substring of 'Mrs', so 'Mrs' has to be tested before 'Mr'.
    ordered_titles = ['Miss', 'Mrs', 'Master', 'Mr']
    title_matches = [
        names.str.contains(title, regex=False, na=False).to_numpy(dtype=bool)
        for title in ordered_titles
    ]

    # One vectorised assignment replaces the per-row ``data[col].iloc[i] = v`` writes, which
    # were chained assignments (lost under pandas Copy-on-Write) and rebuilt a list per row.
    data['name_title'] = np.select(title_matches, [0.0, 1.0, 2.0, 3.0], default=4.0)
    return data

In [None]:
data = name_encoding(data)

In [None]:
data['name_title'].value_counts()

In [None]:
sns.set(rc = {'figure.figsize':(15,8)})

#Count the rows for every (name_title, Survived) pair: one row per title code, one column per Survived value
df_plot = data.groupby([ 'Survived', 'name_title']).size().reset_index().pivot(columns='Survived', index='name_title', values=0)
print(df_plot)

#Plot the stacked bar plot
df_p = df_plot.plot(kind='bar', stacked=True)

#Set labels
df_p.set_xlabel("name_title", fontsize = 20)
df_p.set_ylabel("Count", fontsize = 20)

#Pair each bar rectangle with a count and its per-title total
class_list = calculate_stacked_plot_percentages(df_plot, df_p)

#Write the percentage labels, shifted 0.2 to the right of each rectangle's left edge
plot_stacked_plot_percentages(class_list, df_p, 0.2)

### Name title (without m/f)

In [None]:
data_name_title_1 = data[data['Name'].str.contains('Miss') | data['Name'].str.contains('Mrs') | data['Name'].str.contains('Mr') | data['Name'].str.contains('Master')]['Name']
data_name_title_2 = data[~ (data['Name'].str.contains('Miss') | data['Name'].str.contains('Mrs') | data['Name'].str.contains('Mr') | data['Name'].str.contains('Master'))]['Name']

In [None]:
data_name_title_2.value_counts()

In [None]:
def name_encoding_1(data):
    """Add a ``name_title_1`` column flagging names without a common title.

    The value is 0.0 when the name contains 'Miss', 'Mrs', 'Mr' or 'Master' and 1.0
    otherwise (missing names count as having no common title).

    ``data`` is modified in place and returned.
    """
    names = data['Name']
    has_common_title = np.zeros(len(data), dtype=bool)
    for title in ('Miss', 'Mrs', 'Mr', 'Master'):
        has_common_title |= names.str.contains(title, regex=False, na=False).to_numpy(dtype=bool)

    # One vectorised assignment replaces the per-row ``data[col].iloc[i] = v`` writes, which
    # were chained assignments (lost under pandas Copy-on-Write) and rebuilt a list per row.
    data['name_title_1'] = np.where(has_common_title, 0.0, 1.0)
    return data

In [None]:
data = name_encoding_1(data)

In [None]:
sns.set(rc = {'figure.figsize':(16,6)})
# Create the two side-by-side panels explicitly. The previous plt.plot(figure_size=...) call
# drew nothing and only created an extra, empty axes on the figure.
fig, (bp_ax, dp_ax) = plt.subplots(1, 2)

#Boxplot: spread of name_title_1 within each Survived class
sns.boxplot(x = "Survived", y = "name_title_1", data = data, ax = bp_ax)
bp_ax.set_title('Box plot', fontsize = 20)

#Density plot: shape of the name_title_1 distribution within each Survived class
sns.kdeplot(x = 'name_title_1', hue = 'Survived', data = data, ax = dp_ax)
dp_ax.set_title('Density plot', fontsize = 20)

### Name length

In [None]:
data['name_length'] = data['Name'].map(lambda x: len(str(x)))

In [None]:
sns.set(rc = {'figure.figsize':(16,6)})
# Create the two side-by-side panels explicitly. The previous plt.plot(figure_size=...) call
# drew nothing and only created an extra, empty axes on the figure.
fig, (bp_ax, dp_ax) = plt.subplots(1, 2)

#Boxplot: spread of name_length within each Survived class
sns.boxplot(x = "Survived", y = "name_length", data = data, ax = bp_ax)
bp_ax.set_title('Box plot', fontsize = 20)

#Density plot: shape of the name_length distribution within each Survived class
sns.kdeplot(x = 'name_length', hue = 'Survived', data = data, ax = dp_ax)
dp_ax.set_title('Density plot', fontsize = 20)

In [None]:
def name_length_categorical(data):
    """Add a ``name_length_bin`` column: ``name_length`` cut into 3 equal-width intervals.

    ``data`` is modified in place and returned.
    """
    length_intervals = pd.cut(data['name_length'], 3, precision = 0)
    data['name_length_bin'] = length_intervals
    return data

In [None]:
data = name_length_categorical(data)
data['name_length_bin'].value_counts().keys()[1]

In [None]:
# Number the three name-length intervals by how many rows fall into each
# (value_counts lists the most populated interval first, so it gets 0).
bins_by_frequency = data['name_length_bin'].value_counts().keys()
map_obj = {(interval.left, interval.right): rank for rank, interval in enumerate(bins_by_frequency[:3])}
data['name_length_bin'] = data['name_length_bin'].map(lambda x: map_obj[(x.left, x.right)])

### Name vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
'''
vectorizer = CountVectorizer()
X1 = vectorizer.fit_transform(data['Name'])
data_name_vectorized = pd.DataFrame(data['Name'].map(lambda x : X1.toarray()[list(data['Name']).index(x)]))'''

In [None]:
test = pd.read_csv('../input/titanic/test.csv')
test_name = test['Name']
name_list = pd.concat([test_name, data['Name']], axis = 0)

In [None]:
#Reference : https://www.kaggle.com/slythe/infamous-titanic-80-accuracy
vectorizer = CountVectorizer()
name_v = vectorizer.fit(name_list)

In [None]:
def name_vectorizer(data):
    """Append one token-count column per vocabulary word of the fitted ``vectorizer``.

    Uses the module-level ``vectorizer`` (fitted in an earlier cell) to transform
    ``data['Name']`` and concatenates the resulting count columns to ``data``.

    Returns a new frame with the extra columns, or ``data`` itself when it has no
    'Name' column.
    """
    if 'Name' not in data.columns:
        return data

    name_counts = vectorizer.transform(data["Name"])

    # get_feature_names() was removed from scikit-learn; use its replacement when present
    # and fall back to the old name on older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        tokens = vectorizer.get_feature_names_out()
    else:
        tokens = vectorizer.get_feature_names()

    # Reuse data's index so concat matches rows one-to-one even when the index is not 0..n-1.
    name_vector_df = pd.DataFrame(data = name_counts.toarray(), columns = tokens, index = data.index)
    return pd.concat([data, name_vector_df], axis = 1)

In [None]:
#data = name_vectorizer(data)

In [None]:
data

In [None]:
'''sns.set(rc = {'figure.figsize':(16,6)})
plt.plot(figure_size = (1,2))

#Boxplots
plt.subplot(1,2,1)
bp_ax = sns.boxplot(x = "Survived",y="name_vectorized", data = data)
bp_ax.set_title('Box plot', fontsize = 20)

#Density plots
plt.subplot(1,2,2)
dp_ax = sns.kdeplot(x = 'name_vectorized', hue = 'Survived', data = data)
dp_ax.set_title('Density plot', fontsize = 20)'''

## Letter of surname

In [None]:
data['Name'].value_counts()

In [None]:
# Number of names starting with each capital letter A-Z
highest = {
    letter: len(data['Name'][data['Name'].str.startswith(letter)])
    for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
}

In [None]:
highest = sorted(highest.items(), key = lambda x:x[1], reverse = True)
print('Maximum occurence of letters in surname beginning :',highest[:5])

In [None]:
def surname_encoding(data):
    """Add a ``surname`` column that encodes the first letter of each name.

    Codes (stored as floats): 'S' = 0, 'M' = 1, 'B' = 2, 'C' = 3, 'H' = 4 and
    5 for every other first letter (including empty or missing names).

    ``data`` is modified in place and returned.
    """
    letter_codes = {'S': 0.0, 'M': 1.0, 'B': 2.0, 'C': 3.0, 'H': 4.0}
    first_letters = data['Name'].str[:1]

    # One vectorised lookup replaces the per-row ``data[col].iloc[i] = v`` writes, which
    # were chained assignments (lost under pandas Copy-on-Write) and rebuilt a list per row.
    data['surname'] = first_letters.map(letter_codes).fillna(5.0).astype(float)
    return data

In [None]:
data = surname_encoding(data)

In [None]:
sns.set(rc = {'figure.figsize':(16,6)})
# Create the two side-by-side panels explicitly. The previous plt.plot(figure_size=...) call
# drew nothing and only created an extra, empty axes on the figure.
fig, (bp_ax, dp_ax) = plt.subplots(1, 2)

#Boxplot: spread of the surname code within each Survived class
sns.boxplot(x = "Survived", y = "surname", data = data, ax = bp_ax)
bp_ax.set_title('Box plot', fontsize = 20)

#Density plot: shape of the surname-code distribution within each Survived class
sns.kdeplot(x = 'surname', hue = 'Survived', data = data, ax = dp_ax)
dp_ax.set_title('Density plot', fontsize = 20)

## Tickets of survivors?

In [None]:
data['Ticket'].value_counts()

- There are a total of 681 different ticket types.
- For starters, let's try to form categories based on whether the ticket string consists only of digits

In [None]:
data_ticket_1 = data[data['Ticket'].str.isdigit() == True]['Ticket']
print(data_ticket_1)

- There are 661 tickets that contain only numbers

In [None]:
data_ticket_2 = data[data['Ticket'].str.isdigit() == False]['Ticket']
print(data_ticket_2.value_counts())

- There are 167 tickets that contain a string plus numbers. For these, we categorise them based on the letter the ticket name starts with, namely 'C', 'P' and 'S' since these are the most common

In [None]:
data_ticket_2_1 = data_ticket_2[data_ticket_2.str.startswith("C")]
print(data_ticket_2_1, len(data_ticket_2_1))

In [None]:
data_ticket_2_2 = data_ticket_2[data_ticket_2.str.startswith("S")]
print(data_ticket_2_2, len(data_ticket_2_2))

In [None]:
data_ticket_2_3 = data_ticket_2[data_ticket_2.str.startswith("P")]
print(data_ticket_2_3, len(data_ticket_2_3))

In [None]:
data_ticket_2_4 = data_ticket_2[~ (data_ticket_2.str.startswith("C") | data_ticket_2.str.startswith("S") | data_ticket_2.str.startswith("P"))]
print(data_ticket_2_4, len(data_ticket_2_4))

In [None]:
def ticket_bin_encoding(data):
    """Add a ``ticket_bin`` column telling digit-only tickets from all others.

    The value is 0.0 when the ticket string consists only of digits, 1.0 when it does
    not, and NaN when the ticket is missing.

    ``data`` is modified in place and returned.
    """
    is_digits_only = data['Ticket'].str.isdigit()
    digit_ticket = (is_digits_only == True).to_numpy(dtype=bool)
    other_ticket = (is_digits_only == False).to_numpy(dtype=bool)

    # One vectorised assignment replaces the per-row ``data[col].iloc[i] = v`` writes, which
    # were chained assignments (lost under pandas Copy-on-Write) and rebuilt a list per row.
    data['ticket_bin'] = np.select([digit_ticket, other_ticket], [0.0, 1.0], default=np.nan)
    return data

In [None]:
data = ticket_bin_encoding(data)

In [None]:
data['ticket_bin'].value_counts()

In [None]:
sns.set(rc = {'figure.figsize':(15,8)})

#Count the rows for every (ticket_bin, Survived) pair: one row per ticket code, one column per Survived value
df_plot = data.groupby([ 'Survived', 'ticket_bin']).size().reset_index().pivot(columns='Survived', index='ticket_bin', values=0)
print(df_plot)

#Plot the stacked bar plot
df_p = df_plot.plot(kind='bar', stacked=True)

#Set labels
df_p.set_xlabel("ticket_bin", fontsize = 20)
df_p.set_ylabel("Count", fontsize = 20)

#Pair each bar rectangle with a count and its per-ticket-code total
class_list = calculate_stacked_plot_percentages(df_plot, df_p)

#Write the percentage labels, shifted 0.2 to the right of each rectangle's left edge
plot_stacked_plot_percentages(class_list, df_p, 0.2)

## Cabin of the survivors

In [None]:
data['Cabin'].value_counts()

In [None]:
def cabin_encoding(data):
    """Add a ``cabin_bin`` column that encodes the first letter of the cabin.

    Codes (stored as floats): 'A' = 0, 'B' = 1, 'C' = 2, 'D' = 3, 'E' = 4,
    any other recorded cabin = 5, and 6 when the cabin is missing (NaN or the
    ``'nan'`` placeholder string).

    The earlier version could never assign 6: the placeholder ``'nan'`` does not start
    with A-E, so it was already caught by the "other" bucket (5) before the ``'nan'``
    branch was reached.

    ``data`` is modified in place and returned.
    """
    # Local copy with missing cabins normalised to the placeholder; data['Cabin'] is untouched.
    cabins = data['Cabin'].fillna('nan').astype(str)
    deck_codes = {'A': 0.0, 'B': 1.0, 'C': 2.0, 'D': 3.0, 'E': 4.0}

    codes = cabins.str[:1].map(deck_codes).fillna(5.0).astype(float)
    # Missing cabins get their own code instead of being merged into "other".
    codes = codes.mask(cabins == 'nan', 6.0)

    data['cabin_bin'] = codes
    return data

In [None]:
data = cabin_encoding(data)

In [None]:
sns.set(rc = {'figure.figsize':(16,6)})
# Create the two side-by-side panels explicitly. The previous plt.plot(figure_size=...) call
# drew nothing and only created an extra, empty axes on the figure.
fig, (bp_ax, dp_ax) = plt.subplots(1, 2)

#Boxplot: spread of cabin_bin within each Survived class
sns.boxplot(x = "Survived", y = "cabin_bin", data = data, ax = bp_ax)
bp_ax.set_title('Box plot', fontsize = 20)

#Density plot: shape of the cabin_bin distribution within each Survived class
sns.kdeplot(x = 'cabin_bin', hue = 'Survived', data = data, ax = dp_ax)
dp_ax.set_title('Density plot', fontsize = 20)

## Number of family members of the survivors ? (SibSp, Parch)

In [None]:
sns.set(rc = {'figure.figsize':(16,6)})
# Create the two side-by-side panels explicitly. The previous plt.plot(figure_size=...) call
# drew nothing and only created an extra, empty axes on the figure.
fig, (bp_ax, dp_ax) = plt.subplots(1, 2)

#Boxplot: spread of SibSp within each Survived class
sns.boxplot(x = "Survived", y = "SibSp", data = data, ax = bp_ax)
bp_ax.set_title('Box plot', fontsize = 20)

#Density plot: shape of the SibSp distribution within each Survived class
sns.kdeplot(x = 'SibSp', hue = 'Survived', data = data, ax = dp_ax)
dp_ax.set_title('Density plot', fontsize = 20)

In [None]:
sns.set(rc = {'figure.figsize':(16,6)})
# Create the two side-by-side panels explicitly. The previous plt.plot(figure_size=...) call
# drew nothing and only created an extra, empty axes on the figure.
fig, (bp_ax, dp_ax) = plt.subplots(1, 2)

#Boxplot: spread of Parch within each Survived class
sns.boxplot(x = "Survived", y = "Parch", data = data, ax = bp_ax)
bp_ax.set_title('Box plot', fontsize = 20)

#Density plot: shape of the Parch distribution within each Survived class
sns.kdeplot(x = 'Parch', hue = 'Survived', data = data, ax = dp_ax)
dp_ax.set_title('Density plot', fontsize = 20)

- Clearly, people who had more family aboard had a higher chance of survival

In [None]:
data['family'] = data['SibSp'] + data['Parch']

In [None]:
sns.set(rc = {'figure.figsize':(16,6)})
# Create the two side-by-side panels explicitly. The previous plt.plot(figure_size=...) call
# drew nothing and only created an extra, empty axes on the figure.
fig, (bp_ax, dp_ax) = plt.subplots(1, 2)

#Boxplot: spread of the family size (SibSp + Parch) within each Survived class
sns.boxplot(x = "Survived", y = "family", data = data, ax = bp_ax)
bp_ax.set_title('Box plot', fontsize = 20)

#Density plot: shape of the family-size distribution within each Survived class
sns.kdeplot(x = 'family', hue = 'Survived', data = data, ax = dp_ax)
dp_ax.set_title('Density plot', fontsize = 20)

## Ticket class of the survivors? (Pclass)

In [None]:
sns.set(rc = {'figure.figsize':(15,8)})

#Count the rows for every (Pclass, Survived) pair: one row per ticket class, one column per Survived value
df_plot = data.groupby([ 'Survived', 'Pclass']).size().reset_index().pivot(columns='Survived', index='Pclass', values=0)
print(df_plot)

#Plot the stacked bar plot
df_p = df_plot.plot(kind='bar', stacked=True)

#Set labels
df_p.set_xlabel("Pclass", fontsize = 20)
df_p.set_ylabel("Count", fontsize = 20)

#Pair each bar rectangle with a count and its per-class total
class_list = calculate_stacked_plot_percentages(df_plot, df_p)

#Write the percentage labels, shifted 0.2 to the right of each rectangle's left edge
plot_stacked_plot_percentages(class_list, df_p, 0.2)

- Around 500 passengers belonged to Class 3 which is the majority of the passengers and 75 % of them did not survive
- Class 1 and Class 2 each had around half of the total Class 3 count but a higher percentage of survivors
- Clearly, **percentage of Class 1 and Class 2 survivors is higher (63 % and 47 %) than the percentage of Class 3 survivors (24 %)**

# Data preprocessing

In [None]:
data.columns

In [None]:
def combine_categorical_vars(data, col1,col2):
    """Add a column ``<col1>_<col2>`` labelling each distinct (col1, col2) combination.

    Both columns are one-hot encoded; every row's one-hot bits are read as a binary
    number, and each distinct number is mapped to a small integer label (0, 1, ...).
    Rows with the same pair of values get the same label. The label numbers carry no
    order: they follow the iteration order of a set of the distinct numbers.

    ``data`` is modified in place and returned.
    """
    # Keep the two columns in the order they appear in data (not argument order), since
    # that order determines the bit layout and therefore the label numbering.
    pair = [col for col in data.columns if col in (col1, col2)]
    one_hot_df = pd.get_dummies(data[pair], columns = pair)

    # Force 0/1 integers: get_dummies returns booleans on pandas >= 2.0, and joining
    # 'True'/'False' strings made int(..., 2) raise ValueError.
    bit_rows = one_hot_df.to_numpy().astype(int)
    arr = [int("".join(str(bit) for bit in row), 2) for row in bit_rows]

    #Map each distinct int value to an index (0,1,..)
    map_dict = {value: index for index, value in enumerate(set(arr))}

    #Get the final labels
    data[col1+'_'+col2] = [map_dict[value] for value in arr]
    return data

In [None]:
data = combine_categorical_vars(data, 'Pclass','fare_bin')
data = combine_categorical_vars(data, 'Pclass_fare_bin', 'cabin_bin')
data = combine_categorical_vars(data, 'Sex','name_title')

data

In [None]:
drop_list = ['Survived','PassengerId', 'Name', 'Ticket', 'Cabin', 'age_bin',  'SibSp', 'Parch','name_title','name_title_1', 'fare_bin', 'Pclass_fare_bin','name_length']
#drop_list = ['Survived','PassengerId', 'Name', 'Ticket', 'Cabin', 'age_bin',  'SibSp', 'Parch']

X = data.drop(columns =drop_list)
y = data['Survived']

print(X.columns)

In [None]:
cols_dummies = ['ticket_bin', 'surname', 'name_length_bin', 'Embarked','cabin_nan','Pclass']
#cols_dummies = ['ticket_bin', 'surname','Pclass', 'Embarked', 'Pclass_fare_bin_cabin_bin', 'Sex_name_title']
X = pd.get_dummies(X, columns = cols_dummies)

In [None]:
# 'Pclass' was previously spelled 'pClass', which can never match a column of X.
categorical_features = ['Sex', 'Pclass', 'Embarked' ]
numerical_features = ['Age', 'SibSp', 'Parch', 'Fare', 'family','name_length']

# Integer-encode every listed categorical column that is still present in X.
for col in categorical_features:
    if col in X.columns:
        try:
            le = LabelEncoder()
            X[col]= le.fit_transform(X[col])
        except (TypeError, ValueError) as err:
            # Keep going as before, but report the failure instead of hiding it.
            print('Could not label-encode', col, ':', err)

# Rescale every listed numeric column that is still present in X to the [0, 1] range.
for col in numerical_features:
    if col in X.columns:
        scaler = MinMaxScaler()
        X[col]= scaler.fit_transform(np.array(X[col]).reshape(-1,1)).ravel()

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Building the model: an ensemble of 100 extremely randomised trees
extra_tree_forest = ExtraTreesClassifier(n_estimators = 100)
  
# Training the model on the full feature matrix X and target y
extra_tree_forest.fit(X,y)
  
# Importance of each feature as reported by the fitted ensemble
feature_importance = extra_tree_forest.feature_importances_
  
# Spread of each feature's importance across the individual trees.
# NOTE(review): despite the variable name, this is the per-feature standard deviation of the
# trees' importances, not a normalised importance - confirm this is the quantity to rank by.
feature_importance_normalized = np.std([tree.feature_importances_ for tree in 
                                        extra_tree_forest.estimators_],
                                        axis = 0)

In [None]:
ind = np.argsort(feature_importance_normalized)[::-1]   #indices sorted in descending order
X_val = [X.columns[ind[i]] for i in range(0,7)]
y_val = [feature_importance_normalized[ind[i]] for i in range(0,7)]

In [None]:
plt.figure(figsize = (10,10))
plt.barh(X_val, y_val, color = ['b', 'g', 'r', 'c', 'm', 'y', 'k'])
# Horizontal bars: the values run along the x axis and the feature names along the y axis
# (the two labels were previously swapped).
plt.xlabel('Normalized Feature Importances')
plt.ylabel('Feature Labels')
plt.title('Comparison of different Feature Importances')
plt.show()

In [None]:
#X = X.drop(columns = [cols for cols in X.columns if cols not in X_val])

In [None]:
# Display the column names of the feature frame X
X.columns

In [None]:
#Train test stratified split (seeded so the same rows are held out on every run)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.1, random_state=42)

In [None]:
# Large square canvas for the annotated correlation matrix of the features
sns.set(rc={'figure.figsize': (15, 15)})
feature_corr = X.corr()
corr_palette = sns.diverging_palette(50, 500, n=500)
#vmin=-1, vmax=1, center=0 were tried and left disabled
sns.heatmap(feature_corr, cmap=corr_palette, square=True, annot=True)

- name_title and Sex
- combining fare, cabin_bin and Pclass

In [None]:
# Display the first five rows of the training split
X_train[:5]

In [None]:
# Disabled cell: the two helpers below sit inside a string literal, so they are never defined.
# As written they would fit the encoder/scaler on X_train and only transform X_test.
'''def categorical_encoding(X_train, X_test, col):
    if col in X_train.columns:
        le = LabelEncoder()
        X_train[col]= le.fit_transform(X_train[col])
        X_test[col]= le.transform(X_test[col])
    
    return X_train, X_test


def normalize(X_train, X_test, col):
    if col in X_train.columns:
        le = MinMaxScaler()
        X_train[col]= le.fit_transform(np.array(X_train[col]).reshape(-1,1))
        X_test[col]= le.transform(np.array(X_test[col]).reshape(-1,1))
    
    return X_train, X_test'''

In [None]:
# Disabled cell: the code is a string literal and does not run.
# NOTE(review): if re-enabled, the loops bind the second return value to `Xtest`, not `X_test`,
# and 'pClass' does not match the 'Pclass' spelling used elsewhere.
'''categorical_features = ['Sex', 'pClass', 'Embarked' ]
numerical_features = ['Age', 'SibSp', 'Parch', 'Fare', 'family']

for col in categorical_features:
    X_train, Xtest = categorical_encoding(X_train, X_test, col)

for col in numerical_features:
    X_train, Xtest = normalize(X_train, X_test, col)
print(X_train[:5], X_test[:5])'''

## Correlation w target variable

In [None]:
# Work on a copy: the encoding and scaling below would otherwise overwrite
# the columns of `data` itself, changing what every earlier cell sees on re-run
X = data.copy()
categorical_features_ = ['Sex', 'Pclass', 'Embarked']
numerical_features_ = ['Age', 'SibSp', 'Parch', 'Fare']

# Integer-code the categorical columns so they can enter the correlation matrix
for col in categorical_features_:
    if col in X.columns:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])

# Rescale the numerical columns to [0, 1]
for col in numerical_features_:
    if col in X.columns:
        scaler = MinMaxScaler()
        X[col] = scaler.fit_transform(np.array(X[col]).reshape(-1,1))

In [None]:
sns.set(rc = {'figure.figsize':(14,14)})
# X may still hold text columns here (it is built from the full `data` frame);
# keep only numeric/boolean columns so corr() does not fail on pandas
# versions that no longer drop non-numeric columns silently
numeric_X = X.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_X.corr(),
            #vmin=-1, vmax=1, center=0,
            cmap=sns.diverging_palette(50, 500, n=500),
            square=True,
            annot=True)

- name_title is highly negatively correlated with the output class
- Fare and Pclass are also highly negatively correlated
- Pclass and the output class are moderately negatively correlated

### Visualizing

In [None]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

# The embedding is computed on the training split only
data_1000 = X_train
labels_1000 = y_train

# Two output dimensions and a fixed seed; every other t-SNE setting
# (perplexity, learning rate, number of iterations) is left at the library default
model = TSNE(n_components = 2, random_state = 0)

tsne_data = model.fit_transform(data_1000)

# Append the labels as a third column and wrap everything in a data frame
# that is convenient for plotting
tsne_data = np.vstack((tsne_data.T, labels_1000)).T
tsne_df = pd.DataFrame(data = tsne_data, columns = ("Dim_1", "Dim_2", "label"))

# Plotting the t-SNE result, one colour per label
# (`height` is the current name of FacetGrid's former `size` keyword)
sns.FacetGrid(tsne_df, hue = "label", height = 6).map(plt.scatter, 'Dim_1', 'Dim_2').add_legend()

plt.show()

# Baseline Model

As a baseline to compare later models against, I used a RandomForest model, since it is a simple, robust and high-performing classification algorithm.

In [None]:
#https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html#sklearn.model_selection.StratifiedKFold
from sklearn.model_selection import StratifiedKFold

# Stratified folds keep the class proportions of y_train in every fold;
# the seed makes the fold assignment repeatable
skf = StratifiedKFold(n_splits=15, shuffle=True, random_state=42)

for fold, (train_idx, test_idx) in enumerate(skf.split(X_train, y_train), start=1):
    rf = RandomForestClassifier(n_estimators = 500)
    rf.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
    # accuracy_score expects (y_true, y_pred)
    fold_acc = accuracy_score(y_train.iloc[test_idx], rf.predict(X_train.iloc[test_idx]))
    print('Accuracy for fold', fold, 'of the CV is', fold_acc)

In [None]:
# Baseline: fit on the whole training split, score on the held-out split
rf = RandomForestClassifier(n_estimators = 500)
rf.fit(X_train, y_train)
# accuracy_score expects (y_true, y_pred)
print('Final accuracy score :',  accuracy_score(y_test, rf.predict(X_test)))

## Hyperparameter optimization using hyperopt

In [None]:
#Reference : https://www.kaggle.com/sivasaiyadav8143/10-hyperparameter-optimization-frameworks#3.-Hyperopt ; https://www.kaggle.com/maxdiazbattan/titanic-top-3-eda-f-eng-avg-6-models-optuna
criterion_list = ["gini", "entropy"]

def hyperopt_train_test(params):
    """Return the mean cross_val_score of a RandomForestClassifier built from `params` on the training split."""
    clf = RandomForestClassifier(**params)
    return cross_val_score(clf, X_train, y_train).mean()

# Every dimension is an hp.choice over a list/range of candidate values
space = {
    'max_depth': hp.choice('max_depth', range(1,20)),
    'max_features': hp.choice('max_features', range(1,5)),
    'n_estimators': hp.choice('n_estimators', range(1,1000)),
    'criterion': hp.choice('criterion', criterion_list)
            }

# Best score seen so far; kept separate from `best`, which holds fmin's result
best_acc = 0

def f(params):
    """hyperopt objective: score `params`, log every new best, return the negated score as the loss."""
    global best_acc
    acc = hyperopt_train_test(params)
    if acc > best_acc:
        best_acc = acc
        print('new best:', best_acc, params)
    return {'loss': -acc, 'status': STATUS_OK}

trials = Trials()
best = fmin(f, space, algo=tpe.suggest, max_evals=10, trials=trials)
print(best)

In [None]:
# Display the result returned by fmin
best

In [None]:
#rf = RandomForestClassifier(criterion = 'gini', max_depth= 10, n_estimators= 500)
# For hp.choice dimensions fmin reports the *index* of the selected option,
# not the option itself; space_eval maps the result back to real values
best_rf_params = space_eval(space, best)
rf = RandomForestClassifier(**best_rf_params)
rf.fit(X_train, y_train)
# accuracy_score expects (y_true, y_pred)
print('Final accuracy score :',  accuracy_score(y_test, rf.predict(X_test)))

# Model 1: Logistic Regression, GLM

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Disabled cell: the Logistic Regression search is a string literal and does not run.
# NOTE(review): range(0,100) includes C = 0, and not every listed penalty is
# supported by every solver — verify both before re-enabling.
'''penalty_list = ['l1', 'l2', 'elasticnet', 'none']

def hyperopt_train_test(params):
    clf = LogisticRegression(**params)
    return cross_val_score(clf, X_train, y_train).mean()

space = {
    'penalty': hp.choice('penalty', penalty_list),
    'C': hp.choice('C', range(0,100))
            }
best = 0
def f(params):
    global best
    acc = hyperopt_train_test(params)
    if acc > best:
      best = acc
      print( 'new best:', best, params)
    return {'loss': -acc ,'status': STATUS_OK}
trials = Trials()
best = fmin(f, space, algo=tpe.suggest, max_evals=300, trials=trials)
print(best)'''

In [None]:
# Disabled cell: a string literal, so `best` is not evaluated here
'''best'''

In [None]:
# Disabled cell: `lr` is never created because this code is a string literal
'''lr = LogisticRegression(C = best['C'], penalty = penalty_list[best['penalty']])
lr.fit(X_train, y_train)
print('Final accuracy score :',  accuracy_score(lr.predict(X_test), y_test))'''

# Model 2 : Generalized Additive Models

In [None]:
!pip install pygam
from pygam import LogisticGAM
import pygam

In [None]:
# Fit a logistic GAM with its default terms on the training split, then print its summary
gam = LogisticGAM()
gam.fit(X_train, y_train)
gam.summary()

In [None]:
# Accuracy of the fitted GAM on the held-out split
gam.accuracy(X_test, y_test)

In [None]:
# One panel per training column, titled with the column name
titles = X_train.columns
fig, axs = plt.subplots(1, len(titles))

for i, ax in enumerate(axs):
    XX = gam.generate_X_grid(term=i)
    grid_values = XX[:, i]
    pdep, confi = gam.partial_dependence(term=i, width=.95)

    # Partial dependence as a solid line, its interval as dashed red lines
    ax.plot(grid_values, pdep)
    ax.plot(grid_values, confi, c='r', ls='--')
    ax.set_title(titles[i])

# Model 3: Ensembles

In [None]:
import lightgbm as lgb
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [None]:
criterion_list = ["gini", "entropy"]

def hyperopt_train_test(params):
    """Return the mean cross_val_score of an XGBClassifier built from `params` on the training split."""
    clf = XGBClassifier(**params)
    return cross_val_score(clf, X_train, y_train).mean()

# Every dimension is an hp.choice over a list/range of candidate values.
# NOTE(review): 'max_features' looks like a leftover from the RandomForest
# search, and a 'subsample' of 0 would sample no rows — confirm both are intended.
space = {
    'learning_rate' : hp.choice('learning_rate', [0.001,0.005, 0.01, 0.05, 0.1, 0.2, 0.5, 0.6,1]),
    'max_depth': hp.choice('max_depth', range(1,20)),
    'max_features': hp.choice('max_features', range(1,5)),
    'n_estimators': hp.choice('n_estimators', range(1,1000)),
    'subsample' : hp.choice('subsample', [0,0.5,1])
            }

# Best score seen so far; kept separate from `best`, which holds fmin's result
best_acc = 0

def f(params):
    """hyperopt objective: score `params`, log every new best, return the negated score as the loss."""
    global best_acc
    acc = hyperopt_train_test(params)
    if acc > best_acc:
        best_acc = acc
        print('new best:', best_acc, params)
    return {'loss': -acc, 'status': STATUS_OK}

trials = Trials()
best = fmin(f, space, algo=tpe.suggest, max_evals=10, trials=trials)
print(best)

In [None]:
# Display the result returned by fmin
best

In [None]:
# For hp.choice dimensions fmin reports the *index* of the selected option
# (e.g. 4 instead of learning_rate 0.1); space_eval maps it back to real values
best_xgb_params = space_eval(space, best)
xgb = XGBClassifier(subsample = best_xgb_params['subsample'],
                    learning_rate = best_xgb_params['learning_rate'],
                    max_depth = best_xgb_params['max_depth'],
                    n_estimators = best_xgb_params['n_estimators'],
                    objective = 'binary:logistic')
xgb.fit(X_train, y_train)
# accuracy_score expects (y_true, y_pred)
print('Final accuracy score :',  accuracy_score(y_test, xgb.predict(X_test)))

# Voting Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor, RandomForestRegressor, VotingClassifier, VotingRegressor

In [None]:
# Disabled cell: the voting ensemble is a string literal and is never built.
# It refers to `lr`, which is only created inside another disabled cell.
'''vc = VotingClassifier([("xgb_m",xgb),
                       ("log",lr),
                       ("rf_m",rf)],
                      voting = "hard")

vc.fit(X_train,y_train)'''

In [None]:
# Disabled cell: string literal, nothing is printed
'''print('Final accuracy score :',  accuracy_score(vc.predict(X_test), y_test))'''

# Test predictions

In [None]:
# Load the Titanic test CSV from the input directory and display it
test = pd.read_csv('../input/titanic/test.csv')
test

In [None]:
# Column names of the raw test frame; clean_data below addresses them by position
print(test.columns)

In [None]:
#categorical_features = ['Sex', 'pClass', 'Embarked' ]
#numerical_features = ['Age', 'SibSp', 'Parch', 'Fare', 'family', 'name_length']

def clean_data(test):
    """Fill missing 'Age' / 'Fare' values through impute_missing_data.

    The columns at positions 1, 3, 4, 5, 6, 7, 8 and 10 of `test` are
    visited; of those, only 'Age' and 'Fare' are imputed (median=True,
    mode=False), and only when they contain at least one missing value.
    Returns the resulting frame.
    """
    inspected_positions = [1, 3, 4, 5, 6, 7, 8, 10]
    inspected_columns = [test.columns[pos] for pos in inspected_positions]

    for col in inspected_columns:
        if col not in ('Age', 'Fare'):
            continue
        if test[col].isna().any():
            test = impute_missing_data(test, col, median =True, mode=False)

    return test


def preprocess(test):
    """Turn the cleaned test frame into the feature frame handed to the models.

    Adds the engineered columns through the notebook's feature helpers,
    one-hot encodes the columns in the module-level `cols_dummies`, drops the
    columns that are not used as features, then label-encodes / min-max scales
    whichever columns of the module-level `categorical_features` and
    `numerical_features` lists are still present.

    Parameters
    ----------
    test : pandas.DataFrame
        Test frame. Its 'Cabin' and 'cabin_nan' columns are written in place
        before the frame is reassigned by the helpers.

    Returns
    -------
    pandas.DataFrame
        The transformed frame.

    Raises
    ------
    KeyError
        If a fare or name-length interval is missing from the lookup tables
        built below.
    """
    test['Cabin'] = test['Cabin'].fillna('nan')
    # 1 where the cabin equals the 'nan' placeholder filled in just above, else 0
    test['cabin_nan'] = (test['Cabin'] == 'nan').astype(int)

    test = ticket_bin_encoding(test)
    test = name_encoding(test)
    test['family'] = test['SibSp'] + test['Parch']
    test['name_length'] = test['Name'].map(lambda x: len(str(x)))
    #test = name_vectorizer(test)
    test = cabin_encoding(test)
    test = name_encoding_1(test)
    test = surname_encoding(test)
    test = fares_categorical(test)

    # Fare interval (left, right) -> ordinal code
    map_obj = {(-1.0, 171.0) : 0, (171.0, 342.0): 1,(342.0, 512.0):2}
    test['fare_bin'] = test['fare_bin'].map(lambda x: map_obj[(x.left, x.right)])
    test = combine_categorical_vars(test, 'Pclass','fare_bin')
    test = combine_categorical_vars(test, 'Pclass_fare_bin', 'cabin_bin')
    test = combine_categorical_vars(test, 'Sex','name_title')
    test = name_length_categorical(test)

    # Name-length interval -> 0, 1, 2, numbered in value_counts order
    length_bins = test['name_length_bin'].value_counts().keys()
    map_obj = {(length_bins[k].left, length_bins[k].right): k for k in range(3)}
    test['name_length_bin'] = test['name_length_bin'].map(lambda x: map_obj[(x.left, x.right)])

    drop_list = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch','name_title','name_title_1', 'fare_bin', 'Pclass_fare_bin','name_length']

    test = pd.get_dummies(test, columns = cols_dummies)

    test = test.drop(columns = drop_list)##Incl cabin

    # NOTE(review): the encoders and scalers below are fitted on the test frame
    # itself, independently of anything fitted on the training data, and the
    # dummy columns are not aligned with the training columns — confirm intended.
    for col in categorical_features:
        if col in test.columns:
            try:
                le = LabelEncoder()
                test[col]= le.fit_transform(test[col])
            except Exception as err:
                # Best effort: the column is left untouched, but the failure is reported
                print('Skipping label encoding of', col, ':', err)

    for col in numerical_features:
        if col in test.columns:
            scaler = MinMaxScaler()
            test[col]= scaler.fit_transform(np.array(test[col]).reshape(-1,1))

    return test

In [None]:
# Impute first, then report for each inspected column whether any value is still missing
test = clean_data(test)

inspected_columns = [test.columns[pos] for pos in [1,3,4,5,6,7,8,10]]
for col in inspected_columns:
    still_missing = any(test[col].isna())
    print(col,':', still_missing)

# Feature engineering, encoding and scaling of the test frame
test = preprocess(test)

In [None]:
# Display the test frame as it stands after preprocessing
test

In [None]:
# Load the sample submission file; its 'Survived' column is overwritten further down
submission = pd.read_csv('../input/titanic/gender_submission.csv')
submission

In [None]:
# GAM predictions for the test frame, recoded as plain 0/1 integers
pred = gam.predict(test)
gam_pred = [int(flag == True) for flag in pred]

'''rf_pred = rf.predict(test)

lr_pred = lr.predict(test)

xgb_pred =xgb.predict(test)

vc_pred = vc.predict(test)'''

In [None]:
# Disabled cell (string literal): weighted vote in which the second model (j, rf_pred) counts twice
'''final_pred = [1 if (i + 2*j+k)/4 >=0.5 else 0 for i,j,k in zip(gam_pred, rf_pred, lr_pred)]'''

In [None]:
# Fill the submission with the predictions of `rf` for the preprocessed test frame.
# Alternatives that were swapped in here: final_pred, vc_pred, gam_pred.
submission['Survived'] = rf.predict(test)

In [None]:
# Display the submission frame with the filled-in 'Survived' column
submission

In [None]:
# Write the submission to the working directory, without the index column
submission.to_csv('./submission.csv', index = False)

## Correlation between features and target variable

In [None]:
# Pairwise correlations between the columns of the preprocessed test frame
test_corr = test.corr()
test_palette = sns.diverging_palette(50, 500, n=500)
#vmin=-1, vmax=1, center=0 were tried and left disabled
sns.heatmap(test_corr, cmap=test_palette, square=True)

In [None]:
# Correlation of each test column with the 'Survived' values stored in the submission
corr_with_prediction = dict(test.corrwith(submission["Survived"]))
plt.bar(corr_with_prediction.keys(), corr_with_prediction.values())

# Results:

- Submission #1 : Scored 0.57: Used only features Pclass  Sex       Age     SibSp  Parch      Fare  Embarked, and basic normalization + categorical encoding + RF with 500 estimators
- Submission #2: Scored 0.67 after dropping nan valued features while training + imputing test data with mean values
- Submission #3: Scored 0.75 after imputing nan values with the median of the feature
- Submission #4: Logistic GAM instead of RF improved score slightly (0.76)
- Submission #5: Scored 0.77 with RF(with optimized hyperparams) (Score improved to 0.772 w results of RF + twice weighted results of GAM)
- Submission #6: Scored 0.775. Using the ticket bin encoding and name title features together increased accuracy to 0.775, combined with weighted GAM and RF predictions (~top 75% on Kaggle). Hmm, let's see what else we can try.
- Added cabin_nan: accuracy decreased
- => Features : [ Pclass, Sex, Age, SibSp, Parch, Fare, Embarked, name_title] +RF (optimized) =>0.7799
- => Features : [Pclass  Sex     Age  Embarked  name_title  family] +RF (optimized) => 0.78468
- => Features : [Pclass  Sex     Age  Embarked  family] +RF (optimized) => 0.78708