# **Titanic Survival Prediction**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Build a seaborn colour palette from 'magma' and register it as the plotting default.
palette=sns.color_palette('magma')
sns.set(palette=palette)

In [None]:
# Load the train and test CSV files, using PassengerId as the row index of each frame.
train_data, test_data = (
    pd.read_csv(csv_path, index_col='PassengerId')
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
    )
)

train_data.head()

# **EDA**

In [None]:
# Name and Ticket are not used as features for now, so leave them out of the working frame.
train_data_clean = train_data.drop(columns=['Name', 'Ticket'])

In [None]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
train_data_clean.info()

In [None]:
#I noticed that there are over 600 null values in the Cabin column which is over 70%
#Extracting the cabin letter (first word character of Cabin) into a new column
#The raw string avoids the invalid "\w" escape-sequence warning; expand=False returns a Series
#instead of a one-column DataFrame
train_data_clean['Cabin Letter'] = train_data_clean['Cabin'].str.extract(r'(\w)', expand=False)
train_data_clean.head()

In [None]:
#Filling the null cabin letters with 'Unknown' as a value
#Assign the result back instead of fillna(inplace=True) on a selected column: the in-place form is
#chained assignment, which warns on recent pandas and does not update the frame under Copy-on-Write
train_data_clean['Cabin Letter'] = train_data_clean['Cabin Letter'].fillna('Unknown')

#Bar chart of passengers per cabin letter (drawn once)
train_data_clean['Cabin Letter'].value_counts().plot(kind='bar')
plt.xlabel('Cabin Letter')
plt.ylabel('Count')
plt.show()

In [None]:
# The raw Cabin column is no longer needed once the cabin letter has been extracted.
train_data_clean = train_data_clean.drop(columns=['Cabin'])

In [None]:
#Embarked has only 2 missing values so im gonna drop them
#.copy() makes the filtered frame independent of the original, so adding columns to it later
#does not raise SettingWithCopyWarning
train_data_clean = train_data_clean[train_data_clean['Embarked'].notna()].copy()
train_data_clean.info()
#Age has some missing values ill be back to that later

In [None]:
#Creating a new feature 'relatives' which consists of SibSp+Parch
#assign() builds a new frame, so this works on the filtered data without SettingWithCopyWarning
train_data_clean = train_data_clean.assign(
    relatives=train_data_clean['SibSp'] + train_data_clean['Parch']
)

In [None]:
# Bar chart of the number of passengers per Survived value.
survived_counts = train_data_clean['Survived'].value_counts()
survived_ax = survived_counts.plot.bar()
survived_ax.set_xlabel('Survived')
survived_ax.set_ylabel('Count')
plt.show()

In [None]:
# Strip plot of Age (y-axis) grouped by the Survived value (x-axis).
age_fig, age_ax = plt.subplots()
sns.stripplot(x='Survived', y='Age', data=train_data_clean, ax=age_ax)
plt.show()

In [None]:
# Strip plot of Fare (y-axis) grouped by the Survived value (x-axis).
fare_fig, fare_ax = plt.subplots()
sns.stripplot(x='Survived', y='Fare', data=train_data_clean, ax=fare_ax)
plt.show()

In [None]:
# Bar chart of the number of passengers per Embarked value.
embarked_counts = train_data_clean['Embarked'].value_counts()
embarked_ax = embarked_counts.plot.bar()
embarked_ax.set_xlabel('Embarked')
embarked_ax.set_ylabel('Count')
plt.show()

In [None]:
# Bar chart of the number of passengers per Pclass value.
pclass_counts = train_data_clean['Pclass'].value_counts()
pclass_ax = pclass_counts.plot.bar()
pclass_ax.set_xlabel('Pclass')
pclass_ax.set_ylabel('Count')
plt.show()

In [None]:
#Correlation Heatmap
#numeric_only=True restricts the correlation to numeric columns; without it pandas >= 2.0 raises
#on the string-valued columns that are still in the frame
plt.figure(figsize=(16,10))
sns.heatmap(train_data_clean.corr(numeric_only=True), annot=True)
plt.show()

# **Preprocessing**

In [None]:
# Separate the target column (Survived) from the feature columns.
y = train_data_clean['Survived']
X = train_data_clean.drop(columns='Survived')

X.head()

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler

# Feature columns stored with the object dtype are treated as categorical.
categorical_cols = [name for name, dtype in X.dtypes.items() if dtype == 'object']
categorical_cols

In [None]:
# Feature columns stored as int64 or float64 are treated as numerical.
numerical_cols = [name for name, dtype in X.dtypes.items() if dtype in ['int64', 'float64']]
numerical_cols

In [None]:
# Numerical features (Age, in the training data): impute missing values with the mean, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(numeric_steps)

In [None]:
# Categorical features: impute missing values with the most frequent value, then one-hot encode.
# handle_unknown='ignore' means a category first seen in the test set does not raise an error.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

In [None]:
# Route the numerical and categorical column groups through their own transformer pipelines.
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)
preprocessor

# **Model Pipelines**

Note: I determined the model hyperparameters using GridSearchCV, but I didn't include the search in this notebook.

In [None]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Classifier with fixed hyperparameters, chained after the shared preprocessor.
rfc = RandomForestClassifier(max_depth=6, min_samples_leaf=1, min_samples_split=2, random_state=0)
rfc_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', rfc),
])

# 5-fold cross-validation of the whole pipeline on X and y.
scores = cross_val_score(rfc_pipeline, X, y, cv=5)
print('Random Forest Scores:', scores)
print('Random Forest:', scores.mean())


In [None]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# L2-regularised logistic regression with fixed hyperparameters, chained after the shared preprocessor.
log_reg_classifier = LogisticRegression(C=0.1, penalty='l2', solver='liblinear', random_state=0)
log_reg_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', log_reg_classifier),
])

# 5-fold cross-validation of the whole pipeline on X and y.
scores = cross_val_score(log_reg_pipeline, X, y, cv=5)
print('Logistic Regression Scores:', scores)
print('Logistic Regression:', scores.mean())

In [None]:
#XGBoost
from xgboost import XGBClassifier

# Gradient-boosted trees with fixed hyperparameters, chained after the shared preprocessor.
# NOTE(review): use_label_encoder is kept as in the original; confirm it is still accepted by the installed xgboost.
xgb = XGBClassifier(
    learning_rate=0.1,
    max_depth=5,
    min_child_weight=1,
    n_estimators=100,
    subsample=1.0,
    use_label_encoder=False,
    verbosity=0,
    random_state=0,
)
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb),
])

# 5-fold cross-validation of the whole pipeline on X and y.
scores = cross_val_score(xgb_pipeline, X, y, cv=5)
print('XGB Classifier Scores:', scores)
print('XGB Classifier:', scores.mean())

In [None]:
#SVC
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

# RBF-kernel support vector classifier with fixed hyperparameters, chained after the shared preprocessor.
svc = SVC(C=3, kernel='rbf', random_state=0, probability=True)
svc_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', svc),
])

# 5-fold cross-validation of the whole pipeline on X and y.
scores = cross_val_score(svc_pipeline, X, y, cv=5)
print('SVC Scores:', scores)
print('SVC:', scores.mean())

In [None]:
#MLP
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with early stopping enabled, chained after the shared preprocessor.
mlp = MLPClassifier(max_iter=2000, random_state=0, early_stopping=True)
mlp_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', mlp),
])

# 5-fold cross-validation of the whole pipeline on X and y.
scores = cross_val_score(mlp_pipeline, X, y, cv=5)
print('MLP Scores:', scores)
print('MLP:', scores.mean())

# **Super Learning - Best Model**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from mlens.ensemble import SuperLearner
from sklearn.metrics import accuracy_score
 
# create a list of base-models
def get_models():
    """Return the base learners used by the super learner.

    Returns:
        list: the notebook-level ``log_reg_classifier``, ``svc`` and ``rfc``
        estimator objects, in that order.
    """
    return [log_reg_classifier, svc, rfc]
 
# create the super learner
def get_super_learner(X):
    """Build an unfitted SuperLearner ensemble.

    Args:
        X: feature data; only ``len(X)`` is used, as the ``sample_size`` argument.

    Returns:
        A ``SuperLearner`` scored with ``accuracy_score`` over 10 folds, with the
        estimators from ``get_models()`` as its base layer and a default
        ``LogisticRegression`` as the meta model.
    """
    super_learner = SuperLearner(scorer=accuracy_score, folds=10, sample_size=len(X))
    # base layer first, then the meta model that combines the base predictions
    super_learner.add(get_models())
    super_learner.add_meta(LogisticRegression())
    return super_learner

In [None]:
# Show the (rows, columns) size of the feature data.
X.shape

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split with a fixed seed.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=123
)
# Fit the preprocessor on the training part only, then apply the same fitted transform to the hold-out part.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

In [None]:
# Build the super learner (X is only used for its length here)
ensemble = get_super_learner(X)
# Fit it on the preprocessed training split
ensemble.fit(X_train, y_train)
# Print the summary the ensemble stores in its .data attribute
print(ensemble.data)
# Predict the hold-out split and report accuracy as a percentage
yhat = ensemble.predict(X_test)
holdout_accuracy_pct = accuracy_score(y_test, yhat) * 100
print('Super Learner: %.3f' % holdout_accuracy_pct)

Note: We can't compare the base learner scores in the table with the super learner score, because the base learners were evaluated on the training dataset only, not on the holdout dataset.

In [None]:
#Training the final model
#The transformed matrix gets its own name so the raw feature frame X is not overwritten;
#overwriting X makes a re-run of this cell pass an array to the name-based ColumnTransformer
X_full = preprocessor.fit_transform(X)
ensemble.fit(X_full, y)

#Test data: apply the same feature engineering steps as for the training data
test_data = pd.read_csv('/kaggle/input/titanic/test.csv', index_col='PassengerId')
test_data_clean = test_data.drop(columns=['Name', 'Ticket'])
#Raw-string regex avoids the invalid "\w" escape warning; fillna is assigned back rather than
#applied in place on a selected column (chained assignment)
test_data_clean['Cabin Letter'] = (
    test_data_clean['Cabin'].str.extract(r'(\w)', expand=False).fillna('Unknown')
)
test_data_clean = test_data_clean.drop(columns=['Cabin'])
test_data_clean['relatives'] = test_data_clean['SibSp'] + test_data_clean['Parch']

#Transform with the preprocessor fitted on the full training data
test_features = preprocessor.transform(test_data_clean)

#Predict and write the submission file with one row per test PassengerId
predicted = ensemble.predict(test_features).astype(int)
ids = test_data.index
pred_df = pd.DataFrame({'PassengerId': ids,
                        'Survived': predicted})
pred_df.to_csv('submission.csv', index=False)

In [None]:
# Count how many test rows were predicted for each value.
pd.DataFrame(predicted).value_counts()