# Overview of the data

In [None]:
#import numpy as np
import pandas as pd
import seaborn as sns #graphs to be added, maybe along some machine learning explainability?

# Read the competition CSVs; PassengerId becomes the row index of each frame
train_data = pd.read_csv(
    "/kaggle/input/titanic/train.csv",
    index_col='PassengerId',
)
test_data = pd.read_csv(
    "/kaggle/input/titanic/test.csv",
    index_col='PassengerId',
)

# Report (rows, columns) for both frames
print(f"Train Data Shape: {train_data.shape}")
print(f"\nTest Data Shape: {test_data.shape}")

# Print missing values of datasets
def _print_missing(frame):
    """Print name, count and share of rows for every column with missing values."""
    null_counts = frame.isnull().sum()
    # Columns without any missing value are skipped; column order is preserved
    for name, n_missing in null_counts[null_counts > 0].items():
        print(name, n_missing, f"{n_missing / len(frame):.2%}")


print('\nMissing Values on Train Data:')
_print_missing(train_data)

print('\nMissing Values on Test Data:')
_print_missing(test_data)
        

# Separate features according to data type.
# The pandas dtype predicates are used instead of comparing against the
# strings "float" / "int" / "object": "int" resolves to the platform C long
# (32-bit on Windows with NumPy < 2) and would not match pandas' int64
# columns there, and "object" does not match dedicated string dtypes.
_train_dtypes = train_data.dtypes

con = [col for col, dtype in _train_dtypes.items()
       if pd.api.types.is_float_dtype(dtype)]
dis = [col for col, dtype in _train_dtypes.items()
       if pd.api.types.is_integer_dtype(dtype)]
cat = [col for col, dtype in _train_dtypes.items()
       if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)]

print('\nContinuous features:\n', con)
print('\nDiscrete features:\n', dis)
print('\nCategorical features:\n', cat)

# Cardinality of each categorical column: low counts are easy to encode,
# high counts are candidates for dropping or feature engineering
print('\nUnique categorical values in Train Data:')
for name, n_unique in train_data[cat].nunique().items():
    print(name, n_unique)

print('\nUnique categorical values in Test Data:')
# The test frame's object columns are detected independently of `cat`
test_object_cols = [name for name, dtype in test_data.dtypes.items() if dtype == "object"]
for name, n_unique in test_data[test_object_cols].nunique().items():
    print(name, n_unique)

# Planning

Continuous features:
* Age  = **Impute median, add indicator** (missing values = ~20% of both training and test data)
* Fare = **Impute median** (missing values = 1 on test data)

Discrete features:
* Pclass = Ready
* SibSp = Ready
* Parch = Ready

Categorical features:
* Sex = **Ordinal Encode**
* Embarked = **Ordinal Encode** (missing values = 2 on training data)
* Name = **Drop** or Feature Engineer (all unique values)
* Ticket = **Drop** or Feature Engineer (681 unique values on training data)
* Cabin = **Drop** or Feature Engineer (147 unique values on training data) (missing values = 70%+ of both training and test data)

# Researching Titanic decks (for Feature Engineering Cabin)

Order of Decks from top to bottom (reverse for flooding order)
* 1st = T
* 2nd = A
* 3rd = B
* 4th = C
* 5th = D
* 6th = E
* 7th = F
* 8th = G (Most of it off limits to passengers)
* 9th = Orlop (Completely off limits to passengers)
* 10th = Tank Top

In [None]:
# Manual Cabin feature engineering (done outside the sklearn pipeline for now).
#
# Two columns are derived from Cabin on both frames:
#   * Deck  - ordinal code of the cabin's deck letter (0 = no cabin recorded)
#   * cabin - 1 when a cabin was recorded, 0 otherwise

MISSING_CABIN = "NA"  # sentinel written into Cabin where the value is missing

# Deck letters ordered from top to bottom of the ship; 'N' is the first
# character of the MISSING_CABIN sentinel.
DECK_CODES = {'N': 0, 'T': 1, 'A': 2, 'B': 3, 'C': 4, 'D': 5, 'E': 6, 'F': 7, 'G': 8}


def add_cabin_features(df):
    """Fill missing Cabin values and add the 'Deck' and 'cabin' columns in place.

    Columns are assigned on the frame itself (df[...] = ...) rather than
    through ``df.Cabin.fillna(..., inplace=True)``, which mutates a temporary
    Series and has no effect on the frame under pandas copy-on-write.
    """
    df['Cabin'] = df['Cabin'].fillna(MISSING_CABIN)

    # First character of the cabin string is the deck letter; letters that
    # are not in DECK_CODES fall back to 0 so the column stays numeric.
    deck_letter = df['Cabin'].astype(str).str[0]
    df['Deck'] = deck_letter.map(DECK_CODES).fillna(0).astype(int)

    # The indicator must be compared against the sentinel: comparing Cabin
    # with 0 never matches, which left the flag at 1 for every passenger.
    df['cabin'] = (df['Cabin'] != MISSING_CABIN).astype(int)


add_cabin_features(train_data)
add_cabin_features(test_data)

# Preprocessing and model training

In [None]:
from sklearn.pipeline        import Pipeline
from sklearn.impute          import SimpleImputer
from sklearn.preprocessing   import OneHotEncoder
from sklearn.compose         import ColumnTransformer
from sklearn.ensemble        import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Free-text / high-cardinality columns that are not fed to the model
to_drop = ["Name", "Ticket", "Cabin"]

y_train = train_data.Survived
X_train = train_data.drop(to_drop, axis=1).drop("Survived", axis=1)
X_test  = test_data.drop(to_drop, axis=1)

# Age: median imputation plus a "was missing" indicator column
ageSI  = SimpleImputer(strategy="median", add_indicator=True)
# Fare: median imputation only
fareSI = SimpleImputer(strategy="median")

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4; try the new keyword first and fall back for older installations.
try:
    ohenc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohenc = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Sex / Embarked: fill gaps with the most frequent value, then one-hot encode
sexembOHE = Pipeline(steps=[("imputer", SimpleImputer(strategy="most_frequent")),
                            ("ohenc", ohenc)])

# remainder='passthrough' keeps every column not listed below (Pclass, SibSp,
# Parch and the engineered Deck / cabin). With the default remainder='drop'
# those features were silently discarded before reaching the model.
preprocessor = ColumnTransformer(transformers=[('age', ageSI, ['Age']),
                                               ('fare', fareSI, ['Fare']),
                                               ('sex_emb', sexembOHE, ['Sex', 'Embarked'])],
                                 remainder='passthrough')

# NOTE(review): random_state=None means every run builds a different forest,
# so CV scores are not exactly reproducible between runs.
model = RandomForestClassifier( n_estimators=100,    #default=100
                                max_depth=6,         #default=None
                                min_samples_split=8, #default=2,
                                min_samples_leaf=1,  #default=1,
                                bootstrap=True,      #default=True
                                ccp_alpha=0.0,       #default=0.0
                                random_state=None)
#finding optimal hyper parameters pending...

classifier = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

# Fit on the full training set and predict the competition test set
classifier.fit(X_train, y_train)

predictions = classifier.predict(X_test)

# Cross-validated score of the same pipeline on the training data
scores = cross_val_score(classifier, X_train, y_train)
print("Average CV score:", scores.mean())

+ n_estimators=87: 0.7789215993974012
+ **n_estimators=88: 0.7800451949030192** 0.7710815391375305
+ **n_estimators=89: 0.7800451949030192** 0.7688280710564308
+ n_estimators=90: 0.778927876467265

+ max depth=5: 0.7934718473416609
+ **max depth=6: 0.7957378695624883** 0.7957253154227607
+ max depth=7:   0.7901198920343984

+ min_samples_split=15: 0.7867679367271359
+ **min_samples_split=16: 0.7957441466323519** 0.7946142740568704
+ min_samples_split=17: 0.7811499591990458

+ min_samples_leaf=2: 0.7867679367271357
+ **min_samples_leaf=3: 0.7901198920343984** 0.7923608059757704
+ min_samples_leaf=4: 0.7856443412215177

+ **bootstrap=True: 0.7778105580315108**
+ bootstrap=False: 0.7575983930701149

In [None]:
# Assemble the two-column submission table: one prediction per test passenger
submission_columns = {
    'PassengerId': test_data.index,
    'Survived': predictions,
}
output = pd.DataFrame(submission_columns)

# index=False keeps the frame's row index out of the CSV
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")