In [341]:
import os
import pandas as pd
path_train = '../datasets/titanic/train.csv'
path_test = '../datasets/titanic/test.csv'
train_set = pd.read_csv(path_train)
test_set = pd.read_csv(path_test)

In [342]:
train = train_set.drop('Survived', axis=1)
train_labels = train_set['Survived'].copy()


**There are 4 categorical features:**
* Survived
* Name
* Sex
* Embarked

**There are 6 numeric features:**
* PassengerId
* Pclass
* Age
* SibSp
* Parch
* Fare

**There are 2 alphanumeric feateures**
* Ticket
* Cabin


In [343]:
train.describe(include='all')

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,891,2,,,,681.0,,147,3
top,,,"Becker, Miss. Marion Louise",male,,,,1601.0,,G6,S
freq,,,1,577,,,,7.0,,4,644
mean,446.0,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,3.0,,,38.0,1.0,0.0,,31.0,,


In [344]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Name         891 non-null    object 
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB


* Drop columns with many NaN values (Cabin)

* Drop columns with non-sense values (Ticket & PassengerId)

* Fill Age with the median 

* Fill Embarked with the most frequent

**Maybe a new Feature such as Age/Pclass) that will attribute higher numbers to Old and Rich people and lower values to Young and Poor people**

Based on this we can have 4 catergories
In here we assume the Pclass they paid as a measurement of their economic status
The Pclass is also inverse correlated with the fare (so the class 1 pays higher fares; no surprise!)

* Young & Poor
* Young & Rich
* Old & Poor
* Old & Rich

In [345]:
from sklearn.impute import SimpleImputer

train_num = train[['Pclass','Age','SibSp','Parch','Fare']].copy()

#not to be used for now; will add later
#agepclass['Age/Pclass'] = train_num['Age'] / train_num['Pclass'] #higher age/fair means old and rich; lower means young and poor


imputer = SimpleImputer(strategy = 'median')
imputer.fit(train_num)
X = imputer.transform(train_num)
train_num = pd.DataFrame(X, columns=train_num.columns,index=train_num.index)
train_num.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
0,3.0,22.0,1.0,0.0,7.25
1,1.0,38.0,1.0,0.0,71.2833
2,3.0,26.0,0.0,0.0,7.925
3,1.0,35.0,1.0,0.0,53.1
4,3.0,35.0,0.0,0.0,8.05


In [346]:
train_num.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  891 non-null    float64
 1   Age     891 non-null    float64
 2   SibSp   891 non-null    float64
 3   Parch   891 non-null    float64
 4   Fare    891 non-null    float64
dtypes: float64(5)
memory usage: 34.9 KB


# Exploring the numerical features


PassengerId only represents the number of the entry (kind of like the index) so we are going to drop it

Analyse the numerical features

In [347]:
corr_matrix = train_num.corr()
corr_matrix['Pclass'].sort_values(ascending=False)

Pclass    1.000000
SibSp     0.083081
Parch     0.018443
Age      -0.339898
Fare     -0.549500
Name: Pclass, dtype: float64

The 'Age' , 'Fare' correlate well with Pclass but not the 'SibSp' or the 'Parch'

In [348]:
corr_matrix['SibSp'].sort_values(ascending=False)

SibSp     1.000000
Parch     0.414838
Fare      0.159651
Pclass    0.083081
Age      -0.233296
Name: SibSp, dtype: float64

The 'SibSp' (number of siblings per spouses) correlate well with the 'Parch' (number of parents per children)

# Exploring the categorical features

In [349]:
# Will not conside the name for this one. Too much work to sort that out

train_cat = train[['Sex','Embarked']].copy()
train_cat.fillna(method='ffill', axis=0, inplace=True)
train_cat.head()

Unnamed: 0,Sex,Embarked
0,male,S
1,female,C
2,female,S
3,female,S
4,male,S


In [350]:
#one hot encoding
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
X = train_cat
train_cat_1hot = cat_encoder.fit_transform(train_cat)

# Pipeline for Numeric features

In [351]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])




train_num_tr = num_pipeline.fit_transform(train_num)

# Pipeline for Categorical features

In [352]:
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('encoder', OneHotEncoder())
])

# Pipeline for both Numeric and Categorical Features (final)

The problem here (i think) is the catergorical part of the pipeline because th OneHotEncoder() does not work with the NaN values of the Embarked that are in the 'train' dataset.

In [353]:
from sklearn.compose import ColumnTransformer
num_attribs = list(train_num)
cat_attribs = ['Sex','Embarked']

full_pipeline = ColumnTransformer([
    ('num', num_pipeline,num_attribs),
    ('cat', cat_pipeline, cat_attribs)
])

train_prepared = full_pipeline.fit_transform(train)

In [354]:
# quick test
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error

forest_reg = RandomForestClassifier(n_estimators=100)
forest_reg.fit(train_prepared, train_labels)
forest_pred = forest_reg.predict(train_prepared)
forest_mse = mean_squared_error(train_labels, forest_pred)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

0.1421338109037403

# Cross Val

In [355]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(forest_reg, train_prepared,train_labels,
                        scoring='neg_mean_squared_error', cv=10)
forest_rmse_scores = np.sqrt(-scores)

In [356]:
def display_scores(scores):
    print('Scores: ', scores)
    print('Mean: ',scores.mean())
    print('Standard deviation: ', scores.std())

In [357]:
display_scores(forest_rmse_scores)

Scores:  [0.51639778 0.41053541 0.48575205 0.41053541 0.35156152 0.42399915
 0.44971901 0.48575205 0.41053541 0.41053541]
Mean:  0.4355323228386002
Standard deviation:  0.04633359208309575


# Grid Search

In [358]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators':[3,10,30,100,200], 'max_features': [3,4,6,8,'auto']},
    {'bootstrap':[False], 'n_estimators':[3,10,30,100,200],'max_features': [3,4,6,8,'auto'] }
]

forest_class = RandomForestClassifier()

grid_search = GridSearchCV(forest_class, param_grid, 
                          cv=5, 
                          scoring='neg_mean_squared_error',
                         return_train_score=True)
grid_search.fit(train_prepared, train_labels)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [359]:
grid_search.best_params_

{'max_features': 6, 'n_estimators': 200}

In [360]:
grid_search.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=6,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

### Final Score

In [363]:
final_model = grid_search.best_estimator_ 

X_test = test_set

X_test_prepared = full_pipeline.transform(X_test)

final_predictions = final_model.predict(X_test_prepared)



acc_random_forest = round(final_model.score(train_prepared, train_labels) * 100, 2)
acc_random_forest

97.98

# Submission

In [364]:
submission = pd.DataFrame({
        "PassengerId": test_set["PassengerId"],
        "Survived": final_predictions
    })
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,1


In [365]:
submission.to_csv('../datasets/titanic/submission.csv', index=False)

