In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the three CSV inputs from the working directory:
#   train.csv             -> training rows (still containing the 'Survived' column)
#   test.csv              -> rows to predict on
#   gender_submission.csv -> frame whose 'Survived' column is compared against predictions later
titanic = pd.read_csv(filepath_or_buffer="train.csv")
titanic_test = pd.read_csv(filepath_or_buffer="test.csv")
gender_submission = pd.read_csv(filepath_or_buffer="gender_submission.csv")

In [3]:
# Dimensions of the raw training frame as (rows, columns).
titanic.shape

(891, 12)

In [4]:
# Separate the target from the features: keep an independent copy of the
# 'Survived' column as the training labels, then rebind `titanic` to a frame
# without that column.
# NOTE: `titanic` is rebound here, so re-running this cell on its own raises a
# KeyError ('Survived' is already gone) -- re-run the loading cell first.
titanic_train_labels = titanic.loc[:, 'Survived'].copy()
titanic = titanic.drop('Survived', axis=1)

In [5]:
# Confirm the target column was removed: one column fewer than before the split.
titanic.shape

(891, 11)

## Prepare the Data for Machine Learning Algorithms

### Data Cleaning

Pipelines are used here for data cleaning. For an alternative approach without pipelines, see data_cleaning_no_pipeline.ipynb.

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [7]:
# Column groups fed to the preprocessing step: numeric features first,
# categorical features second.
titanic_num, titanic_cat = ['Age', 'Fare'], ['Sex', 'Embarked']

In [8]:
# Numeric preprocessing, applied in order:
#   1. "imputer"    - replace missing values with the column median
#   2. "std_scaler" - standardize each column with StandardScaler
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='median')),
    ("std_scaler", StandardScaler()),
])

In [9]:
# Categorical preprocessing: a single one-hot encoding step.
# drop='if_binary' drops one column for features with exactly two categories.
# NOTE(review): there is no imputer in this pipeline, so missing categorical
# values reach the encoder unchanged -- confirm that this is intended.
cat_pipeline = Pipeline(steps=[("enc", OneHotEncoder(drop='if_binary'))])

In [10]:
# Build one preprocessor that routes each column group through its own
# pipeline: numeric columns through `num_pipeline`, categorical columns
# through `cat_pipeline`. Only the columns named in `titanic_num` and
# `titanic_cat` are passed to a transformer here.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, titanic_num),
    ("cat", cat_pipeline, titanic_cat),
])

In [11]:
# Fit the preprocessing pipeline on the training features and transform them
# in one step; the fitted `full_pipeline` is reused later for the test data.
titanic_clean = full_pipeline.fit_transform(titanic)

In [12]:
# Inspect the first transformed training row as a quick sanity check.
titanic_clean[0] 

array([-0.56573646, -0.50244517,  1.        ,  0.        ,  0.        ,
        1.        ,  0.        ])

In [13]:
# Shape of the transformed training matrix as (rows, feature columns).
titanic_clean.shape

(891, 7)

## Select and Train a Model

### Training and Evaluating on the Training Set

*Train a Linear Regression model:*

In [14]:
from sklearn.linear_model import LinearRegression

In [15]:
# Baseline model: ordinary least-squares regression on the prepared features.
# NOTE(review): the labels are later compared against rounded predictions,
# which suggests a 0/1 target -- a classifier may be a better fit; confirm.
lin_reg = LinearRegression()
lin_reg.fit(X=titanic_clean, y=titanic_train_labels)

LinearRegression()

*Train a Decision Tree model:*

In [16]:
from sklearn.tree import DecisionTreeRegressor

In [17]:
# Decision tree regressor; random_state is pinned so repeated runs give the
# same fitted tree.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X=titanic_clean, y=titanic_train_labels)

DecisionTreeRegressor(random_state=42)

*Train a Random Forest model:*

In [18]:
from sklearn.ensemble import RandomForestRegressor

In [19]:
# Random forest regressor with default hyperparameters; random_state is
# pinned so repeated runs give the same fitted ensemble.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(X=titanic_clean, y=titanic_train_labels)

RandomForestRegressor(random_state=42)

See the models tested on some data: testing_on_some_data.ipynb

See the models tested via Cross-Validation method: testing_via_crossval.ipynb

## Launch the System

In [20]:
# Create a fresh, unfitted LinearRegression as the final model; it is fitted
# on the full training data in a later cell.
final_model = LinearRegression()  # found as the best model via testing_via_crossval.ipynb 

In [21]:
# Display the estimator to confirm which model is currently selected.
final_model

LinearRegression()

Preparing the "test.csv" data (loaded as `titanic_test`), which we have not touched before, so that it can be fed into our final model:

In [22]:
# Transform the test data using the same transformations.
# Only transform() is called here (no refit), so the pipeline fitted on the
# training data is reused as-is.
titanic_test_clean = full_pipeline.transform(titanic_test)

In [23]:
# Shape of the transformed test matrix; the number of feature columns should
# match the transformed training matrix.
titanic_test_clean.shape


(418, 7)

In [24]:
# Inspect the first transformed test row as a quick sanity check.
titanic_test_clean[0]

array([ 0.39488658, -0.49078316,  1.        ,  0.        ,  1.        ,
        0.        ,  0.        ])

Testing our final model on the "test.csv" data:

In [25]:
# Fit the selected model on the full prepared training set, then predict on
# the prepared test rows (fit() returns the estimator, so the calls chain).
predictions = final_model.fit(X=titanic_clean, y=titanic_train_labels).predict(X=titanic_test_clean)

In [26]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error between the 'Survived' column of gender_submission
# and the predictions rounded to the nearest integer. Lower is better.
# The root is taken explicitly with np.sqrt instead of passing squared=False:
# that argument was deprecated in scikit-learn 1.4 and removed in 1.6, so the
# old call raises TypeError on current releases.
final_rmse = np.sqrt(mean_squared_error(gender_submission['Survived'], predictions.round()))
print(final_rmse)

0.10936965981495178


In [27]:
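# NOTE: RMSE is an error measure, so lower is better; a value of ~0.109 means the
# rounded predictions disagree with gender_submission.csv on only a handful of rows.
# gender_submission.csv appears to be a sample submission rather than the true test
# labels (TODO confirm), so this measures agreement with that file, not real accuracy.
# Let's try out another model from testing_via_crossval.ipynb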

In [28]:
# Rebind final_model to a fresh, unfitted random forest regressor.
# The hyperparameters presumably come from the search in
# testing_via_crossval.ipynb (TODO confirm); random_state is pinned for
# reproducible results.
final_model = RandomForestRegressor(max_features=4, n_estimators=24, random_state=42)

In [29]:
# Fit the currently selected model on the full prepared training set, then
# predict on the prepared test rows (fit() returns the estimator, so the
# calls chain). This overwrites the earlier `predictions`.
predictions = final_model.fit(X=titanic_clean, y=titanic_train_labels).predict(X=titanic_test_clean)

In [30]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error between the 'Survived' column of gender_submission
# and the predictions rounded to the nearest integer. Lower is better.
# The root is taken explicitly with np.sqrt instead of passing squared=False:
# that argument was deprecated in scikit-learn 1.4 and removed in 1.6, so the
# old call raises TypeError on current releases.
final_rmse = np.sqrt(mean_squared_error(gender_submission['Survived'], predictions.round()))
print(final_rmse)

0.412136457171825


In [31]:
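# The RMSE is higher here (0.412 vs 0.109), i.e. these predictions agree less with
# gender_submission.csv than those of the linear model picked in testing_via_crossval.ipynb.
# Since lower RMSE is better, this comparison does not show that the cross-validated
# choice was the wrong one :)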