In [1]:
import os
import pandas as pd

In [2]:
import numpy as np

In [3]:
TITANIC_PATH = os.path.join("datasets", "titanic")

In [4]:
def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    csv_path = os.path.join(titanic_path, filename)
    return pd.read_csv(csv_path)

In [5]:
test_data = load_titanic_data(r'test.csv')
train_data = load_titanic_data(r'train.csv')

In [35]:
from sklearn.base import BaseEstimator, TransformerMixin

class add_new_rows(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X["Relatives"] = X["SibSp"] + X["Parch"]
        X["AgeBucket"] = X["Age"] // 10 * 10
        X["If_relatives"] = (X["Relatives"].astype(int) == True).astype(int)
        X["Wife"] = ((X["Sex"] == "female") & X["SibSp"].astype(int))
        return X
adder = add_new_rows()

In [25]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="mean")

In [26]:
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                        index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)

In [27]:
from sklearn.preprocessing import OneHotEncoder

one_hot = OneHotEncoder(sparse=False)

In [28]:
from sklearn.pipeline import Pipeline

cat_pipeline = Pipeline([
    ('most_frequent', MostFrequentImputer()),
    ('one_hot', one_hot)
])

In [43]:
num_cols = ["Survived", "Fare", "Relatives", "Parch", "SibSp", "Age", "AgeBucket"]
num_cols_test = ["Fare", "Relatives", "Parch", "SibSp", "Age", "AgeBucket"]

cat_cols = ["Sex", "Embarked", "Pclass"]

dropped_cols = ["PassengerId", "Cabin", "Name", "Ticket"]

In [91]:
from sklearn.preprocessing import normalize

In [77]:
from sklearn.compose import ColumnTransformer

preprocessing_pipeline = ColumnTransformer([
    ("numerical", imputer, num_cols),
    ("categories", cat_pipeline, cat_cols),
    ("dropped_cols", 'drop', dropped_cols)
])

preprocessing_pipeline_test = ColumnTransformer([
    ("numerical", imputer, num_cols_test),
    ("categories", cat_pipeline, cat_cols),
    ("dropped_cols", 'drop', dropped_cols)
])

In [78]:
train_data_extended = adder.fit_transform(train_data)
test_data_extended = adder.fit_transform(test_data)

In [79]:
train_data_processed = preprocessing_pipeline.fit_transform(train_data_extended)
test_data_processed = preprocessing_pipeline_test.fit_transform(test_data_extended)

In [80]:
col_names = np.concatenate([num_cols, cat_cols], axis=None)
col_names_test = np.concatenate([num_cols_test, cat_cols], axis=None)

In [81]:
train_data_processed = pd.DataFrame(train_data_processed)
train_data_processed = train_data_processed.drop(columns=train_data_processed.columns[[9, 10, 11, 12, 13]])

test_data_processed = pd.DataFrame(test_data_processed)
test_data_processed = test_data_processed.drop(columns=test_data_processed.columns[[9, 10, 11, 12, 13]])

In [82]:
train_data_processed.columns = col_names
test_data_processed.columns = col_names_test

In [84]:
train_data_processed

Unnamed: 0,Survived,Fare,Relatives,Parch,SibSp,Age,AgeBucket,Sex,Embarked,Pclass
0,0.0,7.2500,1.0,0.0,1.0,22.000000,20.000000,0.0,1.0,1.0
1,1.0,71.2833,1.0,0.0,1.0,38.000000,30.000000,1.0,0.0,0.0
2,1.0,7.9250,0.0,0.0,0.0,26.000000,20.000000,1.0,0.0,1.0
3,1.0,53.1000,1.0,0.0,1.0,35.000000,30.000000,1.0,0.0,0.0
4,0.0,8.0500,0.0,0.0,0.0,35.000000,30.000000,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...
886,0.0,13.0000,0.0,0.0,0.0,27.000000,20.000000,0.0,1.0,0.0
887,1.0,30.0000,0.0,0.0,0.0,19.000000,10.000000,1.0,0.0,0.0
888,0.0,23.4500,3.0,2.0,1.0,29.699118,25.252101,1.0,0.0,1.0
889,1.0,30.0000,0.0,0.0,0.0,26.000000,20.000000,0.0,1.0,0.0


In [136]:
a = normalize([train_data_processed["Fare"]], norm='l2')
a = a.T
a = pd.DataFrame(a)
train_data_processed["Fare"] = a
test_data_processed["Fare"] = a 

In [138]:
train_data_processed

Unnamed: 0,Survived,Fare,Relatives,Parch,SibSp,Age,AgeBucket,Sex,Embarked,Pclass
0,0.0,0.004103,1.0,0.0,1.0,22.000000,20.000000,0.0,1.0,1.0
1,1.0,0.040344,1.0,0.0,1.0,38.000000,30.000000,1.0,0.0,0.0
2,1.0,0.004485,0.0,0.0,0.0,26.000000,20.000000,1.0,0.0,1.0
3,1.0,0.030053,1.0,0.0,1.0,35.000000,30.000000,1.0,0.0,0.0
4,0.0,0.004556,0.0,0.0,0.0,35.000000,30.000000,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...
886,0.0,0.007358,0.0,0.0,0.0,27.000000,20.000000,0.0,1.0,0.0
887,1.0,0.016979,0.0,0.0,0.0,19.000000,10.000000,1.0,0.0,0.0
888,0.0,0.013272,3.0,2.0,1.0,29.699118,25.252101,1.0,0.0,1.0
889,1.0,0.016979,0.0,0.0,0.0,26.000000,20.000000,0.0,1.0,0.0


In [140]:
corr_matrix = train_data_processed.corr(method="pearson")
corr_matrix["Survived"].sort_values(ascending=False)

Survived     1.000000
Sex          0.543351
Fare         0.257307
Parch        0.081629
Relatives    0.016639
SibSp       -0.035322
AgeBucket   -0.066019
Age         -0.069809
Pclass      -0.322308
Embarked    -0.543351
Name: Survived, dtype: float64

In [141]:
# start looking for a model

In [142]:
from sklearn.ensemble import RandomForestClassifier

In [143]:
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)

In [196]:
train_data_processed_sampled = train_data_processed.sample(frac=0.6, random_state=42)

In [197]:
X_train = train_data_processed_sampled.drop(["Survived", "Relatives"], axis=1)
y_train = train_data_processed_sampled["Survived"]

In [198]:
forest_clf.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [199]:
from sklearn.model_selection import cross_val_score

In [200]:
score = cross_val_score(forest_clf, X_train, y_train, cv=5)
score.mean()

0.8205607476635514

In [201]:
y_pred = forest_clf.predict(X_train)

In [202]:
from sklearn.metrics import f1_score

f1_score(y_train, y_pred)

0.9928741092636579

In [203]:
import joblib

joblib.dump(forest_clf, "forest_clf.pkl")

['forest_clf.pkl']

In [204]:
predictions = forest_clf.predict(test_data_processed)

In [205]:
submission = test_data['PassengerId']
submission = pd.DataFrame(submission)

In [206]:
submission.columns = ['PassengerId']

In [207]:
submission['Survived'] = predictions.astype(int)

In [208]:
submission.to_csv(r'C:\Users\Sasha\Desktop\submission_titanic.csv', index=False)