In [1]:
import os
import pandas as pd

In [2]:
import numpy as np

In [3]:
TITANIC_PATH = os.path.join("datasets", "titanic")

In [4]:
def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    csv_path = os.path.join(titanic_path, filename)
    return pd.read_csv(csv_path)

In [5]:
test_data = load_titanic_data(r'test.csv')
train_data = load_titanic_data(r'train.csv')

In [6]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [7]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [8]:
# Age, Cabin and Embarced are missing some values
# Name, Sex, Ticket, Cabin and Embarked are objects

In [9]:
train_data[['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Pclass']]

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Pclass
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S,3
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C,1
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S,3
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S,1
4,"Allen, Mr. William Henry",male,373450,,S,3
...,...,...,...,...,...,...
886,"Montvila, Rev. Juozas",male,211536,,S,2
887,"Graham, Miss. Margaret Edith",female,112053,B42,S,1
888,"Johnston, Miss. Catherine Helen ""Carrie""",female,W./C. 6607,,S,3
889,"Behr, Mr. Karl Howell",male,111369,C148,C,1


In [10]:
train_data["Relatives"] = train_data["SibSp"] + train_data["Parch"]
train_data["AgeBucket"] = train_data["Age"] // 10 * 10

In [53]:
test_data["Relatives"] = train_data["SibSp"] + train_data["Parch"]
test_data["AgeBucket"] = train_data["Age"] // 10 * 10

In [11]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="mean")

In [12]:
from sklearn.base import BaseEstimator, TransformerMixin

class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                        index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)

In [13]:
from sklearn.preprocessing import OneHotEncoder

one_hot = OneHotEncoder(sparse=False)

In [14]:
from sklearn.pipeline import Pipeline

cat_pipeline = Pipeline([
    ('most_frequent', MostFrequentImputer()),
    ('one_hot', one_hot)
])

In [63]:
num_cols = ["Survived", "Fare", "Relatives", "Parch", "SibSp", "Age", "AgeBucket"]
num_cols_test = ["Fare", "Relatives", "Parch", "SibSp", "Age", "AgeBucket"]
cat_cols = ["Sex", "Embarked", "Pclass"]
dropped_cols = ["PassengerId", "Cabin", "Name", "Ticket"]

In [62]:
from sklearn.compose import ColumnTransformer

preprocessing_pipeline = ColumnTransformer([
    ("numerical", imputer, num_cols),
    ("categories", cat_pipeline, cat_cols),
    ("dropped_cols", 'drop', dropped_cols)
])

In [80]:
preprocessing_pipeline_test = ColumnTransformer([
    ("numerical", imputer, num_cols_test),
    ("categories", cat_pipeline, cat_cols),
    ("dropped_cols", 'drop', dropped_cols)
])

In [17]:
train_data_processed = preprocessing_pipeline.fit_transform(train_data) 

In [81]:
test_data_processed = preprocessing_pipeline_test.fit_transform(test_data)

In [82]:
col_names = np.concatenate([num_cols, cat_cols], axis=None)
col_names_test = np.concatenate([num_cols_test, cat_cols], axis=None)

In [19]:
train_data_processed = pd.DataFrame(train_data_processed)
train_data_processed = train_data_processed.drop(columns=train_data_processed.columns[[9, 10, 11, 12, 13]])

In [84]:
test_data_processed = pd.DataFrame(test_data_processed)
test_data_processed = test_data_processed.drop(columns=test_data_processed.columns[[9, 10, 11, 12, 13]])
test_data_processed

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,7.8292,1.0,0.0,0.0,34.50000,20.000000,0.0,1.0,0.0
1,7.0000,1.0,0.0,1.0,47.00000,30.000000,1.0,0.0,0.0
2,9.6875,0.0,0.0,0.0,62.00000,20.000000,0.0,1.0,0.0
3,8.6625,1.0,0.0,0.0,27.00000,30.000000,0.0,1.0,0.0
4,12.2875,0.0,1.0,1.0,22.00000,30.000000,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
413,8.0500,0.0,0.0,0.0,30.27259,24.238806,0.0,1.0,0.0
414,108.9000,0.0,0.0,0.0,39.00000,40.000000,1.0,0.0,1.0
415,7.2500,0.0,0.0,0.0,38.50000,24.238806,0.0,1.0,0.0
416,8.0500,2.0,0.0,0.0,30.27259,30.000000,0.0,1.0,0.0


In [20]:
train_data_processed.columns = col_names
train_data_processed

Unnamed: 0,Survived,Fare,Relatives,Parch,SibSp,Age,AgeBucket,Sex,Embarked,Pclass
0,0.0,7.2500,1.0,0.0,1.0,22.000000,20.000000,0.0,1.0,1.0
1,1.0,71.2833,1.0,0.0,1.0,38.000000,30.000000,1.0,0.0,0.0
2,1.0,7.9250,0.0,0.0,0.0,26.000000,20.000000,1.0,0.0,1.0
3,1.0,53.1000,1.0,0.0,1.0,35.000000,30.000000,1.0,0.0,0.0
4,0.0,8.0500,0.0,0.0,0.0,35.000000,30.000000,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...
886,0.0,13.0000,0.0,0.0,0.0,27.000000,20.000000,0.0,1.0,0.0
887,1.0,30.0000,0.0,0.0,0.0,19.000000,10.000000,1.0,0.0,0.0
888,0.0,23.4500,3.0,2.0,1.0,29.699118,25.252101,1.0,0.0,1.0
889,1.0,30.0000,0.0,0.0,0.0,26.000000,20.000000,0.0,1.0,0.0


In [85]:
test_data_processed.columns = col_names_test
test_data_processed

Unnamed: 0,Fare,Relatives,Parch,SibSp,Age,AgeBucket,Sex,Embarked,Pclass
0,7.8292,1.0,0.0,0.0,34.50000,20.000000,0.0,1.0,0.0
1,7.0000,1.0,0.0,1.0,47.00000,30.000000,1.0,0.0,0.0
2,9.6875,0.0,0.0,0.0,62.00000,20.000000,0.0,1.0,0.0
3,8.6625,1.0,0.0,0.0,27.00000,30.000000,0.0,1.0,0.0
4,12.2875,0.0,1.0,1.0,22.00000,30.000000,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
413,8.0500,0.0,0.0,0.0,30.27259,24.238806,0.0,1.0,0.0
414,108.9000,0.0,0.0,0.0,39.00000,40.000000,1.0,0.0,1.0
415,7.2500,0.0,0.0,0.0,38.50000,24.238806,0.0,1.0,0.0
416,8.0500,2.0,0.0,0.0,30.27259,30.000000,0.0,1.0,0.0


In [21]:
train_data_processed["Wife"] = (train_data_processed["Sex"].astype(int) & train_data_processed["SibSp"].astype(int))

In [87]:
test_data_processed["Wife"] = (test_data_processed["Sex"].astype(int) & test_data_processed["SibSp"].astype(int))

In [91]:
train_data_processed["If_relatives"] = (train_data_processed["Relatives"].astype(int) == True).astype(int)

In [100]:
test_data_processed["If_relatives"] = (test_data_processed["Relatives"].astype(int) == True).astype(int) 

In [23]:
corr_matrix = train_data_processed.corr(method="pearson")
corr_matrix["Survived"].sort_values(ascending=False)

Survived        1.000000
Sex             0.543351
Wife            0.263535
Fare            0.257307
If_relatives    0.163157
Parch           0.081629
Relatives       0.016639
SibSp          -0.035322
AgeBucket      -0.066019
Age            -0.069809
Pclass         -0.322308
Embarked       -0.543351
Name: Survived, dtype: float64

In [24]:
# start looking for a model

In [25]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)

In [94]:
X_train = train_data_processed.drop(["Survived", "Relatives"], axis=1)
y_train = train_data_processed["Survived"]

In [167]:
forest_clf.fit(X_train[:400], y_train[:400])

RandomForestClassifier(random_state=42)

In [168]:
from sklearn.model_selection import cross_val_score

In [169]:
score = cross_val_score(forest_clf, X_train, y_train, cv=5)
score.mean()

0.8081413596133326

In [170]:
y_pred = forest_clf.predict(X_train)

In [171]:
from sklearn.metrics import f1_score

f1_score(y_train, y_pred)

0.8277945619335347

In [143]:
import joblib

joblib.dump(forest_clf, "forest_clf.pkl")

['forest_clf.pkl']

In [145]:
test_data_processed = test_data_processed.drop('Relatives', axis=1)

KeyError: "['Relatives'] not found in axis"

In [146]:
predictions = forest_clf.predict(test_data_processed)

In [147]:
submission = test_data['PassengerId']
submission = pd.DataFrame(submission)

In [148]:
submission.columns = ['PassengerId']

In [149]:
submission['Survived'] = predictions.astype(int)

In [150]:
submission.to_csv(r'C:\Users\Sasha\Desktop\submission_titanic.csv', index=False)