In [83]:
# 2. Acquire data

import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Relative directory holding the Titanic CSV files; the loader functions below
# join "train.csv" / "test.csv" onto it, so it is resolved against the current
# working directory at read time.
TITANIC_PATH = os.path.join("datasets", "titanic")

def load_titanic_train_data(titanic_path=None):
    """Load the Titanic training split into a DataFrame.

    Parameters
    ----------
    titanic_path : str or path-like, optional
        Directory that contains ``train.csv``. When omitted (``None``), the
        module-level ``TITANIC_PATH`` is used, which is the original behavior.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``train.csv``.

    Raises
    ------
    FileNotFoundError
        If ``train.csv`` does not exist in the chosen directory.
    """
    # Resolve the default lazily so the constant is only needed when no
    # explicit directory is supplied.
    base_dir = TITANIC_PATH if titanic_path is None else titanic_path
    csv_path = os.path.join(base_dir, "train.csv")
    return pd.read_csv(csv_path)

def load_titanic_test_data(titanic_path=None):
    """Load the Titanic test split into a DataFrame.

    Parameters
    ----------
    titanic_path : str or path-like, optional
        Directory that contains ``test.csv``. When omitted (``None``), the
        module-level ``TITANIC_PATH`` is used, which is the original behavior.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``test.csv``.

    Raises
    ------
    FileNotFoundError
        If ``test.csv`` does not exist in the chosen directory.
    """
    # Resolve the default lazily so the constant is only needed when no
    # explicit directory is supplied.
    base_dir = TITANIC_PATH if titanic_path is None else titanic_path
    csv_path = os.path.join(base_dir, "test.csv")
    return pd.read_csv(csv_path)

# Read both splits (train first, then test) and keep references to the two
# frames in one list, so later cells can loop over them and modify each frame
# in place.
train_data, test_data = load_titanic_train_data(), load_titanic_test_data()
combined_data = [train_data, test_data]


In [84]:


# Edges of the four fare buckets, encoded as ordinal codes 0-3:
#   code 0: fare <= 7.91
#   code 1: 7.91 < fare <= 14.454
#   code 2: 14.454 < fare <= 31
#   code 3: fare > 31
FARE_BIN_EDGES = [-np.inf, 7.91, 14.454, 31, np.inf]

# NOTE: this cell overwrites "Fare" in place with its bucket code, so it is not
# idempotent -- reload the data before running it a second time, otherwise the
# codes themselves (all <= 7.91) would be re-bucketed to 0.
for dataset in combined_data:
    # Relatives aboard = SibSp + Parch, both taken from the SAME frame.
    # The previous version added train_data["Parch"] for every dataset, which
    # index-aligned training rows onto the test set and produced wrong
    # RelativesOnBoard values for the test split.
    dataset["RelativesOnBoard"] = dataset["SibSp"] + dataset["Parch"]

    # Bucket the raw fare in a single pass against the original values.
    # labels=False yields the integer bucket position; missing fares stay NaN.
    # The cast keeps the column float, as the previous in-place writes did.
    dataset["Fare"] = pd.cut(
        dataset["Fare"], bins=FARE_BIN_EDGES, labels=False
    ).astype(float)

train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,RelativesOnBoard
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,0.0,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,3.0,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,1.0,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,3.0,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,1.0,,S,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,1.0,,S,0
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,2.0,B42,S,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,2.0,,S,3
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,2.0,C148,C,0


In [85]:
# Drop, in place and from both splits, the columns the model inputs do not use:
# Parch and SibSp feed RelativesOnBoard in the earlier cell, and Name and Cabin
# are not listed among the pipeline's attributes.
DROPPED_COLUMNS = ("Parch", "SibSp", "Name", "Cabin")
for dataset in combined_data:
    for column in DROPPED_COLUMNS:
        del dataset[column]

In [86]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

def _dense_one_hot_encoder():
    """Build a OneHotEncoder that returns a dense array on old and new scikit-learn.

    scikit-learn 1.2 renamed the ``sparse`` keyword to ``sparse_output`` and
    1.4 removed the old name, so ``OneHotEncoder(sparse=False)`` raises
    ``TypeError`` on current releases. Try the new keyword first and fall back
    to the legacy one for installations older than 1.2.
    """
    try:
        return OneHotEncoder(sparse_output=False)
    except TypeError:
        return OneHotEncoder(sparse=False)


# Numeric columns: fill missing values with the column median, then standardize.
num_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode into a dense array.
cat_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("cat_encoder", _dense_one_hot_encoder()),
    ])


# "Fare" holds the bucket codes produced by the feature-engineering cell and is
# treated as a numeric feature here.
num_attribs = ["Age", "RelativesOnBoard", "Fare"]
cat_attribs = ["Pclass", "Sex", "Embarked"]

# Route each column group through its own sub-pipeline; columns not listed
# here are not passed to the transformer at all (see the selection below).
preprocess_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ])

# Fit the preprocessing on the training split only and transform it.
X_train = preprocess_pipeline.fit_transform(
    train_data[num_attribs + cat_attribs])

train_data

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Ticket,Fare,Embarked,RelativesOnBoard
0,1,0,3,male,22.0,A/5 21171,0.0,S,1
1,2,1,1,female,38.0,PC 17599,3.0,C,1
2,3,1,3,female,26.0,STON/O2. 3101282,1.0,S,0
3,4,1,1,female,35.0,113803,3.0,S,1
4,5,0,3,male,35.0,373450,1.0,S,0
...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,211536,1.0,S,0
887,888,1,1,female,19.0,112053,2.0,S,0
888,889,0,3,female,,W./C. 6607,2.0,S,3
889,890,1,1,male,26.0,111369,2.0,C,0


In [87]:
# Target vector: the "Survived" column of the training frame.
y_train = train_data.loc[:, "Survived"]


In [88]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees; random_state is pinned to 42 so repeated runs
# use the same seed. The trailing fit call is left as the cell's last
# expression so the fitted estimator is still displayed.
forest_clf = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
forest_clf.fit(X=X_train, y=y_train)

RandomForestClassifier(random_state=42)

In [89]:
# Apply the already-fitted preprocessing to the test split (transform only, no
# refit), then predict survival with the trained forest.
feature_columns = num_attribs + cat_attribs
X_test = preprocess_pipeline.transform(test_data[feature_columns])
y_pred = forest_clf.predict(X_test)

In [90]:
# cross-val forest
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the forest on the preprocessed training matrix;
# the cell displays the mean fold score.
# NOTE(review): X_train was produced by fitting the imputers/scaler on all
# training rows, so each fold's held-out rows influenced the preprocessing --
# consider cross-validating a Pipeline that includes the preprocessing.
forest_scores = cross_val_score(estimator=forest_clf, X=X_train, y=y_train, cv=10)
np.mean(forest_scores)

0.8036704119850187

In [91]:
# svc
from sklearn.svm import SVC

# Support-vector classifier with gamma="auto", scored with the same 10-fold
# cross-validation on the preprocessed training matrix; the cell displays the
# mean fold score.
svm_clf = SVC(gamma="auto")
svm_scores = cross_val_score(estimator=svm_clf, X=X_train, y=y_train, cv=10)
np.mean(svm_scores)

0.8215480649188514

In [92]:
# Two-column submission table: the test passengers' ids next to the predicted
# "Survived" labels, written to CSV without the index column.
submission = test_data[["PassengerId"]].assign(Survived=y_pred)
submission.to_csv('./submission.csv', index=False)