In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file in input_files:
    print(input_file)

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import re

In [None]:
# Load the data:
# Both CSV files are comma-separated; the training set is read first, then the test set.
df_train, df_test = (
    pd.read_csv(csv_path, sep = ",")
    for csv_path in (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
    )
)

In [None]:
# Show the training DataFrame as this cell's output.
df_train

In [None]:
# Show the test DataFrame as this cell's output.
df_test

In [None]:
# Deal with missing values:

# "Fare" (test set): fill the gaps with the mean of the observed fares.
df_test.loc[df_test["Fare"].isnull(), "Fare"] = df_test["Fare"].mean(skipna = True)

# "Embarked" (training set): fill the gaps with the most frequent value.
# Series.mode() returns a Series indexed 0..k-1; assigning that Series through
# .loc aligns on row labels and writes NaN into the masked rows, so the scalar
# first mode is extracted before assigning.
most_common_port = df_train["Embarked"].mode().iloc[0]
df_train.loc[df_train["Embarked"].isnull(), "Embarked"] = most_common_port

datasets = [df_train, df_test]

# "Age" (both sets): fill the gaps with random integers drawn uniformly from
# [mean - std, mean + std) of the observed ages of the same data set.
# A seeded generator keeps the imputed ages identical on every run.
age_rng = np.random.RandomState(0)
for dataset in datasets:
    missing_age = dataset["Age"].isnull()
    num_na = int(missing_age.sum())
    if num_na == 0:
        # Nothing to impute (e.g. the cell is being re-run).
        continue
    mean = dataset["Age"].mean(skipna = True)
    std = dataset["Age"].std(skipna = True)
    # randint expects integer bounds; truncate explicitly instead of relying
    # on an implicit float-to-int conversion.
    normal_age = age_rng.randint(int(mean - std), int(mean + std), num_na)
    dataset.loc[missing_age, "Age"] = normal_age

In [None]:
# Change the categorical variables into numeric:
# Each category of "Embarked" and "Sex" becomes its own indicator column named
# "<column>_<category>", added to the data set in place.
for dataset in datasets:
    for column in ("Embarked", "Sex"):
        indicators = pd.get_dummies(dataset[column], prefix = column)
        dataset[indicators.columns.tolist()] = indicators


In [None]:
# Variables "relatives" and "alone":
# "relatives" is the sum of "SibSp" and "Parch"; "alone" is 1 unless that sum
# is greater than zero.
for dataset in datasets:
    family_size = dataset["SibSp"] + dataset["Parch"]
    dataset["relatives"] = family_size
    dataset["alone"] = (~(family_size > 0)).astype("int64")


In [None]:
# Remove bad characters from "Name":
# Double quotes, commas, periods and parentheses are stripped from every name.
bad_chars = re.compile('[",.()]')
for dataset in datasets:
    dataset["Name"] = dataset["Name"].map(lambda full_name: bad_chars.sub("", full_name))


In [None]:
# Features:
# Column names fed to the model, in the order they appear in the feature matrix.
predictors = [
    # Columns taken directly from the data set
    "Pclass", "Sex_female", "Age", "SibSp", "Parch", "Fare",
    # Indicator columns for "Embarked"
    "Embarked_C", "Embarked_Q", "Embarked_S",
    # Derived family-size columns
    "relatives", "alone",
]


In [None]:
# Model:
# Build the training matrix (one column per predictor) and the target vector,
# then fit a random forest with a fixed random state.
x_train = df_train[predictors].to_numpy()
y_train = df_train["Survived"].to_numpy()

forest_params = dict(n_estimators = 100, max_depth = 3, random_state = 2)
forest = RandomForestClassifier(**forest_params)
model = forest.fit(x_train, y_train)


In [None]:
# Predictions:
# Score the test set with the fitted model and pair each prediction with its
# passenger id to form the submission table.
x_test = df_test[predictors].to_numpy()
y_test = model.predict(x_test)

df_subm = pd.DataFrame({
    "PassengerId": df_test["PassengerId"],
    "Survived": y_test,
})


In [None]:
# Write the submission table as a comma-separated file without the index column.
df_subm.to_csv("submission.csv", index = False)