# Preparing data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Read the labelled and unlabelled Titanic splits, using PassengerId as the row index.
train_path = "../input/titanic/train.csv"
test_path = "../input/titanic/test.csv"

X = pd.read_csv(train_path, index_col="PassengerId")
X_test = pd.read_csv(test_path, index_col="PassengerId")

X

In [None]:
# Display the test features loaded from test.csv.
X_test

Drop the columns we will not use as features (`Name`, `Ticket`, `Cabin`) and separate the target column `Survived`

In [None]:
# Columns that this notebook does not use as model features.
unused_columns = ["Name", "Ticket", "Cabin"]

X = X.drop(columns=unused_columns)
X_test = X_test.drop(columns=unused_columns)

# pop() removes the target from X and returns it, leaving X with features only.
y = X.pop("Survived")

Fill NaN values

In [None]:
# Missing ports of embarkation become their own "None" category; every other
# remaining NaN is replaced by 0.0.
X = X.fillna({"Embarked": "None"}).fillna(0.0)
X_test = X_test.fillna({"Embarked": "None"}).fillna(0.0)

Encode the categorical features (such as Sex and Embarked) with one-hot encoding

In [None]:
# Build a dense-output encoder that ignores categories unseen during fit.
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`.
    ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    # Older scikit-learn releases only accept the original `sparse` keyword.
    ohe = OneHotEncoder(sparse=False, handle_unknown="ignore")

# Fit on the training rows; Pclass, SibSp and Parch are numeric but are deliberately
# treated as categories here, together with Sex and Embarked.
dfOh = pd.DataFrame(
    ohe.fit_transform(X[["Sex", "Embarked", "Pclass", "SibSp", "Parch"]]),
    index=X.index,  # keep PassengerId so the concat below lines up row by row
)

X = pd.concat([X, dfOh], axis=1)

In [None]:
# Apply the encoder fitted on the training data to the test rows (transform only,
# no refit) and append the indicator columns next to the original features.
dfOh = pd.DataFrame(
    ohe.transform(X_test[["Sex", "Embarked", "Pclass", "SibSp", "Parch"]]),
    index=X_test.index,
)
X_test = pd.concat([X_test, dfOh], axis=1)

Notice that we are also encoding Pclass, SibSp and Parch, even though these features contain numbers.

Let's look at the unique values of Pclass as an example

In [None]:
# Distinct passenger-class values present in the training data.
X["Pclass"].unique()

Looking at the unique values of Pclass, we see that it contains only a few distinct numbers, so from now on we can treat them as categories. The feature may therefore be more useful if we encode it with OneHotEncoder (Pclass already arrives label-encoded as integers).

You can also try making predictions without this encoding and compare the results.

In [None]:
# Preview the first rows of the training features with the one-hot columns appended.
X.head()

In [None]:
# Preview the first rows of the test features with the one-hot columns appended.
X_test.head()

Now drop the original categorical columns, since their one-hot encoded versions replace them

In [None]:
# The original categorical columns are now represented by their one-hot indicators.
encoded_columns = ["Sex", "Embarked", "Pclass", "SibSp", "Parch"]

# rename(columns=str) converts the integer labels of the one-hot columns to strings.
# Without it the frames mix str labels ("Age", "Fare") with int labels (0, 1, ...),
# and recent scikit-learn versions refuse to fit on mixed column-name types.
# Rebinding instead of inplace=True keeps the same treatment for both frames explicit.
X = X.drop(columns=encoded_columns).rename(columns=str)
X_test = X_test.drop(columns=encoded_columns).rename(columns=str)

# Modeling

Split the data into a training set and a validation set

In [None]:
# Hold out 20% of the labelled rows for validation.
# A fixed random_state makes the split (and the loss reported later) reproducible,
# and stratify=y keeps the survived/not-survived ratio the same in both parts.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Inspect the training part of the split produced by train_test_split.
X_train

Create the functions that calculate the loss

In this case we use binary cross-entropy because this is a binary classification problem

In [None]:
def BinaryCrossEntropy(y_true, y_pred, eps=1e-7):
    """Mean binary cross-entropy (log loss) between labels and predicted probabilities.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (0 or 1).
    y_pred : array-like
        Predicted probability of the positive class, same shape as ``y_true``.
    eps : float, default 1e-7
        Predictions are clipped to ``[eps, 1 - eps]`` so both logarithms stay finite.

    Returns
    -------
    float or numpy.ndarray
        The loss averaged over axis 0 (a scalar for 1-D inputs).
    """
    # Work on plain float arrays: lists are accepted and the arithmetic is positional,
    # never aligned on a pandas index.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.clip(np.asarray(y_pred, dtype=float), eps, 1 - eps)
    # Clipping already keeps both log arguments strictly positive, so no further
    # epsilon is added inside the logs (that would bias every loss value).
    term_0 = (1 - y_true) * np.log(1 - y_pred)
    term_1 = y_true * np.log(y_pred)
    return -np.mean(term_0 + term_1, axis=0)

In [None]:
def score(model):
    """Fit ``model`` on the training split and return its validation cross-entropy.

    Classifiers that expose ``predict_proba`` are scored on the predicted probability
    of the positive class; models without it (e.g. a regressor) fall back to
    ``predict``. Uses the notebook globals X_train, y_train, X_valid and y_valid.
    """
    model.fit(X_train, y_train)
    if hasattr(model, "predict_proba"):
        # predict() returns hard 0/1 labels for classifiers, which makes the log loss
        # meaningless (each sample contributes either ~0 or a large constant).
        # Column 1 is the probability of the second class in model.classes_,
        # i.e. Survived == 1 for 0/1 labels.
        y_prob = model.predict_proba(X_valid)[:, 1]
    else:
        y_prob = model.predict(X_valid)
    return BinaryCrossEntropy(y_valid, y_prob)

And main part: Create and train model!

We will use LogisticRegression, but you can try other models such as RandomForestClassifier (`from sklearn.ensemble import RandomForestClassifier`) or even deep learning!

In [None]:
# Logistic-regression classifier using the liblinear solver, fitted on the training split.
model = LogisticRegression(solver="liblinear")
model.fit(X_train, y_train)

In [None]:
# score() refits the model on the training split and returns the validation loss.
score(model)

# Creating a submission

In [None]:
# Inspect the encoded test features that are passed to model.predict below.
X_test

In [None]:
# Predict a class label for every test passenger.
preds = model.predict(X_test)

# Two-column submission frame: the passenger id (taken from the index) and the label.
submission_columns = {"PassengerId": X_test.index, "Survived": preds}
df = pd.DataFrame(submission_columns)

In [None]:
# Write the submission without the DataFrame index; PassengerId is already a regular column.
df.to_csv("submission.csv", index=False)

In [None]:
# Show the submission frame that was written to submission.csv.
df

##### I hope that this notebook helped you. Good luck!