# Titanic Dataset with linear classifier

In [None]:
import os

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

In [None]:
# Both CSV files sit in the same input directory, one level above the working directory.
data_dir = os.path.join("..", "input", "titanic-machine-learning-from-disaster")

train_data = pd.read_csv(os.path.join(data_dir, "train.csv"))
test_data = pd.read_csv(os.path.join(data_dir, "test.csv"))

In [None]:
# Preview the first five rows of the raw training frame.
train_data.head(n=5)

The columns `PassengerId`, `Name`, `Ticket`, and `Cabin` are considered not relevant; `Survived` is the target value, and all other columns are used as features.

In [None]:
# Predictor columns selected from the raw frames, in the order used for the feature matrix.
features = [
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked",
]

# Target column, kept as a one-element list so that it selects a single-column frame.
labels = ["Survived"]

In [None]:
# Feature matrices for fitting the model and for the final predictions.
X_train, X_test = train_data[features], test_data[features]

# Flatten the single-column label frame into a 1-D array of targets.
y_train = train_data[labels].values.ravel()

In [None]:
# Preview the first five rows of the selected training features.
X_train.head(n=5)

Check for columns with missing values.

In [None]:
def nan_clms(df):
    """Return the names of the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list
        Column labels, in the frame's own column order, that hold at least
        one null entry. Empty if no column has missing values.
    """
    # Walk the columns of the frame that was passed in (not those of a
    # module-level frame), so the function works for any DataFrame.
    return [clm for clm in df.columns if df[clm].isnull().any()]

# Collect the affected column names for each split first, then report them.
train_nan_columns = nan_clms(X_train)
test_nan_columns = nan_clms(X_test)

print("Columns with missing values in train data: {}.".format(train_nan_columns))
print("Columns with missing values in test data: {}.".format(test_nan_columns))

I impute numerical columns with the median. The imputation of `Embarked` is taken care of by the one-hot encoding. I apply one-hot encoding to the categorical columns and always drop the first category. This way, e.g., `Sex` generates only one column instead of two.

The model is a linear classifier, namely a ridge classifier fitted with least squares. It is the same as seen in [Learning from Data](http://work.caltech.edu/telecourse.html).

In [None]:
# Columns that receive median imputation and columns that are one-hot encoded.
numeric_columns = ["Age", "Fare"]
categorical_columns = ["Pclass", "Sex", "Embarked"]

# Columns not named in either list are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("num_imputer", SimpleImputer(strategy="median"), numeric_columns),
        # drop="first" removes one indicator column per encoded feature.
        ("encoder", OneHotEncoder(drop="first"), categorical_columns),
    ],
    remainder="passthrough",
)

# Ridge classifier solved with the least-squares ("lsqr") solver.
model = RidgeClassifier(solver="lsqr")

# Preprocessing and model chained so both are fitted together.
pipe = Pipeline([("preprocessor", preprocessor), ("model", model)])

In [None]:
# Score the whole pipeline with 10-fold cross-validation and average the folds.
fold_scores = cross_val_score(pipe, X_train, y_train, cv=10)
cv_score = np.mean(fold_scores)

print("The linear classifier has an accuracy of: {:.3f}.".format(cv_score))

In [None]:
# Read the test CSV again and select the feature columns from it.
test_csv_path = os.path.join("..", "input", "titanic-machine-learning-from-disaster", "test.csv")
test_data = pd.read_csv(test_csv_path)
X_test = test_data[features]

Save the data for the submission, as seen in the [tutorial](https://www.kaggle.com/alexisbcook/titanic-tutorial).

In [None]:
# Fit on the full training set, then predict the test passengers.
predictions = pipe.fit(X_train, y_train).predict(X_test)

# Submission table: one row per test passenger with its predicted label.
output = pd.DataFrame(
    {
        "PassengerId": test_data["PassengerId"],
        "Survived": predictions,
    }
)
output.to_csv("submission.csv", index=False)