# Titanic - Machine Learning from Disaster - Accuracy Score - 84

### Import Modules

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

### Load Train Data

In [None]:
titanic_train = pd.read_csv("data/Titanic - Machine Learning from Disaster/train.csv")
titanic_train

### Remove Columns Not Required for Analysis

In [None]:
titanic_train.drop(
    labels=["PassengerId", "Name", "Ticket", "Fare", "Cabin"], axis=1, inplace=True
)

In [None]:
titanic_train.head()

### Remove Null Data and Fill Age based on SibSp and Parch
To fill in the missing `Age` values, we estimate them from whether a passenger has any `SibSp` and/or `Parch` relatives on board. We do this because the test set also has `NaN` values in `Age` and we don't want to drop those rows, so we need a rule that can populate `Age` on the test set as well.

In [None]:
# Check for null values
titanic_train.isnull().sum()

In [None]:
sns.countplot(x="SibSp", data=titanic_train)

In [None]:
sns.countplot(x="Parch", data=titanic_train)

In [None]:
sns.boxplot(x="Parch", y="Age", data=titanic_train)

In [None]:
sns.boxplot(x="SibSp", y="Age", data=titanic_train)

In [None]:
len(titanic_train[(titanic_train["SibSp"] != 0) & (titanic_train["Age"].isnull())])

In [None]:
len(titanic_train[(titanic_train["SibSp"] == 0) & (titanic_train["Age"].isnull())])

In [None]:
len(titanic_train[(titanic_train["Parch"] != 0) & (titanic_train["Age"].isnull())])

In [None]:
len(titanic_train[(titanic_train["Parch"] == 0) & (titanic_train["Age"].isnull())])

In [None]:
titanic_train["Age"].mean()

In [None]:
titanic_train[titanic_train["SibSp"] != 0]["Age"].mean()

In [None]:
titanic_train[titanic_train["Parch"] != 0]["Age"].mean()

In [None]:
titanic_train[(titanic_train["Parch"] != 0) & (titanic_train["SibSp"] != 0)][
    "Age"
].mean()

In [None]:
titanic_train[(titanic_train["Parch"] == 0) & (titanic_train["SibSp"] == 0)][
    "Age"
].mean()

In [None]:
titanic_train[titanic_train["SibSp"] == 0]["Age"].mean()

In [None]:
titanic_train[titanic_train["Parch"] == 0]["Age"].mean()

In [None]:
# titanic_train = titanic_train.fillna(0)

In [None]:
def estimate_age(dataset):
    """Fill missing ``Age`` values with the truncated mean age of similar passengers.

    Passengers are grouped by whether they have siblings/spouses (``SibSp``)
    and parents/children (``Parch``) aboard:

    * ``SibSp != 0`` and ``Parch != 0``: mean age of that same group.
    * ``SibSp != 0`` and ``Parch == 0``: mean age of that same group.
    * ``SibSp == 0`` (any ``Parch``): mean age of passengers with
      ``SibSp == 0`` and ``Parch == 0``.

    The mean is truncated to an integer before being written. Means are
    computed from the ages known on entry, so the result does not depend on
    row order. If a reference group has no known ages, the mean of all known
    ages is used instead; if no ages are known at all, the missing values are
    left untouched.

    Args:
        dataset: DataFrame with ``Age``, ``SibSp`` and ``Parch`` columns. Any
            index is accepted (it does not have to be ``0..n-1``).

    Returns:
        The same ``dataset`` object, modified in place.
    """
    # Snapshot the ages so every group mean is based on the original data.
    ages = dataset["Age"].copy()
    missing = ages.isna()
    if not missing.any():
        return dataset

    known = ~missing
    has_sibsp = dataset["SibSp"] != 0
    has_parch = dataset["Parch"] != 0

    # Each rule is (rows to fill, rows whose known ages supply the mean).
    rules = (
        (has_sibsp & has_parch, has_sibsp & has_parch),
        (has_sibsp & ~has_parch, has_sibsp & ~has_parch),
        (~has_sibsp, ~has_sibsp & ~has_parch),
    )
    for target, reference in rules:
        to_fill = missing & target
        if not to_fill.any():
            continue
        candidates = ages[reference & known]
        if candidates.empty:
            # No known age in the reference group: fall back to everyone.
            candidates = ages[known]
        if candidates.empty:
            continue
        # Single .loc assignment instead of chained indexing, so the write
        # always lands on ``dataset`` itself.
        dataset.loc[to_fill, "Age"] = int(candidates.mean())
    return dataset

In [None]:
titanic_train = estimate_age(titanic_train)

In [None]:
# Check for null values
titanic_train.isnull().sum()

In [None]:
titanic_train.dropna(axis=0, inplace=True)

In [None]:
# Check for null values
titanic_train.isnull().sum()

In [None]:
titanic_train

### Analyze Data

In [None]:
titanic_train.Survived.unique()

In [None]:
type(titanic_train)

### Convert String Data to Numeric Data

In [None]:
# One-hot encode the passenger class, dropping the first level.
# The prefix gives the dummy columns string names ("Pclass_2", "Pclass_3")
# instead of the raw integer class values; scikit-learn rejects DataFrames
# whose column names mix int and str types.
Pclass = pd.get_dummies(titanic_train["Pclass"], prefix="Pclass", drop_first=True)
Pclass

In [None]:
Sex = pd.get_dummies(titanic_train["Sex"], drop_first=True)
Sex.head()

In [None]:
Embarked = pd.get_dummies(titanic_train["Embarked"], drop_first=True)
Embarked.head()

### Adding the Converted Data to Original Dataset

In [None]:
titanic_train = pd.concat([titanic_train, Pclass, Sex, Embarked], axis=1)

In [None]:
titanic_train.head()

### Drop the Columns with String values.

In [None]:
titanic_train = titanic_train.drop(labels=["Pclass", "Sex", "Embarked"], axis=1)
titanic_train.head()

### Train and Test Data Preparation

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
x = titanic_train.drop(labels=["Survived"], axis=1).copy()
y = titanic_train["Survived"]
x.shape, y.shape

In [None]:
x.head()

In [None]:
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1, stratify=y
)

In [None]:
x_train

### Feature Selection

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression, LogisticRegression

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
def run_random_forest(x_train, x_test, y_train, y_test):
    """Train a random forest on the training split and print its test accuracy.

    A ``RandomForestClassifier`` (``n_estimators=100``, ``random_state=0``,
    ``n_jobs=-1``) is fitted on ``x_train``/``y_train`` and used to predict
    ``x_test``; the accuracy against ``y_test`` is printed. Nothing is
    returned.
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    predictions = forest.fit(x_train, y_train).predict(x_test)
    print("Accuracy on test set: ")
    print(accuracy_score(y_test, predictions))

#### Linear Regression Estimate Coefficient

In [None]:
sel = SelectFromModel(LinearRegression())

In [None]:
sel.fit(x_train, y_train)

In [None]:
sel.get_support()

In [None]:
sel_feature = x.columns[sel.get_support()]
sel_feature

In [None]:
x_train_lin = sel.transform(x_train)
x_test_lin = sel.transform(x_test)

In [None]:
x_train_lin.shape, x_test_lin.shape

In [None]:
%%time
# Transformed Dataset
run_random_forest(x_train_lin, x_test_lin, y_train, y_test)

In [None]:
%%time
# Original Dataset
run_random_forest(x_train, x_test, y_train, y_test)

#### L1 Regularization (Lasso Regression)

In [None]:
sel = SelectFromModel(LogisticRegression(penalty="l1", C=0.1, solver="liblinear"))

In [None]:
sel.fit(x_train, y_train)
sel.get_support()

In [None]:
sel_feature = x_train.columns[sel.get_support()]
sel_feature

In [None]:
x_train_l1 = sel.transform(x_train)
x_test_l1 = sel.transform(x_test)

In [None]:
x_train_l1, x_test_l1

In [None]:
%%time
# Transformed Dataset
run_random_forest(x_train_l1, x_test_l1, y_train, y_test)

In [None]:
%%time
# Original Dataset
run_random_forest(x_train, x_test, y_train, y_test)

#### L2 Regularization (Ridge Regression)

In [None]:
sel = SelectFromModel(LogisticRegression(penalty="l2", C=0.1, solver="liblinear"))
sel.fit(x_train, y_train)
sel.get_support()

In [None]:
sel_feature = x_train.columns[sel.get_support()]
sel_feature

In [None]:
x_train_l2 = sel.transform(x_train)
x_test_l2 = sel.transform(x_test)

In [None]:
%%time
# Transformed Dataset
run_random_forest(x_train_l2, x_test_l2, y_train, y_test)

In [None]:
%%time
# Original Dataset
run_random_forest(x_train, x_test, y_train, y_test)

#### Chi-Squared Test (Scores and p-values)

In [None]:
from sklearn.feature_selection import chi2

In [None]:
f_score = chi2(x_train, y_train)

In [None]:
p_value = pd.Series(f_score[1], index=x_train.columns)
p_value.sort_values(ascending=True, inplace=True)

In [None]:
p_value

In [None]:
x_train_f = x_train[["male", "Parch"]]
x_test_f = x_test[["male", "Parch"]]

In [None]:
%%time
run_random_forest(x_train_f, x_test_f, y_train, y_test)

### SVM 

In [None]:
import seaborn as sns
from sklearn import metrics, svm

In [None]:
# Fit a linear-kernel support vector classifier on the training split and
# predict the hold-out split.
clf = svm.SVC(kernel="linear")
clf.fit(x_train, y_train)
y_predict = clf.predict(x_test)

# Report hold-out accuracy, precision and recall.
print("Accuracy: ", metrics.accuracy_score(y_test, y_predict))
print("Precision: ", metrics.precision_score(y_test, y_predict))
print("Recall: ", metrics.recall_score(y_test, y_predict))

print("Confusion Matrix")

# Draw the confusion matrix as an annotated heatmap with integer cell labels.
mat = metrics.confusion_matrix(y_test, y_predict)
sns.heatmap(mat, square=True, annot=True, fmt="d", cbar=False)

plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

As we can see, the highest accuracy is obtained by training the model on the `'Age', 'male'` features selected with `L1 Regularization (Lasso Regression)`.

### Prepare Test Data

In [None]:
# x_train = x_train_l1.copy()
# x_train

In [None]:
# y_train = y_train.copy()
# y_train

#### Load Test Data

In [None]:
validation_test = pd.read_csv("data/Titanic - Machine Learning from Disaster/test.csv")

In [None]:
validation_test.head()

In [None]:
validation_test.drop(labels=["Name", "Ticket", "Fare", "Cabin"], axis=1, inplace=True)

In [None]:
# validation_test = validation_test.fillna(0)

In [None]:
validation_test = estimate_age(validation_test)

In [None]:
validation_test.isnull().sum()

In [None]:
# One-hot encode the passenger class, dropping the first level.
# The prefix gives the dummy columns string names ("Pclass_2", "Pclass_3")
# instead of the raw integer class values, so the frame does not end up with
# a mix of int and str column names.
Pclass = pd.get_dummies(validation_test["Pclass"], prefix="Pclass", drop_first=True)
Pclass.head()

In [None]:
Sex = pd.get_dummies(validation_test["Sex"], drop_first=True)
Sex.head()

In [None]:
Embarked = pd.get_dummies(validation_test["Embarked"], drop_first=True)
Embarked.head()

In [None]:
validation_test = pd.concat([validation_test, Pclass, Sex, Embarked], axis=1)

In [None]:
validation_test.head()

In [None]:
validation_test = validation_test.drop(labels=["Pclass", "Sex", "Embarked"], axis=1)
validation_test.head()

In [None]:
type(x_test)

In [None]:
clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
clf.fit(x_train, y_train)

In [None]:
x_train

In [None]:
validation_test

In [None]:
# validation_test = validation_test[["PassengerId", "Age", "SibSp", 3, "male"]]
# validation_test

In [None]:
validation_test.iloc[:, 1:].values

In [None]:
# person = []
# y_test = []
# for i in range(len(validation_test)):
#     person_id = validation_test.iloc[i,0:1].values
#     x_test = validation_test.iloc[i,1:].values
#     x_test = x_test.reshape(1, -1)
#     y_pred = clf.predict(x_test)
#     y_test.append(y_pred)
#     person.append(person_id)
#     print(str(person_id) + " " + str(y_pred))

In [None]:
# Predict survival for the Kaggle test passengers and write the submission CSV.
# NOTE: this rebinds x_test, replacing the hold-out split created earlier by
# train_test_split with the submission features.
# Every column after the first is used as a feature; assumes the first column
# is PassengerId and the remaining columns are in the same order as the
# columns clf was fitted on — TODO confirm.
x_test = validation_test.iloc[:, 1:].values
PassengerId = validation_test.PassengerId
y_pred = clf.predict(x_test)
# Pair each passenger id with its predicted label.
df = pd.DataFrame({"PassengerId": PassengerId, "Survived": y_pred})
# index=False keeps the DataFrame index out of the CSV.
df.to_csv(
    "data/Titanic - Machine Learning from Disaster/gender_submission.csv", index=False
)
print(df)

In [None]:
x_test