# Install the modules

In [None]:
!pip install -q tensorflow_decision_forests

# Import the modules 

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# ---- visualization ---- #
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
# ---- Tensorflow ---- #
import tensorflow as tf
from tensorflow_decision_forests.keras import pd_dataframe_to_tf_dataset
from tensorflow_decision_forests.keras import GradientBoostedTreesModel
from tensorflow_decision_forests.keras import Task
# ---- Sklearn ---- #
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# ---- Evaluation ---- #
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Change visualization settings

In [None]:
# Sketch-style ("xkcd") rendering, a wider default figure and a pastel palette
# for the plots that follow.
plt.xkcd(scale=1)
mpl.rcParams.update({'figure.figsize': [11.0, 6.0]})
sns.set_palette(palette='pastel')

# Import the data

In [None]:
# Locations of the two Titanic splits.
train_csv = '/kaggle/input/titanic/train.csv'
test_csv = '/kaggle/input/titanic/test.csv'

train_data, test_data = pd.read_csv(train_csv), pd.read_csv(test_csv)

# Data visualization

Visualizing the data is the best way to understand it.

Let's look at the first five rows using the `head()` function.

In [None]:
# First five rows of the raw training frame.
train_data.head(n=5)

Let's look at the count of each label to check whether the classes are balanced, using `sns.countplot()`.

In [None]:
# Horizontal bars: number of rows per Survived value.
sns.countplot(data=train_data, y='Survived')

Next, look at the number of male and female passengers on the Titanic.

In [None]:
# Horizontal bars: number of rows per Sex value.
sns.countplot(data=train_data, y='Sex')

You can also split the survival counts by sex.

In [None]:
# Survived counts, one bar per Sex within each Survived value.
sns.countplot(data=train_data, x='Survived', hue='Sex')

Finally, compare the survival counts across passenger classes (`Pclass`).

In [None]:
# Survived counts, one bar per Pclass within each Survived value.
sns.countplot(data=train_data, x='Survived', hue='Pclass')

See how age is distributed in the data with `sns.histplot()`, which draws a histogram of the data.

In [None]:
# Age histogram, coloured by Survived, with a KDE curve drawn on top.
age_hist_options = dict(bins=30, kde=True, element="step", common_norm=False)
sns.histplot(train_data, x='Age', hue='Survived', **age_hist_options)

# Data cleaning

Stop! Before deleting any data, check for missing rows and columns. \
The next step is normalizing the data, \
and the last step is removing useless columns and keeping the good features. \
**Note:** apply every change made to the train data to the test data as well.

In [None]:
# Each cell of the heatmap marks whether that value is missing.
sns.heatmap(train_data.isna(), cbar=False, yticklabels=False)

**PassengerId** doesn't contain any information; it is just like a row index, so drop this column with `dataframe.drop()`.

In [None]:
# Remove the identifier column from the training frame.
train_data = train_data.drop(columns=['PassengerId'])

The **Cabin** column has too many missing values to be used or filled in, so drop this column with `dataframe.drop()`.

In [None]:
# Remove the Cabin column from the training frame.
train_data = train_data.drop(columns=['Cabin'])

* Challenge: how can you fill the missing age values? 
        One solution is to use a typical age for each Pclass. 
Let's find that typical age per class with a boxplot.

In [None]:
# Age distribution for each passenger class.
sns.boxplot(data=train_data, x='Pclass', y='Age')

Write a function that takes the age and Pclass and returns a class-based age when the age is missing. \
With `dataframe.apply()` I can fill every missing value in a simple, minimal way.

In [None]:
def impute_age(cols):
    """Return the age for one passenger, filling it in from Pclass when missing.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two values in the order (Age, Pclass), e.g. one row of
        ``df[['Age', 'Pclass']]`` as passed by ``DataFrame.apply(axis=1)``.

    Returns
    -------
    The original age when it is present; otherwise a hard-coded per-class
    age: 37 for Pclass 1, 29 for Pclass 2 and 24 for any other class.
    """
    # Read the two values by position explicitly. Plain `cols[0]` on a Series
    # whose index holds the labels 'Age'/'Pclass' relies on a positional
    # fallback that pandas has deprecated.
    if isinstance(cols, pd.Series):
        age, pclass = cols.iloc[0], cols.iloc[1]
    else:
        age, pclass = cols[0], cols[1]

    if not pd.isnull(age):
        return age

    if pclass == 1:
        return 37
    if pclass == 2:
        return 29
    return 24

# Replace every missing Age, row by row, using impute_age
age_and_class = train_data[['Age', 'Pclass']]
train_data['Age'] = age_and_class.apply(impute_age, axis=1)

**Embarked** has missing values, so fill them with the mode of the column.

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form may act on a temporary copy and
# leave train_data unchanged (pandas deprecates it for that reason).
most_common_port = train_data['Embarked'].mode()[0]
train_data['Embarked'] = train_data['Embarked'].fillna(most_common_port)

# Feature Engineering

In [None]:
# Family size = siblings/spouses + parents/children + the passenger themself
train_data['FamilySize'] = train_data['SibSp'] + train_data['Parch'] + 1

# IsAlone: 1 when nobody else is in the family, 0 when FamilySize > 1.
# Computed in a single assignment; the previous chained form
# `train_data['IsAlone'].loc[mask] = 0` is not guaranteed to write back to the
# frame (it can operate on a temporary copy of the column).
train_data['IsAlone'] = (train_data['FamilySize'] <= 1).astype(int)

In [None]:
# Title = second ", "-separated piece of Name, cut at its first "."
after_comma = train_data['Name'].str.split(", ", expand=True)[1]
train_data['Title'] = after_comma.str.split(".", expand=True)[0]

# Titles that occur fewer than stat_min times are flagged True here
stat_min = 10
title_counts = train_data['Title'].value_counts()
title_names = title_counts < stat_min

# Collapse every flagged title into a single 'Misc' category
rare_lookup = title_names.to_dict()
train_data['Title'] = train_data['Title'].map(
    lambda title: 'Misc' if rare_lookup[title] else title
)

In [None]:
# Fare -> 4 quantile-based bins; Age (truncated to int) -> 5 equal-width bins
fare_values = train_data['Fare']
whole_ages = train_data['Age'].astype(int)

train_data['FareBin'] = pd.qcut(fare_values, q=4)
train_data['AgeBin'] = pd.cut(whole_ages, bins=5)

In [None]:
le = LabelEncoder()

# (source column, destination column); the single encoder is re-fitted for each
# pair, in this order, so it ends up fitted on FareBin.
encode_plan = (
    ('Sex', 'Sex'),
    ('AgeBin', 'AgeBin_Code'),
    ('FareBin', 'FareBin_Code'),
)
for source_col, encoded_col in encode_plan:
    train_data[encoded_col] = le.fit_transform(train_data[source_col])

Alright, now it's time to keep the useful columns and then convert the categorical columns to one-hot.

In [None]:
# Raw columns that have been replaced by engineered features (or are unused)
drop_cols = ['Name', 'Ticket', 'Fare', 'FareBin', 'AgeBin', 'Age', 'SibSp', 'Parch']

train_data = train_data.drop(columns=drop_cols)

In [None]:
# Name of the target column and the target values themselves
label = 'Survived'
y = train_data.loc[:, label]

In [None]:
col_to_onehot = ['Pclass', 'Embarked', 'FamilySize', 'Title', 'AgeBin_Code', 'FareBin_Code']

# One call encodes every listed column: each is replaced by integer indicator
# columns named "<column>_<value>", appended after the untouched columns in
# the order given above.
train_data = pd.get_dummies(train_data, columns=col_to_onehot, dtype=int)

In [None]:
# Keep only the features in train_data; the target already lives in `y`
train_data = train_data.drop(columns=[label])

# Hold out part of the data to evaluate the model before the final answer
X_train, X_validation, y_train, y_validation = train_test_split(
    train_data, y, random_state=42
)


def _as_tf_dataset(features, target):
    """Join features with their target column and wrap them as a TF dataset."""
    labelled = pd.concat([features, target], axis=1)
    return pd_dataframe_to_tf_dataset(
        labelled,
        label=label,
        task=Task.CLASSIFICATION,
    )


# TensorFlow datasets for fitting and for validation
train_ds = _as_tf_dataset(X_train, y_train)
validation_ds = _as_tf_dataset(X_validation, y_validation)

In [None]:
# First five rows of the final feature frame.
train_data.head(n=5)

Before predicting, apply the same changes that were made to the train data to the test data.

In [None]:
passenger_id = test_data.PassengerId # need for send answer in kaggle

# Mirror the training clean-up: PassengerId and Cabin were dropped from
# train_data, so they must not reach the model as extra features.
test_data = test_data.drop(columns=['PassengerId', 'Cabin'])

# ---- missing values ---- #
test_data['Age'] = test_data[['Age', 'Pclass']].apply(impute_age, axis=1)
# Assign back instead of chained fillna(inplace=True), which may act on a
# temporary copy and leave the frame unchanged.
test_data['Embarked'] = test_data['Embarked'].fillna(test_data['Embarked'].mode()[0])
# A missing Fare would become a NaN bin in pd.qcut below, which has no
# counterpart among the training fare codes.
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].median())

# ---- engineered features ---- #
test_data['FamilySize'] = test_data['SibSp'] + test_data['Parch'] + 1
# 1 when nobody else is in the family, 0 when FamilySize > 1 (single
# assignment instead of a chained `.loc` write that may not reach the frame).
test_data['IsAlone'] = (test_data['FamilySize'] <= 1).astype(int)

test_data['Title'] = test_data['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]
# Decide which titles are "rare" from the training features, not from the
# test-set counts: any title without a Title_<name> column in train_data
# becomes 'Misc'.
known_titles = {
    col[len('Title_'):] for col in train_data.columns if col.startswith('Title_')
}
test_data['Title'] = test_data['Title'].where(test_data['Title'].isin(known_titles), 'Misc')

# NOTE(review): the bin edges are recomputed from the test set, so a given
# code may cover a different Fare/Age range than it did in training. The
# training edges are not kept (FareBin/AgeBin are dropped from train_data), so
# they cannot be reused here without changing the training cells as well.
test_data['FareBin'] = pd.qcut(test_data['Fare'], 4)
test_data['AgeBin'] = pd.cut(test_data['Age'].astype(int), 5)

test_data['Sex'] = le.fit_transform(test_data['Sex'])
test_data['AgeBin_Code'] = le.fit_transform(test_data['AgeBin'])
test_data['FareBin_Code'] = le.fit_transform(test_data['FareBin'])

drop_cols = ['Name', 'Ticket', 'Fare', 'FareBin', 'AgeBin', 'Age', 'SibSp', 'Parch']
test_data = test_data.drop(columns=drop_cols)

col_to_onehot = ['Pclass', 'Embarked', 'FamilySize', 'Title', 'AgeBin_Code', 'FareBin_Code']
test_data = pd.get_dummies(test_data, columns=col_to_onehot, dtype=int)

# Align with the training feature matrix: same columns, same order. Indicator
# columns that never occur in the test set are added as all-zero, and columns
# the model was not trained on are discarded.
feature_columns = [col for col in train_data.columns if col != label]
test_data = test_data.reindex(columns=feature_columns, fill_value=0)

# Convert data into tensorflow dataset 
test_ds = pd_dataframe_to_tf_dataset(test_data, task=Task.CLASSIFICATION)

# Create model

In [None]:
# object for calculate AUC in evaluate
# AUC metric object, passed to model.compile() below (ROC is the default curve)
auc = tf.metrics.AUC(curve='ROC')

In [None]:
# Settings for the gradient boosted trees learner
gbt_settings = dict(
    task=Task.CLASSIFICATION,
    hyperparameter_template="benchmark_rank1",
    early_stopping_num_trees_look_ahead=10,
    forest_extraction='DART',
    subsample=0.8,
    max_depth=3,
    num_trees=25,
)
model = GradientBoostedTreesModel(**gbt_settings)

# Report AUC and binary accuracy
eval_metrics = [auc, tf.metrics.binary_accuracy]
model.compile(metrics=eval_metrics)

history = model.fit(train_ds, validation_data=validation_ds)

Let's check the model's accuracy on the validation set.

In [None]:
# Restore matplotlib's default rcParams, undoing the xkcd style and figure size
# configured at the top of the notebook.
plt.rcdefaults()

In [None]:
# Predicted values for the validation split, thresholded into 0/1 answers
val_scores = model.predict(validation_ds)
val_pred = (val_scores > .5).astype('int')

# Actual vs. predicted counts, drawn as an annotated heatmap
conf_mtrx = confusion_matrix(y_validation, val_pred)
sns.heatmap(conf_mtrx, annot=True, fmt='g', cbar=False)
plt.title('confusion matrix')
plt.xlabel('predicted')
plt.ylabel('actual')

In [None]:
clf_report = classification_report(y_validation, val_pred, output_dict=True, labels=[0, 1])

# Drop the last row and the last two columns of the report, then transpose so
# each remaining report entry is one heatmap row.
report_frame = pd.DataFrame(clf_report)
report_view = report_frame.iloc[:-1, :-2].T
sns.heatmap(report_view, annot=True, cbar=False)

Save the model predictions to a `csv` file so they can be checked on Kaggle.

In [None]:
predictions = model.predict(test_ds)

# Convert probability to binary answer
predictions = (predictions >= 0.5).astype('int')

# Flatten with ravel() so this works for any number of test rows instead of
# assuming exactly 418.
output = pd.DataFrame({'PassengerId': passenger_id, 'Survived': predictions.ravel()})
output.to_csv('tfgb_submission.csv', index=False)
print("Your submission was successfully saved!")

# The last word

### Was the notebook useful? Please like :)
### Wasn't the notebook useful? Please comment why and how I can do better :)
