In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization
import matplotlib.pyplot as plt # matplotlib
from datetime import date

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the labelled training data, indexed by passenger id, and preview it.
train_df = pd.read_csv(
    "/kaggle/input/titanic/train.csv",
    index_col="PassengerId",
)
train_df.head()

In [None]:
# Load the unlabelled test data, indexed by passenger id, and preview it.
test_df = pd.read_csv(
    "/kaggle/input/titanic/test.csv",
    index_col="PassengerId",
)
test_df.head()

In [None]:
# Exploratory overview: a 2x5 grid of panels, most of them relating one
# feature to the Survived label.
fig, ax = plt.subplots(2, 5, figsize=(25, 10))

# Reduce Cabin to its first character. NOTE: this overwrites the column in
# train_df, and the one-hot encoding cell further down depends on it
# (it drops the resulting "Cabin_T" dummy column by name).
train_df["Cabin"] = train_df["Cabin"].str[0]

# Counts of the target label and of each sex.
sns.countplot(x=train_df["Survived"], ax=ax[0, 0])
ax[0, 0].set_title("Survived (count)")
sns.countplot(x=train_df["Sex"], ax=ax[0, 1])
ax[0, 1].set_title("Sex (count)")

# (plot function, feature column, grid position) for each feature-vs-survival panel.
survival_panels = [
    (sns.barplot, "Pclass", (0, 2)),
    (sns.barplot, "Sex", (0, 3)),
    (sns.boxplot, "Age", (0, 4)),
    (sns.boxplot, "Fare", (1, 0)),
    (sns.barplot, "Embarked", (1, 1)),
    (sns.barplot, "SibSp", (1, 2)),
    (sns.barplot, "Parch", (1, 3)),
    (sns.barplot, "Cabin", (1, 4)),
]
for plot_fn, feature, (row, col) in survival_panels:
    plot_fn(x=train_df["Survived"], y=train_df[feature], ax=ax[row, col])
    ax[row, col].set_title("{} vs Survived".format(feature))

fig.tight_layout()
# plt.show() instead of fig.show(): Figure.show() targets GUI backends and can
# emit a "non-GUI backend" warning under the notebook's inline backend.
plt.show()

In [None]:
# Modelling imports (scikit-learn): train/test splitting, the two ensemble
# estimators used below, and the evaluation metrics module.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics

In [None]:
# Build the model inputs: fill missing values, one-hot encode the categorical
# columns, and split into train / hold-out sets.

# Every NaN in the frame becomes 0. For the categorical columns this produces
# "Cabin_0" / "Embarked_0" dummy columns below; for numeric columns (e.g. Age)
# a missing value is simply treated as 0.
train_df = train_df.replace(np.nan, 0)

feature_columns = ["Pclass","Sex","Age","SibSp","Parch","Fare","Cabin","Embarked"]
X_data = pd.get_dummies(data=train_df[feature_columns])

# Drop the missing-value indicator dummies and Cabin_T.
# NOTE(review): Cabin_T is presumably dropped because that value does not occur
# in the test set -- confirm. Rebinding (instead of inplace=True) keeps the
# cell's data lineage explicit.
X_data = X_data.drop(columns=["Cabin_0", "Embarked_0", "Cabin_T"])

y_data = train_df["Survived"]

# A fixed random_state makes the 70/30 split -- and therefore every metric
# computed on it below -- reproducible across "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, train_size=0.7, shuffle=True, random_state=42
)

X_data.head()

# Random Forest

In [None]:
threshold = 0.5 # need to optimize the threshold

# Sweep the forest size from 5 to 145 trees (step 5) and record the F1 score
# obtained on the hold-out split for each size. The regressor's continuous
# output is turned into a class label by comparing it against `threshold`.
# NOTE(review): the tree count is selected on the same split that is later
# used to report the final metrics, so those metrics are optimistic.
x_axis = []  # tree counts tried
y_axis = []  # F1 score for the matching tree count
for numTrees in range(5,150,5):
    # Fixed random_state: otherwise differences between tree counts are mixed
    # with run-to-run seed noise and the chosen optimum changes on every run.
    regressor = RandomForestRegressor(n_estimators=numTrees, random_state=42)
    regressor.fit(X_train, y_train)
    y_pred_prob = regressor.predict(X_test)
    x_axis.append(numTrees)
    y_axis.append(metrics.f1_score(y_test, y_pred_prob > threshold))

In [None]:
# Visualise F1 score against forest size, then keep the best-scoring size.
sweep_ax = sns.scatterplot(x=x_axis, y=y_axis)
sweep_ax.set_xlabel("Number of Trees")
sweep_ax.set_ylabel("f1 score")

# Index of the highest F1 score (first occurrence on ties) -> its tree count.
best_index = np.argmax(y_axis)
numTrees = x_axis[best_index] # max tree value

In [None]:
# Refit the forest with the selected tree count and evaluate it on the
# hold-out split. random_state is fixed so the reported metrics are
# reproducible.
regressor = RandomForestRegressor(n_estimators=numTrees, random_state=42)
regressor.fit(X_train, y_train)

y_pred_prob = regressor.predict(X_test)
y_pred = y_pred_prob > threshold # assign a classification above a given threshold

# y_pred is already boolean; it must not be compared against the threshold a
# second time (the original passed `y_pred > threshold` here).
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
accuracy_score = metrics.accuracy_score(y_test, y_pred)
precision_score = metrics.precision_score(y_test, y_pred)
recall_score = metrics.recall_score(y_test, y_pred)
f1_score = metrics.f1_score(y_test, y_pred)

In [None]:
# Draw the confusion matrix as an annotated heatmap, with the predicted-class
# tick labels moved to the top edge.
cm_ax = sns.heatmap(confusion_matrix, cbar=False, annot=True, square=True, fmt="d")
cm_ax.tick_params(
    axis='both',
    which='major',
    labelsize=10,
    labelbottom=False,
    bottom=False,
    top=False,
    labeltop=True,
)
cm_ax.set_xlabel("Predicted")
cm_ax.set_ylabel("Actual")

# Print the scalar metrics computed in the previous cell, one per line.
metric_report = (
    ("accuracy:", accuracy_score),
    ("precision:", precision_score),
    ("recall:", recall_score),
    ("f1 score:", f1_score),
)
for label, value in metric_report:
    print(label, value)

In [None]:
# Assign random-forest predictions to the unlabeled data set.
# Missing values are filled with 0, then Cabin is reduced to its first character.
test_df = test_df.replace(np.nan, 0)
test_df["Cabin"] = test_df["Cabin"].str[0]

X_data_unk = pd.get_dummies(data=test_df[["Pclass","Sex","Age","SibSp","Parch","Fare","Cabin","Embarked"]])

# Align the test features with the training feature matrix: the dummy columns
# are derived independently from each data set, so their set and order are not
# guaranteed to match what the model was fitted on. Columns unknown to the
# model are dropped; columns absent from the test data are added as all-zero.
X_data_unk = X_data_unk.reindex(columns=X_data.columns, fill_value=0)

y_pred_prob = regressor.predict(X_data_unk)
y_pred = y_pred_prob > threshold # assign a classification above a given threshold
test_df["Survived"] = y_pred.astype(int) # append the result to the unlabeled data

In [None]:
# One-column submission frame (PassengerId index, Survived column).
result_df = test_df["Survived"].to_frame()
result_df.head() # submission preview

In [None]:
# Write the random-forest submission; today's date is embedded in the file name.
rf_submission_path = "/kaggle/working/submission_randomforest{}.csv".format(date.today())
result_df.to_csv(rf_submission_path)

# Gradient Boosting (scikit-learn GradientBoostingClassifier)

In [None]:
# Fit a gradient-boosted ensemble of depth-1 trees on the training split and
# display its score on the hold-out split.
clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
# Predict labels for the unlabeled data with the gradient-boosting model.
# The test features are first aligned to the columns the model was fitted on
# (same set, same order); dummy columns missing from the test data become 0.
y_pred = clf.predict(X_data_unk.reindex(columns=X_train.columns, fill_value=0))

In [None]:
# Store the gradient-boosting predictions and rebuild the submission frame.
# result_df was created from the random-forest predictions earlier; without
# rebuilding it here, the preview and the CSV written in the next cell would
# still contain the random-forest results.
test_df["Survived"] = y_pred.astype(int)
result_df = pd.DataFrame(test_df["Survived"])
result_df.head()

In [None]:
# Write the gradient-boosting submission; today's date is embedded in the file name.
gb_submission_path = "/kaggle/working/submission_xgboost{}.csv".format(date.today())
result_df.to_csv(gb_submission_path)