In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import yellowbrick
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import confusion_matrix, accuracy_score
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.model_selection import LearningCurve

import os
# Print the full path of every file found under /kaggle/input.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# **Data Preparation**
1. Load dataset
2. Visualize dataset
3. Drop unnecessary data
4. Fill null data
5. Convert data to numeric

In [None]:
# Read the training and test CSV files into two DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
    )
)

In [None]:
# Print the (rows, columns) shape, then display the first five training rows.
print(df_train.shape)
df_train.head(n=5)

In [None]:
# Count the missing values in each training column.
df_train.isna().sum()

In [None]:
# Remove the columns that are not used in the rest of this notebook.
df_train = df_train.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
df_train.head(n=5)

In [None]:
# Encode the two text columns as integers. Values absent from a mapping
# (including NaN) come out as NaN. Note the Embarked codes are 0, 1 and 3.
df_train = df_train.assign(
    Sex=df_train['Sex'].map({'female': 0, 'male':1}),
    Embarked=df_train['Embarked'].map({'S': 0, 'C': 1, 'Q':3}),
)
df_train.head(n=5)

In [None]:
# Replace missing ages with the mean of the known ages.
df_train = df_train.fillna({'Age': df_train['Age'].mean()})

In [None]:
# Discard every row that still contains at least one missing value.
df_train = df_train.dropna(axis=0, how='any')

In [None]:
# Re-count missing values per column after the fill and drop steps.
df_train.isna().sum()

# **Split Data**

In [None]:
# Select the target by column name rather than by position, so that a change
# in column order cannot silently swap the label with a feature.
# Assumes the label column is named 'Survived', matching the column written
# to the submission file.
y = df_train['Survived']
x = df_train.drop(columns='Survived')

In [None]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.2,
)

# **Training Model**

**Random Forest Classifier**

In [None]:
# Build a random forest (100 trees, depth limited to 2, fixed seed) and fit it
# on the training split. `fit` returns the estimator itself, so the chained
# call binds the fitted model; the bare name displays it as the cell output.
model = RandomForestClassifier(
    random_state=1,
    max_depth=2,
    n_estimators=100,
).fit(x_train, y_train)
model

In [None]:
# Predict labels for the held-out split and show the fraction predicted correctly.
y_pred = model.predict(X=x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix of the held-out predictions, displayed as the cell output.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

In [None]:
# Unpack the 2x2 confusion matrix row by row: (tn, fp) first, then (fn, tp).
(tn, fp), (fn, tp) = confusion_matrix(y_test, y_pred)

In [None]:
# Derive summary metrics from the confusion-matrix counts and print each one
# as a percentage. Values are rounded to two decimals with round(): rounding
# up with math.ceil would overstate every score (81.01 would print as 82) and
# would raise if a ratio were NaN because its denominator is zero.

# Share of all predictions that are correct.
accuracy = (tp+tn) / (tp+tn+fp+fn)
print("accuracy: ", round(accuracy*100, 2), " % ")

# Share of predicted positives that are truly positive.
precision = tp / (tp+fp)
print("presisi: ", round(precision*100, 2), " %")

# Share of actual positives that were predicted positive.
recall = tp / (tp+fn)
print("recall ", round(recall*100, 2), " %")

# Share of actual negatives that were predicted negative.
specificity = tn / (tn+fp)
print("specificity", round(specificity*100, 2), " %")

In [None]:
# Draw the confusion matrix as an annotated heatmap. fmt='d' prints the cell
# counts as plain integers; the default annotation format ('.2g') would show
# counts of 100 or more in scientific notation (e.g. 1e+02).
ax = sns.heatmap(cm, annot=True, fmt='d')
# confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set(xlabel='Predicted label', ylabel='True label');

# **Test Dataset**

In [None]:
# Print the (rows, columns) shape, then display the first five test rows.
print(df_test.shape)
df_test.head(n=5)

In [None]:
# Count the missing values in each test column.
df_test.isna().sum()

In [None]:
# Remove the same four columns that were removed from the training frame.
df_test = df_test.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
df_test.head(n=5)

In [None]:
# Encode the text columns with the same integer codes used for the training
# frame. Values absent from a mapping (including NaN) come out as NaN.
df_test = df_test.assign(
    Sex=df_test['Sex'].map({'female': 0, 'male':1}),
    Embarked=df_test['Embarked'].map({'S': 0, 'C': 1, 'Q':3}),
)
df_test.head(n=5)

In [None]:
# Replace missing test ages with the mean age of the training frame.
df_test = df_test.fillna({'Age': df_train['Age'].mean()})

In [None]:
# Replace missing test fares with the mean fare of the training frame, so the
# fill value comes from the training file rather than from the test set
# itself — the same approach used for Age.
df_test['Fare'] = df_test['Fare'].fillna(df_train['Fare'].mean())

In [None]:
# Re-count missing values per test column after the fill steps.
df_test.isna().sum()

In [None]:
# Predict a label for every row of the prepared test frame and display the result.
pred = model.predict(X=df_test)
pred

In [None]:
# Load the sample submission file (its PassengerId column is reused when the
# submission is built) and display its (rows, columns) shape.
df_sub = pd.read_csv(filepath_or_buffer='/kaggle/input/titanic/gender_submission.csv')
df_sub.shape

In [None]:
# Pair each PassengerId from the sample submission with its prediction, then
# write the two-column table to CSV with a header row and no index column.
# NOTE(review): this relies on df_sub rows being in the same order as the test
# rows that produced `pred` — confirm.
submission = df_sub[['PassengerId']].assign(Survived=pred)
submission.to_csv('/kaggle/working/submission.csv', header=True, index=False)