In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

## Import Data

In [None]:
# Read the sample submission file and preview its first rows.
sub = pd.read_csv(
    '../input/machine-learning-on-titanic-data-set/gender_submission.csv'
)
sub.head(5)

In [None]:
# Read the training data and preview its first rows.
train = pd.read_csv(
    '../input/machine-learning-on-titanic-data-set/train.csv'
)
train.head(5)

In [None]:
# Read the test data and preview its first rows.
test = pd.read_csv(
    '../input/machine-learning-on-titanic-data-set/test.csv'
)
test.head(5)

# Train Data

## Mengecek data train

In [None]:
# Print column dtypes and non-null counts of the freshly loaded training frame.
train.info()

In [None]:
# List the distinct values found in the Embarked column.
train.Embarked.unique()

## Merapikan data train

In [None]:
# Remove the columns that are not used as model features.
unused_columns = ["PassengerId", "Name", "Ticket", "Cabin"]
train.drop(columns=unused_columns, inplace=True)

In [None]:
# Preview the training frame after the column drop.
train.head(5)

### Ubah tipe data object menjadi numerik

In [None]:
# Print column dtypes and non-null counts before the object columns are encoded.
train.info()

In [None]:
# Convert the Sex column to integer codes with a label encoder.
# The fitted encoder stays available under the name `labelencoder`.
labelencoder = LabelEncoder()
labelencoder.fit(train['Sex'])
train['Sex'] = labelencoder.transform(train['Sex'])

In [None]:
# Encode Embarked as numbers: S -> 0, C -> 1, Q -> 2 (missing values stay NaN).
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected column: that form is chained
# assignment, which warns on pandas 2.x and does not modify `train` at all
# under Copy-on-Write.
# pd.to_numeric makes the numeric dtype explicit rather than relying on the
# deprecated silent downcasting of object columns inside replace().
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
train['Embarked'] = pd.to_numeric(train['Embarked'].replace(embarked_codes))

In [None]:
# Print column dtypes and non-null counts again, after the encoding steps above.
train.info()

## Mengecek Korelasi

In [None]:
# Check the pairwise correlation between the columns of `train`
# and draw it as an annotated heatmap.
corr = train.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr,
    ax=ax,
    cmap='viridis',
    vmin=-1.0,
    vmax=1.0,
    linewidths=0.1,
    annot=True,
    annot_kws={"size": 14},
    square=True,
);

# Menangani duplikat dan missing value

In [None]:
# Count duplicated rows (every repeat after the first occurrence is counted).
train.duplicated(keep='first').sum()

In [None]:
# Count the missing values in each column of `train`.
train.isna().sum()

In [None]:
# Drop every row whose Embarked or Age is missing.
# NOTE: the rows are removed from `train` in place; no median imputation is performed here.
train.dropna(subset=['Embarked', 'Age'], inplace=True)

In [None]:
# Count the missing values per column again to confirm the result of the drop.
train.isna().sum()

# Test Data

In [None]:
# Build the validation feature frame from `test` without the non-feature columns.
# `test` itself is left untouched.
non_feature_columns = ["PassengerId", "Name", "Ticket", "Cabin"]
X_val = test.drop(columns=non_feature_columns)

In [None]:
# Encode Sex with the encoder that was already fitted on the training data, so
# train and test share exactly the same label -> integer mapping. Fitting a new
# encoder on the test set only agrees with the training encoding when both sets
# happen to contain the same labels.
# Run this cell once per kernel session: transform() raises on values the
# encoder has not seen, which includes an already-encoded numeric column.
X_val['Sex'] = labelencoder.transform(X_val['Sex'])

In [None]:
# Encode Embarked as numbers: S -> 0, C -> 1, Q -> 2.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected column: that form is chained
# assignment, which warns on pandas 2.x and leaves `X_val` unchanged under
# Copy-on-Write.
# pd.to_numeric makes the numeric dtype explicit rather than relying on the
# deprecated silent downcasting of object columns inside replace().
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
X_val['Embarked'] = pd.to_numeric(X_val['Embarked'].replace(embarked_codes))

In [None]:
# Count the missing values in each column of `X_val`.
X_val.isna().sum()

In [None]:
# Impute missing Fare values with the mean Fare of `X_val`.
# Assigning the filled column back avoids fillna(..., inplace=True) on a
# selected column, which is chained assignment: it warns on pandas 2.x and
# does not update `X_val` under Copy-on-Write.
X_val['Fare'] = X_val['Fare'].fillna(X_val['Fare'].mean())

In [None]:
# Impute missing Age values with the mean Age of `X_val`.
# Assigning the filled column back avoids fillna(..., inplace=True) on a
# selected column, which is chained assignment: it warns on pandas 2.x and
# does not update `X_val` under Copy-on-Write.
X_val['Age'] = X_val['Age'].fillna(X_val['Age'].mean())

In [None]:
# Count the missing values per column of `X_val` again after the imputation.
X_val.isna().sum()

In [None]:
# Print column dtypes and non-null counts of the prepared validation features.
X_val.info()

## Menyiapkan x dan y train

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Scale every feature to the [0, 1] range of the training data.
# The scaler is fitted on `train` only and then reused for `X_val`: the model
# is trained on scaled features, so the frame it later predicts on must go
# through the same transformation. Previously the fitted scaler was discarded
# and `X_val` stayed on its raw scale.
feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
scaler = MinMaxScaler()
train[feature_cols] = scaler.fit_transform(train[feature_cols])
X_val[feature_cols] = scaler.transform(X_val[feature_cols])

In [None]:
# Dependent variable (target): stored in `y`.
y = train.loc[:, 'Survived']

In [None]:
# Independent variables (features): stored in `X`.
model_features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = train[model_features]

In [None]:
# Split into training and hold-out sets: 30% of the rows are held out,
# and the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# Model

In [None]:
# Baseline random forest with 50 trees.
# random_state is fixed (same seed as the train/test split) so the fitted
# forest, and every score derived from it, is reproducible across runs.
# RandomForestClassifier is already imported in the notebook's first cell.
RFC = RandomForestClassifier(n_estimators=50, random_state=0)
RFC.fit(X_train, y_train)

## Prediksi data test

In [None]:
# Predict the hold-out rows with the baseline forest and show the predicted labels.
y_pred = RFC.predict(X_test)
print(np.asarray(y_pred))

In [None]:
# Show the true hold-out labels as a plain array, for comparison with the predictions.
print(y_test.to_numpy())

## Model Score

In [None]:
# Mean accuracy of the baseline forest on the training rows and on the hold-out rows.
train_accuracy = RFC.score(X_train, y_train)
test_accuracy = RFC.score(X_test, y_test)
print('akurasi train :', train_accuracy)
print('akurasi test :', test_accuracy)

In [None]:
# import evaluation metrics 
from sklearn.metrics import accuracy_score, confusion_matrix, plot_confusion_matrix, classification_report
# create the confusion matrix
confusion_matrix(y_test, y_pred)

In [None]:
# plot the confusion matrix
fig, ax = plt.subplots(dpi = 120)
plot_confusion_matrix(RFC, X_test, y_test, ax = ax);

## Grid Search

In [None]:
# grid search CV
from sklearn.model_selection import GridSearchCV

n_estimators=[25,50,60,70]
criterion=["gini","entropy"]
max_depth=[5,10,25,50,70]
max_leaf_nodes=[5,10,25,30]

In [None]:
# Exhaustive 5-fold cross-validated search over the candidate lists defined in
# the previous cell.
# A fresh estimator is passed straight to GridSearchCV instead of rebinding
# `RFC`, so the fitted baseline model is not replaced by an unfitted one.
# random_state makes the search result reproducible; n_jobs=-1 runs the
# candidate fits in parallel without changing the result.
param_grid = dict(
    n_estimators=n_estimators,
    criterion=criterion,
    max_depth=max_depth,
    max_leaf_nodes=max_leaf_nodes,
)
grid_rfc = GridSearchCV(
    estimator=RandomForestClassifier(random_state=0),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
grid_rfc.fit(X_train, y_train)
print("best score: ", grid_rfc.best_score_)
print("best param: ", grid_rfc.best_params_)

## Model baru

In [None]:
# Build the new model from the hyperparameters the grid search actually
# selected, instead of values copied by hand from an earlier run, so the model
# always matches the "best param" printed above.
# random_state is fixed so the refit is reproducible.
# RandomForestClassifier is already imported in the notebook's first cell.
RFC_new = RandomForestClassifier(random_state=0, **grid_rfc.best_params_)
RFC_new.fit(X_train, y_train)

In [None]:
# Predict the hold-out rows with RFC_new and show the predicted labels.
y_pred = RFC_new.predict(X_test)
print(np.asarray(y_pred))

In [None]:
# Mean accuracy of RFC_new on the training rows and on the hold-out rows.
train_accuracy_new = RFC_new.score(X_train, y_train)
test_accuracy_new = RFC_new.score(X_test, y_test)
print('akurasi train :', train_accuracy_new)
print('akurasi test :', test_accuracy_new)

## Prediksi Data Validasi

In [None]:
# Predict the validation frame built from the test file and show the predicted labels.
y_val = RFC_new.predict(X_val)
print(np.asarray(y_val))

# Membuat file submission

In [None]:
# Wrap the validation predictions in a pandas Series.
predict = pd.Series(data=y_val)

In [None]:
# Display the prediction Series.
predict

In [None]:
# Take the passenger ids from `test`, the same frame the predictions were made
# on, rather than from the sample submission file, so ids and predictions are
# guaranteed to describe the same rows.
passenger_ids = test['PassengerId']
assert len(passenger_ids) == len(predict), "expected exactly one prediction per test passenger"

# Pair ids and predictions by position; to_numpy() avoids any index alignment.
output = pd.DataFrame({
    'PassengerId': passenger_ids.to_numpy(),
    'Survived': predict.to_numpy(),
})
output.to_csv('submission.csv', index=False)