In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn
from IPython.display import display
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Import Data

In [None]:
# Read the competition's train/test splits and show both frames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("/kaggle/input/titanic/train.csv",
                     "/kaggle/input/titanic/test.csv")
)
display(train_df, test_df)

# 2. Preprocessing

## 2.1 remove useless columns

In [None]:
# Identifier-like columns that are not used as model features.
drop_col = ["PassengerId", "Name", "Ticket"]

# Keep the test passenger ids before they are dropped: the submission file needs them.
test_PassengerID = list(test_df["PassengerId"])

train_df, test_df = (frame.drop(columns = drop_col) for frame in (train_df, test_df))

In [None]:
# Show both frames, then the per-column missing-value counts before imputation.
display(train_df, test_df)

train_nan_counts = train_df.isna().sum()
test_nan_counts = test_df.isna().sum()

print("Number of NaN in train df:\n", train_nan_counts)
print("\n")
print("Number of NaN in test df:\n", test_nan_counts)

## 2.2 fill missing values

In [None]:
mean_imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
freq_imputer = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')

# Each imputer is fitted on the training column only and then applied to both
# splits, so the fill values are never derived from the test data.
imputation_plan = (
    (mean_imputer, ["Age", "Fare"]),   # numeric columns -> training mean
    (freq_imputer, ["Embarked"]),      # categorical column -> training mode
)

for imputer, columns in imputation_plan:
    for col in columns:
        imputer.fit(train_df[[col]].to_numpy())
        train_df[col] = imputer.transform(train_df[[col]].to_numpy()).ravel().tolist()
        test_df[col] = imputer.transform(test_df[[col]].to_numpy()).ravel().tolist()

In [None]:
# Re-check the per-column missing-value counts now that imputation has run.
remaining_train_nan = train_df.isna().sum()
remaining_test_nan = test_df.isna().sum()

print("Number of NaN in train df:\n", remaining_train_nan)
print("\n")
print("Number of NaN in test df:\n", remaining_test_nan)

## 2.3 create dummy variables

In [None]:
# One-hot encode the categorical columns of both splits with identical settings.
# dummy_na=True adds a "<column>_nan" indicator, which is how the missing
# Cabin values (not imputed above) are represented.
dummy_columns = ["Pclass", "Sex", "Cabin", "Embarked"]

train_df_dummy = pd.get_dummies(data = train_df,
                                dummy_na  = True,
                                columns = dummy_columns,
                                dtype = int
                               )

test_df_dummy = pd.get_dummies(data = test_df,
                               dummy_na  = True,
                               columns = dummy_columns,
                               dtype = int
                              )

print(train_df_dummy.shape)
print(test_df_dummy.shape)

# make sure test df has same columns as train df
cols = train_df_dummy.columns

# A single reindex aligns the test frame to the training feature layout:
#   * categories seen only in train become all-zero columns,
#   * categories seen only in test are discarded,
#   * columns end up in the training order, without the "Survived" target.
# Doing this in one call avoids inserting columns one at a time, which
# fragments the frame and makes pandas emit a PerformanceWarning.
test_df_dummy = test_df_dummy.reindex(columns = cols.drop("Survived"), fill_value = 0)

In [None]:
# Show the encoded train and test frames one after the other.
display(train_df_dummy, test_df_dummy)

## 2.4 create numpy arrays for training and testing

In [None]:
# Split features and target by column name instead of by position, so the
# split does not silently depend on "Survived" being the first column.
# Taking the target straight from its column also keeps the labels as
# integers rather than as floats pulled out of a mixed-dtype matrix.
train_x = train_df_dummy.drop(columns = ["Survived"]).to_numpy()
train_y = train_df_dummy["Survived"].to_numpy()

print(train_x.shape)
print(train_y.shape)

In [None]:
# Plain ndarray view of the encoded test features for the estimators below.
test_x = test_df_dummy.to_numpy()
print(test_x.shape)

# 3. Exploratory Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Class balance of the training labels (1 = survived, so the sum is the survivor count).
n_survived = np.sum(train_y)
n_passengers = train_y.shape[0]

print("number of people survived: ", n_survived)
print("number of people not survived: ", n_passengers - n_survived)


In [None]:
# One box plot per numeric feature, split by survival outcome.
fig, axes = plt.subplots(nrows = 1,
                         ncols = 4,
                         figsize = (10, 3.5),
                         layout = 'constrained')

for ax, col in zip(axes, ["Age", "SibSp", "Parch", "Fare"]):
    sns.boxplot(x='Survived', y=col, data=train_df, ax=ax)

plt.show()

In [None]:
# One stacked bar chart per categorical feature: survivors vs. non-survivors
# within each category (missing categories are kept as their own group).
fig, axes = plt.subplots(nrows=1,
                         ncols=3,
                         figsize = (10, 3.5),
                         layout = 'constrained')

for ax, col in zip(axes, ["Pclass", "Sex", "Embarked"]):

    # Survivors and group size per category in a single aggregation.
    sub_df = (train_df
              .groupby(col, dropna = False)["Survived"]
              .agg(Survived = "sum", Count = "count"))

    # Non-survivors are the rest of each group; the helper count column is removed.
    sub_df["Not Survived"] = sub_df.pop("Count") - sub_df["Survived"]

    sub_df.plot(kind='bar', stacked=True, ax=ax)

plt.show()

# 4. Train & Predict

In [None]:
from sklearn.model_selection import GridSearchCV, KFold

def display_cv_result(grid_search_cv):
    """Display every evaluated parameter combination, best score first.

    Parameters
    ----------
    grid_search_cv : fitted search object
        Must expose ``cv_results_`` with the keys ``'params'`` and
        ``'mean_test_score'``.

    The table has one column per hyper-parameter plus ``mean_test_score``
    and is sorted by that score in descending order. Nothing is returned;
    the table is rendered with ``display``.
    """
    results = grid_search_cv.cv_results_
    ranked = (
        pd.DataFrame(results['params'])
        .assign(mean_test_score = results['mean_test_score'])
        .sort_values(by = 'mean_test_score', ascending = False)
    )

    display(ranked)

# Shared 5-fold splitter for every grid search below. The fixed random_state
# makes the shuffled folds identical on every run, so cross-validation scores
# are reproducible after a kernel restart.
cv_splitter = KFold(n_splits = 5, shuffle = True, random_state = 42)


## 4.1 Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Seed the forest so bootstrap samples and feature sub-sampling, and therefore
# the selected hyper-parameters and predictions, are reproducible across runs.
RandomForest = RandomForestClassifier(random_state = 42)

param = {'n_estimators': [i*10 for i in range(3,31)],   # 30, 40, ..., 300 trees
         # scikit-learn documents "log_loss" and "entropy" as the same
         # Shannon-information-gain criterion, so only one is searched.
         'criterion': ["gini", "entropy"],
         'class_weight': [None, 'balanced'],
         'max_depth': [2,3,4,5]
        }

RandomForest_select = GridSearchCV(estimator = RandomForest,
                                   param_grid = param,
                                   scoring = "accuracy",
                                   cv = cv_splitter,
                                   n_jobs = -1,   # candidate fits are independent; use all cores
                                   return_train_score = True
                                  )

RandomForest_select.fit(train_x, train_y)

In [None]:
# Table of all Random Forest candidates, sorted by mean cross-validated accuracy (best first).
display_cv_result(RandomForest_select)

In [None]:
# Predict test-set labels through the fitted grid search.
# NOTE(review): this relies on GridSearchCV's refit behaviour (the notebook does
# not override the default) to expose predict() on the best candidate.
prediction = RandomForest_select.predict(test_x)

In [None]:
# Build the submission table (one row per test passenger, integer label)
# and write it to the working directory.
submission_columns = {"PassengerId": test_PassengerID,
                      "Survived": list(prediction)}

pred_df = pd.DataFrame(submission_columns).astype({"Survived": int})

display(pred_df)

pred_df.to_csv('/kaggle/working/titanic-Random Forest.csv', index = False)

## 4.2 XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
# Seeded so repeated runs select the same model.
xgb = XGBClassifier(random_state = 42)

n_estimators_grid = [i*10 for i in range(3,31)]   # 30, 40, ..., 300 boosting rounds
max_depth_grid = [2,3,4,5]

# The search space is a list of sub-grids so that each booster is only crossed
# with parameters that actually apply to it:
#   * hist trees: grow_policy is documented as applying to hist/approx only,
#     and hist is a GPU-capable tree method.
#   * exact trees: searched on the CPU and without grow_policy.
#     NOTE(review): to my knowledge XGBoost rejects tree_method='exact' with
#     device='cuda' ("not supported on GPU"), which would turn every such
#     candidate into a failed fit with a NaN score -- confirm for the installed version.
#   * gblinear: has no trees, so max_depth / tree_method / grow_policy are
#     unused; crossing it with them only repeated the same model many times.
param = [
    {'booster': ['gbtree', 'dart'],
     'tree_method': ['hist'],
     'grow_policy': ['lossguide', 'depthwise'],
     'n_estimators': n_estimators_grid,
     'max_depth': max_depth_grid,
     'device': ['cuda']
    },
    {'booster': ['gbtree', 'dart'],
     'tree_method': ['exact'],
     'n_estimators': n_estimators_grid,
     'max_depth': max_depth_grid,
     'device': ['cpu']
    },
    {'booster': ['gblinear'],
     'n_estimators': n_estimators_grid,
     'device': ['cuda']
    },
]

xgb_select = GridSearchCV(estimator = xgb,
                          param_grid = param,
                          scoring = "accuracy",
                          cv = cv_splitter,
                          return_train_score = True
                         )

xgb_select.fit(train_x, train_y)

In [None]:
# Table of all XGBoost candidates, sorted by mean cross-validated accuracy (best first).
display_cv_result(xgb_select)

In [None]:
# Predict with the selected XGBoost model and write the submission file.
prediction = xgb_select.predict(test_x)

submission_columns = {"PassengerId": test_PassengerID,
                      "Survived": list(prediction)}

pred_df = pd.DataFrame(submission_columns).astype({"Survived": int})

display(pred_df)

pred_df.to_csv('/kaggle/working/titanic-xgboost.csv', index = False)

## 4.3 SVM

In [None]:
from sklearn.svm import SVC

In [None]:
# SVMs are not scale invariant: unscaled features with a large range (such as
# Fare) dominate the kernel distances and slow down the linear kernel.
# Standardising inside a Pipeline means the scaler is re-fitted on the training
# part of every CV fold, so no statistics leak from the validation part, and it
# is applied automatically again at predict time.
svm = Pipeline([("scale", StandardScaler()),
                ("svc", SVC())])

# Pipeline parameters are addressed as "<step name>__<parameter>".
param = {'svc__C': [i*0.5 for i in range(2,9)],   # 1.0, 1.5, ..., 4.0
         'svc__kernel': ['rbf','linear'],
         'svc__class_weight':[None, 'balanced']
        }

svm_select = GridSearchCV(estimator = svm,
                          param_grid = param,
                          scoring = "accuracy",
                          cv = cv_splitter,
                          return_train_score = True
                         )

svm_select.fit(train_x, train_y)

display_cv_result(svm_select)

In [None]:
# Predict with the selected SVM and write the submission file.
prediction = svm_select.predict(test_x)

submission_columns = {"PassengerId": test_PassengerID,
                      "Survived": list(prediction)}

pred_df = pd.DataFrame(submission_columns).astype({"Survived": int})

display(pred_df)

pred_df.to_csv('/kaggle/working/titanic-svm.csv', index = False)