#### 단계적으로 EDA를 통해 전처리와 모델 정규화를 진행 예정입니다.
#### 이번 코드는 과연 최소한의 작업으로 얼마 정도의 성과가 나올수 있는지 확인해 보겠습니다.
#### 다음에는 상위권 결과가 나올수 있도록 하겠습니다.

#### I plan to carry out preprocessing and model normalization step by step, guided by EDA.
#### This notebook checks how much performance can be achieved with the least amount of work.
#### Next time, I will aim for a top-ranking result.

### 1. 모듈 불러오기 (Import module)
 
#### 제가 가장 자주 쓰는 모듈들을 불러올 예정입니다.
#### This cell loads the modules I use most often.

In [None]:
import seaborn as sns
import sys
import csv
import datetime
import operator
import joblib
import warnings
warnings.simplefilter('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from statsmodels.formula.api import ols
from sklearn.metrics import cohen_kappa_score
from collections import OrderedDict
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from scipy.stats import norm, skew, probplot

### 2. 데이터 불러오기(Read Dataset)

In [None]:
# Read the Kaggle Titanic training and test CSV files into DataFrames.
df_train, df_test = map(
    pd.read_csv,
    ('/kaggle/input/titanic/train.csv', '/kaggle/input/titanic/test.csv'),
)

### 3. 데이터 전처리(Preprocessing)

#### 이미 많은 분들이 EDA를 하여 변수에 대한 분석이 되어 있는바, 이부분은 제외하고 진행하겠습니다.
#### A lot of people have already done EDA to analyze the variables, so I will exclude this part.

In [None]:
# Show how many missing (NaN) values each column of the training frame has.
df_train.isna().sum()

In [None]:
# Remove the same three columns from both frames so they are never passed
# to the one-hot encoder or the model.
unused_cols = ['Name', 'Ticket', 'Cabin']
df_train = df_train.drop(columns=unused_cols)
df_test = df_test.drop(columns=unused_cols)

In [None]:
# Fill missing embarkation ports with the most frequent port in the training
# data.  Series.mode() returns a Series (there can be ties), and fillna() with
# a Series aligns on the index, so passing it directly would only fill a NaN
# that happens to sit at index label 0.  Take the first mode as a scalar so
# every missing value is filled.
embarked_mode = df_train['Embarked'].mode().iloc[0]
df_train['Embarked'] = df_train['Embarked'].fillna(embarked_mode)
df_test['Embarked'] = df_test['Embarked'].fillna(embarked_mode)

# Mark unknown ages with the sentinel -1 in both frames.
df_train['Age'] = df_train['Age'].fillna(-1)
df_test['Age'] = df_test['Age'].fillna(-1)

In [None]:
# One-hot encode the non-numeric columns of both frames.  The encoded test
# data is stored under the separate name `test`.
# The indicator dtype is pinned to uint8 (the pandas < 2.0 default): pandas
# 2.x emits bool indicators by default, and a frame mixing bool with numeric
# columns turns into an object-dtype array when `.values` is taken later.
df_train = pd.get_dummies(df_train, dtype=np.uint8)
test = pd.get_dummies(df_test, dtype=np.uint8)

In [None]:
# Print the column names, non-null counts and dtypes of the training frame.
df_train.info()

In [None]:
# Keep the full training data reachable under the name `df`, because it is
# needed later on.
# NOTE: this binds a second name to the same DataFrame; it is not a copy.
df = df_train

In [None]:
# Seed shared by both splits so the partitioning is reproducible.
random_state_val = 42

# First split: set aside 1% of the labelled data as a hold-out set.
# This rebinds `df_test` to that hold-out slice of the training data.
test_size_val = 0.01
df_trval, df_test = train_test_split(
    df,
    random_state=random_state_val,
    test_size=test_size_val,
)

# Second split: 80% of the remainder for training, 20% for validation.
test_size_val = 0.2
df_train, df_val = train_test_split(
    df_trval,
    random_state=random_state_val,
    test_size=test_size_val,
)

In [None]:
# Name of the target column, and the list of columns removed from the
# feature matrices.
y_nm = 'Survived'
drop_col = ['Survived']

# Targets: one single-column DataFrame per split.
df_train_y = pd.DataFrame(df_train[y_nm])
df_val_y = pd.DataFrame(df_val[y_nm])
df_test_y = pd.DataFrame(df_test[y_nm])

# Features: every column except the target.
df_train_x = df_train.drop(columns=drop_col)
df_val_x = df_val.drop(columns=drop_col)
df_test_x = df_test.drop(columns=drop_col)

In [None]:
# Hyper-parameters for the gradient-boosted tree classifier.
xgb_params = {
    'max_depth': 9,
    'learning_rate': 0.001,
    'n_estimators': 5000,
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'gamma': 0.04,
    'max_delta_step': 5,
    'min_child_weight': 2.8189,
    'subsample': 0.8104,
    'colsample_bytree': 0.6332,
}

# NOTE: the unfitted estimator instance is stored under the name
# `XGBClassifier`, which is what the training cell calls `.fit` on.
XGBClassifier = xgb.XGBClassifier(**xgb_params)

In [None]:
# Train the classifier with early stopping on the validation log-loss and
# report how long the fit took.
# NOTE: the fitted model is bound to the name `xgb`, which replaces the
# imported xgboost module from this point on; the cells below use that name.

# Labels as flat 1-D arrays, used both for fitting and in the evaluation sets.
y_train_arr = df_train_y.values.ravel()
y_val_arr = df_val_y.values.ravel()
fit_eval_set = [(df_train_x.values, y_train_arr), (df_val_x.values, y_val_arr)]

start = datetime.datetime.now()
try:
    xgb = XGBClassifier.fit(df_train_x.values,
                            y_train_arr,
                            eval_set=fit_eval_set,
                            eval_metric='logloss',
                            early_stopping_rounds=10,
                            verbose=False)
except TypeError:
    # Newer xgboost releases no longer accept eval_metric and
    # early_stopping_rounds as fit() keywords; they are estimator
    # parameters there, so set them on the estimator and fit again.
    XGBClassifier.set_params(eval_metric='logloss', early_stopping_rounds=10)
    xgb = XGBClassifier.fit(df_train_x.values,
                            y_train_arr,
                            eval_set=fit_eval_set,
                            verbose=False)
end = datetime.datetime.now()

# Elapsed wall-clock time, shown as the cell output.
end - start

In [None]:
# 'weight' importance scores from the trained booster.  The model was fitted
# on bare arrays, so the booster's keys are positional ('f0', 'f1', ...);
# map them back to column names and use 0 for features with no score.
fi_vals = xgb.get_booster().get_score(importance_type='weight')
fi_dict = {
    col_name: float(fi_vals.get(f'f{pos}', 0.))
    for pos, col_name in enumerate(df_train_x.columns)
}

# Rank features from highest to lowest score.
feature_importance_ = sorted(fi_dict.items(), key=lambda pair: pair[1], reverse=True)
feature_importance_result = OrderedDict(feature_importance_)

# Tabular view of the ranking; show the first ten rows.
importance = pd.DataFrame(feature_importance_, columns=['feature', 'weight'])
importance.head(10)

In [None]:
# Horizontal bar chart of the ten highest-ranked features, sorted by weight.
importance_ten = importance.iloc[:10]
ten_sorted = importance_ten.set_index('feature').sort_values(by='weight')
ten_sorted.plot(kind='barh', figsize=(5, 5))

In [None]:
# Sweep the decision threshold from 0.01 to 0.50 in steps of 0.01 on the
# validation split, record the metrics for each threshold, and remember the
# threshold with the best accuracy.
val_y_true = df_val_y.values.ravel()
val_y_prob = xgb.predict_proba(df_val_x.values)[:, 1]

result_lst = []
max_accuracy = 0.
opt_threshold = 0.

for step in range(1, 51):
    threshold = round(step * 0.01, 2)
    # Predict class 1 when the probability is strictly above the threshold.
    pred_yn = np.where(val_y_prob > threshold, 1., 0.)

    precision, recall, f1_score, support = precision_recall_fscore_support(
        val_y_true, pred_yn, average='binary')
    accuracy = accuracy_score(val_y_true, pred_yn)
    kappa = cohen_kappa_score(val_y_true, pred_yn)

    result_dict = {
        'Threshold': threshold,
        'Accuracy': round(accuracy, 4),
        'Precision': round(precision, 4),
        'Recall': round(recall, 4),
        'F1_Score': round(f1_score, 4),
        'Kappa': round(kappa, 4),
    }
    result_lst.append(result_dict)

    # On ties the later (higher) threshold wins.
    if accuracy >= max_accuracy:
        max_accuracy, opt_threshold = accuracy, threshold

    # Overwritten every iteration; after the loop it holds the matrix for
    # the last threshold tried.
    confMat = confusion_matrix(val_y_true, pred_yn, labels=[1, 0])

# Persist the per-threshold metrics table.
metric_columns = ['Threshold', 'Accuracy', 'Precision', 'Recall', 'F1_Score', 'Kappa']
matric_df = pd.DataFrame(result_lst, columns=metric_columns)
matric_df.to_csv('REC_scores.csv', sep=',', header=True, index=False, encoding='UTF-8')

print('Max Accuracy =%f, optimized_threshold=%f' % (max_accuracy, opt_threshold))
print('Complete')

In [None]:
# Score the training split and binarise with the selected threshold.
predict_xgb = xgb.predict_proba(df_train_x.values)[:, 1]
pred_train = (predict_xgb > opt_threshold).astype(float)

# With labels=[1, 0] the matrix rows are (true 1, true 0) and the columns
# (predicted 1, predicted 0), i.e. [[tp, fn], [fp, tn]].
(tp, fn), (fp, tn) = confusion_matrix(
    df_train_y.values.ravel(), pred_train, labels=[1, 0])

In [None]:
# Confusion matrix of the training-split predictions as a labelled frame:
# rows are true classes, columns are predicted classes.
train_y_true = df_train_y.values.ravel()
conf_matrix = pd.DataFrame(
    confusion_matrix(train_y_true, pred_train),
    index=['True Value 0', 'True Value 1'],
    columns=['Predicted Value 0', 'Predicted Value 1'],
)

# Printed transposed, so predicted classes appear as rows.
print("1. Counfusion Matrix")
print(conf_matrix.T)
print("")

print("2. Classification Report")
print(classification_report(train_y_true, pred_train))

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# ROC curve and area under it for the training-split probabilities.
fpr, tpr, _ = roc_curve(df_train_y.values.ravel(), predict_xgb)
roc_auc = auc(fpr, tpr)

# Draw the curve together with the diagonal chance line.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.3f)' % roc_auc)
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend(loc="lower right")
plt.show()

In [None]:
# Summary metrics derived from the confusion counts tp / fn / fp / tn.
actual_pos = tp + fn
actual_neg = tn + fp
predicted_pos = tp + fp

Accuracy_Rate = (tp + tn) / (actual_pos + actual_neg)
Recall_Rate = tp / actual_pos
Precision_Rate = tp / predicted_pos
Specificity_Rate = tn / actual_neg
F1_Score = (Precision_Rate * Recall_Rate) / (Precision_Rate + Recall_Rate) * 2

# Each value is scaled by 100 before printing.
print("3. Model Metric Sumamry")
summary_rows = (
    (" - Accuracy Rate    : {:2.3f} %", Accuracy_Rate),
    (" - Recall Rate      : {:2.3f} %", Recall_Rate),
    (" - Precision Rate   : {:2.3f} %", Precision_Rate),
    (" - Specificity Rate : {:2.3f} %", Specificity_Rate),
    (" - F1 Score         : {:2.3f} ", F1_Score),
    (" - ROC AUC          : {:2.3f} ", roc_auc),
)
for template, value in summary_rows:
    print(template.format(value * 100))

In [None]:
# Predict on the encoded Kaggle test set and write the submission file.
# The model was trained on a bare array, so features are matched purely by
# position.  Re-order the test columns to the training feature order; a
# column missing from the test encoding is filled with 0 and any extra
# column is dropped, so a category that appears in only one of the two
# files cannot silently shift the features.
test_x = test.reindex(columns=df_train_x.columns, fill_value=0)
predict_xgb = xgb.predict_proba(test_x.values)[:, 1]
pred_test = np.where(predict_xgb > opt_threshold, 1., 0.)

# Pair each passenger id with its predicted label, by position.
Id_No = test['PassengerId']
submission = pd.DataFrame({'PassengerId': Id_No.to_numpy(), 'Survived': pred_test})
# Store the labels as integers so the CSV contains 0/1 rather than 0.0/1.0.
submission['Survived'] = submission['Survived'].astype('Int64')
submission.to_csv('submission.csv', index=False)
submission.head()