# Introduction

This notebook is for beginners.

**Sorry, there are only a few explanations so far; I will add more.**

In [None]:
# reference
# https://github.com/ghmagazine/kagglebook/blob/master/ch01/ch01-01-titanic.py

In [None]:
import numpy as np
import pandas as pd

# Read training data and test data

In [None]:
# Both CSV files live in the same Kaggle input directory
INPUT_DIR = '/kaggle/input/tabular-playground-series-apr-2021'
train = pd.read_csv(f'{INPUT_DIR}/train.csv')
test = pd.read_csv(f'{INPUT_DIR}/test.csv')

In [None]:
# Show the training data as this cell's output
train

In [None]:
# Show the test data as this cell's output
test

# Profiling

In [None]:
import pandas_profiling

In [None]:
# Render a profiling report for the training data.
# NOTE(review): profile_report is presumably attached to DataFrame by the
# pandas_profiling import in the previous cell - confirm it is still available
# in the target environment.
train.profile_report()

In [None]:
# Separate the target column ('Survived') from the feature columns
train_y = train['Survived']
train_x = train.drop(columns=['Survived'])

In [None]:
# The test data contains features only, so an independent copy is used as-is.
test_x = test.copy(deep=True)

# Create features

In [None]:
# Columns removed from the feature tables: PassengerId, Name, Ticket and Cabin
unused_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']
train_x = train_x.drop(columns=unused_cols)
test_x = test_x.drop(columns=unused_cols)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label-encode the categorical columns.
# Missing values become the literal category 'NA'; the encoder is fitted on the
# training column only and then applied to both the training and test columns.
le = LabelEncoder()
for c in ('Sex', 'Embarked'):
    train_filled = train_x[c].fillna('NA')
    test_filled = test_x[c].fillna('NA')
    le.fit(train_filled)
    train_x[c] = le.transform(train_filled)
    test_x[c] = le.transform(test_filled)

In [None]:
# Preview the first rows of the encoded training features
train_x.head()

# Create model

In [None]:
from xgboost import XGBClassifier

In [None]:
# Build the classifier from its settings, then fit it on the full training data
xgb_params = {'n_estimators': 20, 'random_state': 71}
model = XGBClassifier(**xgb_params)
model.fit(train_x, train_y)

In [None]:
# Predict class probabilities for the test data and keep the second column
# (index 1), i.e. the probability of the positive class
test_proba = model.predict_proba(test_x)
pred = test_proba[:, 1]

In [None]:
# Turn probabilities into 0/1 labels: 1 when the probability exceeds 0.5
pred_label = (pred > 0.5).astype(int)

In [None]:
# Build the submission table (PassengerId + predicted label), save it, show it
submission = test[['PassengerId']].assign(Survived=pred_label)
submission.to_csv('submission_00.csv', index=False)
submission

# Validation

In [None]:
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import KFold

In [None]:
# 4-fold cross validation: record log loss and accuracy for every fold
kf = KFold(n_splits=4, shuffle=True, random_state=71)
scores_logloss, scores_accuracy = [], []
for tr_idx, va_idx in kf.split(train_x):
    # rows of this fold used for fitting / for validation
    tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
    va_x, va_y = train_x.iloc[va_idx], train_y.iloc[va_idx]
    # fit a fresh model on the training part only
    model = XGBClassifier(n_estimators=20, random_state=71)
    model.fit(tr_x, tr_y)
    # positive-class probability for the held-out rows
    va_pred = model.predict_proba(va_x)[:, 1]
    # score the fold; accuracy thresholds the probability at 0.5
    logloss = log_loss(va_y, va_pred)
    accuracy = accuracy_score(va_y, va_pred > 0.5)
    scores_logloss.append(logloss)
    scores_accuracy.append(accuracy)

In [None]:
# Per-fold scores, one list per line: log loss first, then accuracy
print(scores_logloss, scores_accuracy, sep='\n')

In [None]:
# Average each metric over the folds and report both
logloss, accuracy = (np.mean(fold_scores) for fold_scores in (scores_logloss, scores_accuracy))
print(f'logloss: {logloss:.4f}, accuracy: {accuracy:.4f}')

# Model tuning

In [None]:
import itertools

In [None]:
# Candidate values for each hyperparameter explored during tuning
param_space = dict(
    max_depth=[3, 5, 7],
    min_child_weight=[1.0, 2.0, 4.0],
)

In [None]:
# Every (max_depth, min_child_weight) pair built from the candidate lists
param_combinations = itertools.product(param_space['max_depth'], param_space['min_child_weight'])

# The splitter has a fixed integer seed, so a single instance produces the same
# folds for every parameter pair
kf = KFold(n_splits=4, shuffle=True, random_state=123456)

params = []
scores = []
for max_depth, min_child_weight in param_combinations:
    # log loss of each validation fold for this parameter pair
    score_folds = []
    for tr_idx, va_idx in kf.split(train_x):
        tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
        va_x, va_y = train_x.iloc[va_idx], train_y.iloc[va_idx]
        # fit on the training part of the fold
        model = XGBClassifier(n_estimators=20, random_state=71,
                              max_depth=max_depth, min_child_weight=min_child_weight)
        model.fit(tr_x, tr_y)
        # evaluate on the held-out part
        va_pred = model.predict_proba(va_x)[:, 1]
        logloss = log_loss(va_y, va_pred)
        score_folds.append(logloss)
    # one entry per pair: the parameters and their mean log loss across folds
    score_mean = np.mean(score_folds)
    params.append((max_depth, min_child_weight))
    scores.append(score_mean)

In [None]:
# Rank the parameter pairs by mean log loss (ascending) and keep the first one
ranking = np.argsort(scores)
best_idx = ranking[0]
best_param = params[best_idx]
print(f'max_depth: {best_param[0]}, min_child_weight: {best_param[1]}')

# Create features for logistic regression

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Second feature set (for logistic regression): start again from the raw frames
# and drop the identifier plus Name, Ticket and Cabin (and the target from train)
unused_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']
train_x2 = train.drop(columns=['Survived'] + unused_cols)
test_x2 = test.drop(columns=unused_cols)

In [None]:
# One-hot encoding: fit the encoder on the categorical columns of the training
# data, with missing values replaced by the literal category 'NA'.
cat_cols = ['Sex', 'Embarked', 'Pclass']
# Request dense (ndarray) output. scikit-learn 1.2 renamed the `sparse` keyword
# to `sparse_output` and 1.4 removed the old name, so try the new keyword first
# and fall back to the old one on versions that do not know it.
try:
    ohe = OneHotEncoder(categories='auto', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(categories='auto', sparse=False)
ohe.fit(train_x2[cat_cols].fillna('NA'))

In [None]:
# Names for the one-hot columns: '<original column>_<category>', in the same
# order as the encoder's categories
ohe_columns = [
    f'{c}_{v}'
    for c, categories in zip(cat_cols, ohe.categories_)
    for v in categories
]

ohe_columns

In [None]:
# Apply the fitted encoder to both data sets.
# The encoded frames reuse the index of the frame they were built from:
# pd.concat(axis=1) aligns rows on the index, so a fresh default index would
# misalign rows (and introduce NaNs) whenever the source index is not 0..n-1.
ohe_train_x2 = pd.DataFrame(ohe.transform(train_x2[cat_cols].fillna('NA')),
                            columns=ohe_columns, index=train_x2.index)
ohe_test_x2 = pd.DataFrame(ohe.transform(test_x2[cat_cols].fillna('NA')),
                           columns=ohe_columns, index=test_x2.index)
# The original categorical columns are no longer needed
train_x2 = train_x2.drop(cat_cols, axis=1)
test_x2 = test_x2.drop(cat_cols, axis=1)
# Append the one-hot columns to the remaining features
train_x2 = pd.concat([train_x2, ohe_train_x2], axis=1)
test_x2 = pd.concat([test_x2, ohe_test_x2], axis=1)

In [None]:
# Preview the first rows of the training features after one-hot encoding
train_x2.head()

In [None]:
# Preview the first rows of the test features after one-hot encoding
test_x2.head()

In [None]:
# Count the missing values in each training column
train_x2.isnull().sum()

In [None]:
# Replace missing values in the numeric columns with the training-set mean.
# Only Age and Fare are imputed here.
num_cols = ['Age', 'Fare']
for col in num_cols:
    # The mean is taken from the training data once and reused for the test data.
    train_mean = train_x2[col].mean()
    # Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
    # that chained in-place form warns on pandas 2.x and leaves the frame
    # unchanged under Copy-on-Write.
    train_x2[col] = train_x2[col].fillna(train_mean)
    test_x2[col] = test_x2[col].fillna(train_mean)

# Check the remaining missing values per training column
train_x2.isnull().sum()

In [None]:
# Preview the first rows of the test features after imputation
test_x2.head()

In [None]:
!pip install ptitprince

In [None]:
from ptitprince import RainCloud
import matplotlib.pyplot as plt

In [None]:
# Raincloud plot of Fare, drawn horizontally on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 10))
RainCloud(y='Fare', data=train_x2, orient='h', ax=ax)
ax.grid()

In [None]:
# Replace Fare by log(1 + Fare) in both the training and the test features
for features in (train_x2, test_x2):
    features['Fare'] = np.log1p(features['Fare'])

In [None]:
# Raincloud plot of the current Fare column, drawn horizontally on an
# explicitly created axes
fig, ax = plt.subplots(figsize=(10, 10))
RainCloud(y='Fare', data=train_x2, orient='h', ax=ax)
ax.grid()

# Ensemble

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# xgboost: fitted on the label-encoded features (train_x / test_x)
model_xgb = XGBClassifier(n_estimators=20, random_state=71).fit(train_x, train_y)
xgb_proba = model_xgb.predict_proba(test_x)
pred_xgb = xgb_proba[:, 1]

# logistic regression: uses its own feature set (train_x2 / test_x2), which
# differs from the one given to xgboost
model_lr = LogisticRegression(solver='lbfgs', max_iter=300).fit(train_x2, train_y)
lr_proba = model_lr.predict_proba(test_x2)
pred_lr = lr_proba[:, 1]

In [None]:
# Weighted average of the two probability vectors (0.8 xgboost, 0.2 logistic
# regression), then label 1 where the blended probability exceeds 0.5
w_xgb, w_lr = 0.8, 0.2
pred = w_xgb * pred_xgb + w_lr * pred_lr
pred_label = (pred > 0.5).astype(int)

In [None]:
# Preview of the ensemble submission table (nothing is written to disk here)
submission = pd.DataFrame({'PassengerId': test['PassengerId']})
submission['Survived'] = pred_label
submission

In [None]:
# Build the ensemble submission table (PassengerId + predicted label), save it, show it
submission = test[['PassengerId']].assign(Survived=pred_label)
submission.to_csv('submission_01.csv', index=False)
submission