In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the training and test splits of the competition data.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-apr-2021/train.csv',
        '../input/tabular-playground-series-apr-2021/test.csv',
    )
)

In [None]:
# Preview the first rows of the raw training data.
train.head()

We will choose the features and target data

In [None]:
# Count the distinct (non-missing) values in every column of the training data.
train.nunique()

In [None]:
# Number of columns in the training data.
len(train.columns)

In [None]:
# Build the test feature frame from a copy, so that `test` itself keeps every
# original column (PassengerId is read from `test` when writing submissions).
test_x = test.copy()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns judged insignificant for modelling are removed from both the
# training frame and the test feature frame.
unused_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']
train = train.drop(columns=unused_cols)
test_x = test_x.drop(columns=unused_cols)

# Integer-encode the categorical columns. Missing values are first replaced by
# the placeholder 'NA' so they receive a code of their own. Each encoder is
# fitted on the training column only and then reused for the test column.
for cat_col in ('Sex', 'Embarked'):
    encoder = LabelEncoder()
    train[cat_col] = encoder.fit_transform(train[cat_col].fillna('NA'))
    test_x[cat_col] = encoder.transform(test_x[cat_col].fillna('NA'))

What percentage of the values in each column are unique?

In [None]:
# For every column print: its position, the number of distinct values, and the
# distinct values as a percentage of the number of rows.
n_rows = train.shape[0]
for col_pos in range(train.shape[1]):
    n_distinct = len(np.unique(train.iloc[:, col_pos]))
    share = float(n_distinct) / n_rows * 100
    print('%d, %d, %.1f%%' % (col_pos, n_distinct, share))

Find duplicates

In [None]:
# Boolean mask marking every row that repeats an earlier row.
dups = train.duplicated(keep='first')

# Report whether any duplicate exists, then list the duplicated rows.
print(dups.any())
print(train.loc[dups])

Delete dups

In [None]:
# Show the shape before and after removing duplicated rows.
# The surviving rows keep their original index labels, so the index may
# contain gaps afterwards.
print(train.shape)
train = train.drop_duplicates()
print(train.shape)

In [None]:
# Three-sigma rule on Age: values further than three (population) standard
# deviations from the mean are treated as outliers. Missing values are ignored
# by the statistics and never match either comparison.
age = train['Age']
data_mean = age.mean()
data_std = age.std(ddof=0)

cut_off = data_std * 3
lower = data_mean - cut_off
upper = data_mean + cut_off

outliers = age[(age < lower) | (age > upper)].tolist()
print('Number of outliers: %d' % len(outliers))

In [None]:
# Same three-sigma check as for Age, applied to Fare.
fare = train['Fare']
data_mean, data_std = fare.mean(), fare.std(ddof=0)

# Bounds of the accepted range: mean +/- 3 population standard deviations.
cut_off = data_std * 3
lower, upper = data_mean - cut_off, data_mean + cut_off

# Keep only the fares outside the accepted range (NaN matches neither test).
outliers = fare[fare.lt(lower) | fare.gt(upper)].tolist()
print('Number of outliers: %d' % len(outliers))

In [None]:
# Show the first ten values flagged by the most recently run outlier cell.
outliers[:10]

WOW. It's interesting that there are 2217 outliers in the fare, and how high-priced they are. It doesn't seem like we should cut these. However, it would be interesting to see how many of these passengers survived. 

Perhaps we need to split the fares. 

In [None]:
# The ten rows with the highest fares.
train.sort_values(by='Fare', ascending=False).head(10)

In [None]:
# The ten rows with the lowest fares.
train.sort_values(by='Fare', ascending=True).head(10)

In [None]:
# familyMembers: sum of the Parch and SibSp columns.
# `assign` adds (or overwrites) the column by name, so re-running this cell
# does not append a second, duplicate 'familyMembers' column the way the
# concat + rename-of-column-0 approach did.
train = train.assign(familyMembers=train['Parch'] + train['SibSp'])

In [None]:
# Check that the familyMembers column is now part of the training data.
train.head()

In [None]:
# familyMembers: sum of the Parch and SibSp columns.
# Both operands are read from test_x (the frame being extended) instead of
# mixing test_x and test; `assign` also keeps the cell safe to re-run because
# it overwrites the column instead of appending a duplicate.
test_x = test_x.assign(familyMembers=test_x['Parch'] + test_x['SibSp'])

In [None]:
# Preview the test feature frame: the engineered column lives in test_x,
# not in the untouched `test` frame.
test_x.head()

In [None]:
# farePerFamily: the fare divided by (familyMembers + 1); the +1 also keeps
# the divisor non-zero when familyMembers is 0.
# `assign` overwrites the column on re-run instead of appending a duplicate.
train = train.assign(farePerFamily=train['Fare'] / (train['familyMembers'] + 1))

In [None]:
# Check that the farePerFamily column is now part of the training data.
train.head()

In [None]:
# farePerFamily: the fare divided by (familyMembers + 1); the +1 also keeps
# the divisor non-zero when familyMembers is 0.
# `assign` overwrites the column on re-run instead of appending a duplicate.
test_x = test_x.assign(farePerFamily=test_x['Fare'] / (test_x['familyMembers'] + 1))

In [None]:
# Check that the farePerFamily column is now part of the test features.
test_x.head()

In [None]:
# Separate the target column (Survived) from the feature columns.
train_y = train['Survived']
train_x = train.drop(columns=['Survived'])

In [None]:
# Training features: every column of `train` except Survived.
train_x.head()

In [None]:
# Test features, for comparison with the training feature columns.
test_x.head()

Create our Model. In this case we will do xgboost

In [None]:
from xgboost import XGBClassifier

# Baseline: an XGBoost classifier with 20 trees, fitted on all training rows.
model = XGBClassifier(n_estimators=20, random_state=71)
model.fit(train_x, train_y)

# Column 1 of predict_proba holds the predicted probability of the second class.
pred = model.predict_proba(test_x)[:, 1]

# Threshold the probabilities at 0.5 to obtain hard 0/1 labels.
pred_label = (pred > 0.5).astype(int)

# Write the submission file: PassengerId from the untouched test frame plus
# the predicted label.
submission = test[['PassengerId']].assign(Survived=pred_label)
submission.to_csv('submission_first.csv', index=False)

If you submit the above you will get a 0.52224 score without any feature engineering

However, with the feature engineering above my score is: .78404

Cross Validation

Therefore, we will also output a metric called log loss. Log loss is computed from the predicted probabilities:
it penalizes predictions that are far from the true label, so the lower the log loss, the better the model. Learn more about logloss
For example, "2.3.4 Evaluation index in binary classification-when the probability of being a positive example is used as the predicted value"


In [None]:
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import KFold

# Per-fold metrics are collected in these lists.
scores_accuracy, scores_logloss = [], []

# 4-fold cross validation with shuffled, reproducible splits.
kf = KFold(n_splits=4, shuffle=True, random_state=71)
for tr_idx, va_idx in kf.split(train_x):
    # Rows of this fold used for fitting (tr_*) and for validation (va_*).
    tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
    va_x, va_y = train_x.iloc[va_idx], train_y.iloc[va_idx]

    # Fit a fresh model on the fitting part of the fold.
    model = XGBClassifier(n_estimators=20, random_state=71)
    model.fit(tr_x, tr_y)

    # Predicted probability (column 1 of predict_proba) for the held-out rows.
    va_pred = model.predict_proba(va_x)[:, 1]

    # Log loss uses the probabilities; accuracy uses them thresholded at 0.5.
    scores_logloss.append(log_loss(va_y, va_pred))
    scores_accuracy.append(accuracy_score(va_y, va_pred > 0.5))

# Average each metric over the four folds.
logloss = np.mean(scores_logloss)
accuracy = np.mean(scores_accuracy)
print(f'logloss: {logloss:.4f}, accuracy: {accuracy:.4f}')

Tune the Model

The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)

In [None]:
import itertools

# Search grid: every (max_depth, min_child_weight) pair is evaluated.
param_space = {
    'max_depth': [3, 5, 7],
    'min_child_weight': [1.0, 2.0, 4.0],
}
param_combinations = itertools.product(
    param_space['max_depth'], param_space['min_child_weight']
)

# params[i] is the combination whose mean validation log loss is scores[i].
params, scores = [], []

for max_depth, min_child_weight in param_combinations:
    # 4-fold cross validation; the fixed seed gives every combination the
    # same splits.
    kf = KFold(n_splits=4, shuffle=True, random_state=42)
    score_folds = []
    for tr_idx, va_idx in kf.split(train_x):
        tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
        va_x, va_y = train_x.iloc[va_idx], train_y.iloc[va_idx]

        # Fit a model with the candidate hyperparameters.
        model = XGBClassifier(
            n_estimators=20,
            random_state=71,
            max_depth=max_depth,
            min_child_weight=min_child_weight,
        )
        model.fit(tr_x, tr_y)

        # Log loss of the predicted probabilities on the held-out rows.
        va_pred = model.predict_proba(va_x)[:, 1]
        score_folds.append(log_loss(va_y, va_pred))

    # Record the combination together with its mean log loss over the folds.
    params.append((max_depth, min_child_weight))
    scores.append(np.mean(score_folds))

# Log loss is better when smaller: argsort orders the scores ascending, so
# position 0 is the index of the combination with the lowest mean log loss.
best_idx = np.argsort(scores)[0]
best_param = params[best_idx]
print(f'max_depth: {best_param[0]}, min_child_weight: {best_param[1]}')

BEST: max_depth: 5, min_child_weight: 1.0

Setup to Ensemble

Let's run logistic regression now so that we can ensemble

In [None]:
# Training features as used by the XGBoost model, before one-hot encoding.
train_x.head()

In [None]:
# Test features as used by the XGBoost model, before one-hot encoding.
test_x.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Start the linear-model features from independent copies of the engineered
# frames, so nothing done to train_x2 / test_x2 can leak back into the
# train_x / test_x frames used by the XGBoost model.
train_x2 = train_x.copy()
test_x2 = test_x.copy()

# Columns to one-hot encode.
cat_cols = ['Sex', 'Embarked', 'Pclass']

# Ask for a dense array. Newer scikit-learn releases call the keyword
# `sparse_output`; older ones only know `sparse`. Try the new spelling first
# and fall back when the constructor rejects it.
try:
    ohe = OneHotEncoder(categories='auto', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(categories='auto', sparse=False)
ohe.fit(train_x2[cat_cols].fillna('NA'))

# Names for the dummy columns: '<column>_<category>', in the same order as the
# encoder's output columns.
ohe_columns = [
    f'{c}_{v}'
    for c, categories in zip(cat_cols, ohe.categories_)
    for v in categories
]

In [None]:
# Inspect the generated one-hot column names.
ohe_columns

In [None]:
# Create DataFrames for the one-hot encoded columns.
# `ohe.transform` returns a plain array, so the row index must be passed
# explicitly. Without it the frames get a fresh 0..n-1 index, and the later
# `pd.concat(..., axis=1)` (which aligns on the index) would misalign rows and
# introduce NaN rows whenever the source frame's index has gaps, e.g. after
# drop_duplicates removed rows from the training data.
ohe_train_x2 = pd.DataFrame(
    ohe.transform(train_x2[cat_cols].fillna('NA')),
    columns=ohe_columns,
    index=train_x2.index,
)
ohe_test_x2 = pd.DataFrame(
    ohe.transform(test_x2[cat_cols].fillna('NA')),
    columns=ohe_columns,
    index=test_x2.index,
)

In [None]:
# Preview the one-hot encoded training columns.
ohe_train_x2.head()

In [None]:
# Replace the original categorical columns by their one-hot encoded versions.
train_x2 = train_x2.drop(columns=cat_cols)
test_x2 = test_x2.drop(columns=cat_cols)
train_x2 = pd.concat([train_x2, ohe_train_x2], axis=1)
test_x2 = pd.concat([test_x2, ohe_test_x2], axis=1)

# Replace missing values by the mean of the corresponding training column.
# The derived columns are included: farePerFamily is computed from Fare and
# therefore inherits its missing values, which the logistic regression cannot
# handle.
# The filled column is assigned back explicitly; `df[col].fillna(...,
# inplace=True)` acts on a temporary object and is not guaranteed to modify
# the frame (it is a no-op under pandas Copy-on-Write).
num_cols = ['Age', 'SibSp', 'Parch', 'Fare', 'familyMembers', 'farePerFamily']
for col in num_cols:
    train_mean = train_x2[col].mean()
    train_x2[col] = train_x2[col].fillna(train_mean)
    test_x2[col] = test_x2[col].fillna(train_mean)

# Log-transform the fare. log1p computes log(1 + x), which stays finite
# (equal to 0) when the fare is 0, whereas a plain log would diverge to -inf.
train_x2['Fare'] = np.log1p(train_x2['Fare'])
test_x2['Fare'] = np.log1p(test_x2['Fare'])

Ensemble XGB and Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# XGBoost model on the label-encoded features.
model_xgb = XGBClassifier(n_estimators=20, random_state=71)
model_xgb.fit(train_x, train_y)
pred_xgb = model_xgb.predict_proba(test_x)[:, 1]

# Logistic regression model on the one-hot encoded, imputed features.
model_lr = LogisticRegression(solver='lbfgs', max_iter=300)
model_lr.fit(train_x2, train_y)
pred_lr = model_lr.predict_proba(test_x2)[:, 1]

# Weighted average of the two probability vectors (80% XGBoost, 20% logistic
# regression), thresholded at 0.5.
# The labels must be derived from pred_ens: thresholding `pred` (the output of
# the earlier stand-alone XGBoost cell) would ignore the ensemble entirely.
pred_ens = pred_xgb * 0.8 + pred_lr * 0.2
pred_label_ens = np.where(pred_ens > 0.5, 1, 0)

In [None]:
# Write the ensemble submission: PassengerId from the untouched test frame
# plus the ensemble's predicted label.
submission_ens = test[['PassengerId']].assign(Survived=pred_label_ens)
submission_ens.to_csv('submission_ens.csv', index=False)