In [None]:
import logging
import pickle
from datetime import datetime
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, PowerTransformer, FunctionTransformer, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier, Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier

##### Load datasets

In [None]:
# Read the competition's train and test tables from the Kaggle input directory.
train_csv_path = '/kaggle/input/tabular-playground-series-apr-2021/train.csv'
test_csv_path = '/kaggle/input/tabular-playground-series-apr-2021/test.csv'
df_train, df_test = pd.read_csv(train_csv_path), pd.read_csv(test_csv_path)

In [None]:
# Preview the first four rows of the training table.
df_train.head(4)

In [None]:
# Summary statistics of the training table.
df_train.describe()

In [None]:
# Count missing values per column as (column, n_missing_train, n_missing_test) tuples.
# The zip is positional: it assumes train (minus 'Survived') and test list their
# columns in the same order -- TODO confirm.
list(zip(df_test.columns, df_train.drop('Survived', axis=1).isna().sum().values, df_test.isna().sum().values))

In [None]:
# Frequency of each 'Embarked' value in the training table.
df_train['Embarked'].value_counts()

In [None]:
# Column dtypes and non-null counts of the training table.
df_train.info()

In [None]:
# Keep the passenger identifiers of both tables aside; create_submission needs the test ids.
df_train_id, df_test_id = (frame['PassengerId'] for frame in (df_train, df_test))

In [None]:
def create_submission(y_pred, file_name, passenger_ids=None):
    """Write a two-column submission CSV with 'PassengerId' and 'Survived'.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels, one per passenger id and in the same order.
    file_name : str or path-like
        Destination CSV path. Missing parent directories are created.
    passenger_ids : array-like, optional
        Values for the 'PassengerId' column. Defaults to the notebook-level
        ``df_test_id`` so existing two-argument calls keep working.
    """
    if passenger_ids is None:
        passenger_ids = df_test_id

    target = Path(file_name)
    # to_csv does not create directories, so paths such as 'submissions/...'
    # would fail on a fresh working directory without this.
    target.parent.mkdir(parents=True, exist_ok=True)

    submission_file = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': y_pred})
    submission_file.to_csv(target, sep=',', index=False)

#### Separate features and label variable

In [None]:
# The target is kept as its own Series; the features are every other training column.
y = df_train['Survived']
X = df_train.drop(columns='Survived')

# Independent deep copy, so edits made to X_test never touch df_test.
X_test = df_test.copy(deep=True)

### Plot graphics

In [None]:
# Pairwise relationships between the features; the identifier column carries no signal.
sns.pairplot(X.drop(columns='PassengerId'))

#### Missing values

In [None]:
# Each cell is True where a value is missing, so gaps appear as contrasting marks per column.
# yticklabels must be the boolean False to hide the row labels: the string 'False' is not
# a flag and is interpreted as label data for the y axis instead.
sns.heatmap(X.drop('PassengerId', axis=1).isna(), cbar=False, cmap='viridis', yticklabels=False)

#### Pearson Correlation Heatmap

In [None]:
# Pairwise Pearson correlations between the numeric feature columns.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric_features = X.select_dtypes(include=numerics).astype(float)
corr = numeric_features.corr()

# Mask the upper triangle (diagonal included): it only mirrors the lower one.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Draw the annotated heatmap.
colormap = plt.cm.RdBu
plt.figure(figsize=(10, 12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(
    corr,
    mask=mask,
    annot=True,
    square=True,
    cmap=colormap,
    vmax=1.0,
    linewidths=0.1,
    linecolor='white',
)

### Split into train and validation

In [None]:
# Get validation data
X_train, X_validation, y_train, y_validation = train_test_split(X, y,
                                                                test_size=0.20, 
                                                                random_state=42)

X_train.shape, X_validation.shape, X_test.shape

In [None]:
y_validation.shape, y_validation.sum()

### Feature engineering

In [None]:
def substrings_in_string(big_string, substrings):
    """Return the first entry of ``substrings`` that ``big_string`` starts with.

    Candidates are tried in the order given. The literal ``'NA'`` is returned
    when none of them is a prefix of ``big_string``.
    """
    prefix_matches = (candidate for candidate in substrings if big_string.startswith(candidate))
    return next(prefix_matches, 'NA')

In [None]:
X_all = [X_train, X_validation, X_test, X]

# Imputation statistics come from the training split only and are reused for every frame.
median_age = X_train['Age'].median()
median_fare = X_train['Fare'].median()
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'NA']
# Boundaries of the age bins
bins_age = [-np.inf, 20, 40, 60, 80, np.inf]
# One label per age bin
labels_age = ['Age_1', 'Age_2', 'Age_3', 'Age_4', 'Age_5']


def _family_size_group(family_size):
    """Map a FamilySize value to its group label (0 / 1-3 / 4-6 / anything else)."""
    if family_size == 0:
        return 'f_single'
    if 1 <= family_size < 4:
        return 'f_usual'
    if 4 <= family_size < 7:
        return 'f_big'
    return 'f_large'


# NOTE: every frame is modified in place and the raw 'PassengerId', 'Ticket', 'Cabin' and
# 'Name' columns are dropped at the end, so this cell cannot be re-run without first
# re-running the cells that build X, X_test and the train/validation split.
#
# The chained-assignment warning is silenced only inside the `with` block; option_context
# restores the previous setting even if one of the steps raises.
with pd.option_context('mode.chained_assignment', None):
    for X_ in X_all:
        # --- Imputation ---
        # Assign the filled column back to the frame. Calling
        # `X_.loc[:, col].fillna(..., inplace=True)` fills a temporary Series and is not
        # guaranteed to reach X_ (it never does under pandas Copy-on-Write).
        X_['Age'] = X_['Age'].fillna(median_age)
        X_['Fare'] = X_['Fare'].fillna(median_fare)
        # Missing Embarked / Cabin become the explicit category 'NA'.
        X_['Embarked'] = X_['Embarked'].fillna('NA')
        X_['Cabin'] = X_['Cabin'].fillna('NA')

        # --- New features ---
        # Number of relatives travelling with the passenger.
        X_['FamilySize'] = X_['SibSp'] + X_['Parch']

        # Deck: the cabin_list entry the cabin code starts with (e.g. 'C123' -> 'C');
        # imputed cabins map to 'NA'.
        X_['Deck'] = X_['Cabin'].map(lambda cabin: substrings_in_string(str(cabin), cabin_list))

        # Interaction term between age and passenger class.
        X_['Age*Pclass'] = X_['Age'] * X_['Pclass']

        # Fare divided by the party size (relatives plus the passenger).
        X_['FarePerPerson'] = X_['Fare'] / (X_['FamilySize'] + 1)

        # Age bucket as a plain string label.
        X_['Age_name'] = pd.cut(X_['Age'], bins=bins_age, labels=labels_age).astype('object')

        # Lengths of the text before and after the first ', ' in Name.
        name_parts = X_['Name'].str.split(', ')
        X_['LastName_len'] = name_parts.str[0].str.len()
        X_['FirstName_len'] = name_parts.str[1].str.len()

        # Coarse family-size category.
        X_['Family_size_group'] = X_['FamilySize'].map(_family_size_group)

        # Drop the raw columns that are no longer needed.
        X_.drop(['PassengerId', 'Ticket', 'Cabin', 'Name'], axis=1, inplace=True)

In [None]:
imput_median_columns = ['Age', 'Fare']
imput_median_indices = np.array([(column in imput_median_columns) for column in X_train.columns], dtype = bool)

In [None]:
X_train.head(4)

### Select columns for transformation

In [None]:
binary_data_columns = ['SibSp', 'Parch', 'FamilySize']
binary_data_indices = np.array([(column in binary_data_columns) for column in X_train.columns], dtype = bool)

In [None]:
print(binary_data_columns)
print(binary_data_indices)

In [None]:
# Columns for One-Hot-Encoder
categorical_data_columns = ['Pclass', 'Sex','Embarked', 'Deck',  'Age_name', 'Family_size_group'] 
categorical_data_indices = np.array([(column in categorical_data_columns) for column in X_train.columns], dtype = bool)

In [None]:
print(categorical_data_columns)
print(categorical_data_indices)

In [None]:
# Columns for Standardization
numeric_data_columns = ['Age', 'LastName_len', 'FirstName_len']
numeric_data_indices = np.array([(column in numeric_data_columns) for column in X_train.columns], dtype = bool)

In [None]:
print(numeric_data_columns)
print(numeric_data_indices)

In [None]:
# Apply a power transform featurewise to make data more Gaussian-like
numeric_data_columns_log = ['Fare', 'Age*Pclass', 'FarePerPerson']
numeric_data_indices_log = np.array([(column in numeric_data_columns_log) for column in X_train.columns], dtype = bool)

In [None]:
print(numeric_data_columns_log)
print(numeric_data_indices_log)

### Models

##### Multiple feature extraction

In [None]:
# Every branch selects its columns positionally with a boolean mask, so each frame
# passed through must have its columns in the order the masks were built for.

# binary: selected and passed through as-is
binary_branch = FunctionTransformer(lambda data: data.iloc[:, binary_data_indices])

# numeric: select, then standardize
numeric_branch = Pipeline(steps=[
    ('selecting', FunctionTransformer(lambda data: data.iloc[:, numeric_data_indices])),
    ('scaling', StandardScaler(with_mean=True)),
])

# numeric_log: select, then power-transform and standardize
numeric_log_branch = Pipeline(steps=[
    ('selecting', FunctionTransformer(lambda data: data.iloc[:, numeric_data_indices_log])),
    ('scaling', PowerTransformer(standardize=True)),
])

# categorical: select, then one-hot encode (categories unseen during fit are ignored)
categorical_branch = Pipeline(steps=[
    ('selecting', FunctionTransformer(lambda data: data.iloc[:, categorical_data_indices])),
    ('hot_encoding', OneHotEncoder(handle_unknown='ignore')),
])

combined_features = FeatureUnion(
    transformer_list=[
        ('binary_variables_processing', binary_branch),
        ('numeric_variables_processing', numeric_branch),
        ('numeric_variables_log_processing', numeric_log_branch),
        ('categorical_variables_processing', categorical_branch),
    ],
    n_jobs=-1,
)

#### DecisionTreeModel

In [None]:
tree_class = DecisionTreeClassifier(random_state=42)

In [None]:
estimator_tree = Pipeline(steps = [
    ('feature_processing', combined_features),
    ('classifier', tree_class)
])

In [None]:
%%time

grid_param = {
    'classifier__max_depth': np.arange(2, 5, dtype=int),
    'classifier__min_samples_leaf':  np.arange(2, 5, dtype=int)
}

gsearch_tree = GridSearchCV(estimator = estimator_tree,
                              param_grid = grid_param,
                              scoring='accuracy',
                              return_train_score=True,
                              n_jobs=-1,
                              verbose=2,
                              cv=5)

gsearch_tree.fit(X_train, y_train)

In [None]:
gsearch_tree.best_params_, gsearch_tree.best_score_

In [None]:
# Validation
y_pred_train_tree = gsearch_tree.best_estimator_.predict(X_train)
y_pred_val_tree = gsearch_tree.best_estimator_.predict(X_validation)
accuracy_val = accuracy_score(y_validation, y_pred_val_tree)
print('accuracy_val: {}'.format(accuracy_val))

In [None]:
# Save submission file
time_now = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
y_pred_test_tree = gsearch_tree.best_estimator_.predict(X_test)
create_submission(y_pred_test_tree, 'submissions/tree_gridSearch_submission_{}.csv'.format(time_now))

#### XGB Classifier

In [None]:
xgb_class = xgb.XGBClassifier(objective='binary:logistic',
                              eval_metric='logloss',
                              use_label_encoder=False,
                              nthread=-1,
                              seed=42)

In [None]:
estimator_xgb = Pipeline(steps = [
    ('feature_processing', combined_features),
    ('classifier', xgb_class)
])

In [None]:
%%time
grid_param = {
    'classifier__learning_rate': [0.01, 0.1, 0.2, 0.5],
    'classifier__n_estimators': range(80, 301, 20),
    'classifier__max_depth': range(3,8),
    'classifier__min_child_weight': range(2,6),
    'classifier__learning_rate': np.arange(0.01, 0.505, 0.05),
    'classifier__gamma': np.arange(0.1, 0.805, 0.1),
    'classifier__subsample': np.arange(0.7, 1.0, 0.1),
    'classifier__colsample_bytree': np.arange(0.6, 1.0, 0.1)
}

rsearch_xgb = RandomizedSearchCV(estimator=estimator_xgb,
                                 n_iter=50,
                                 param_distributions=grid_param,
                                 scoring='accuracy',
                                 return_train_score=True,
                                 n_jobs=-1,
                                 verbose=2,
                                 random_state=42,
                                 cv=4)
rsearch_xgb.fit(X_train, y_train)

In [None]:
rsearch_xgb.best_params_, rsearch_xgb.best_score_

In [None]:
# Validation
y_pred_train_xgb = rsearch_xgb.best_estimator_.predict(X_train)
y_pred_val_xgb = rsearch_xgb.best_estimator_.predict(X_validation)
accuracy_val = accuracy_score(y_validation, y_pred_val_xgb)
print('accuracy_val: {}'.format(accuracy_val))

In [None]:
# Save submission file
time_now = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
y_pred_test_xgb = rsearch_xgb.best_estimator_.predict(X_test)
create_submission(y_pred_test_xgb, 'submissions/xgb_randomSearch_submission_{}.csv'.format(time_now))

#### Logistic regression classifier

In [None]:
logreg = LogisticRegression(random_state=42,
                            max_iter=200,
                            n_jobs=-1)

In [None]:
estimator_logreg = Pipeline(steps = [
    ('feature_processing', combined_features),
    ('classifier', logreg)
])

In [None]:
%%time
grid_param = {
    'classifier__C': np.logspace(-5, 8, 15)
}

gsearch_logreg = GridSearchCV(estimator = estimator_logreg,
                              param_grid = grid_param,
                              scoring='accuracy',
                              return_train_score=True,
                              n_jobs=-1,
                              verbose=2,
                              cv=5)

gsearch_logreg.fit(X_train, y_train)

In [None]:
gsearch_logreg.best_params_, gsearch_logreg.best_score_

In [None]:
# Validation
y_pred_train_lr = gsearch_logreg.best_estimator_.predict(X_train)
y_pred_val_lr = gsearch_logreg.best_estimator_.predict(X_validation)
accuracy_val = accuracy_score(y_validation, y_pred_val_lr)
print('accuracy_val: {}'.format(accuracy_val))

In [None]:
gsearch_logreg.cv_results_['params'][0], gsearch_logreg.cv_results_['mean_test_score'][0]

In [None]:
# Save submission file
time_now = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
y_pred_test_lr = gsearch_logreg.best_estimator_.predict(X_test)
create_submission(y_pred_test_lr, 'submissions/logreg_gs_submission_{}.csv'.format(time_now))

#### SVM

In [None]:
svm = SGDClassifier(random_state=42,
                    n_jobs=-1,
                    early_stopping=True,
                    validation_fraction=0.2,
                    n_iter_no_change=10,
                    verbose=2)

In [None]:
estimator_svm = Pipeline(steps = [
    ('feature_processing', combined_features),
    ('classifier', svm)
])

In [None]:
%%time
grid_param = {
    'classifier__alpha': np.logspace(-4, 1, 10),
    'classifier__loss': ['hinge', 'modified_huber', 'squared_hinge', 'perceptron'] 
}

gsearch_svm = GridSearchCV(estimator = estimator_svm,
                           param_grid = grid_param,
                           scoring='accuracy',
                           return_train_score=True,
                           n_jobs=-1,
                           verbose=2,
                           cv=4)

gsearch_svm.fit(X_train, y_train)

In [None]:
gsearch_svm.best_params_, gsearch_svm.best_score_

In [None]:
# Validation
y_pred_train_svm = gsearch_svm.best_estimator_.predict(X_train)
y_pred_val_svm = gsearch_svm.best_estimator_.predict(X_validation)
accuracy_val = accuracy_score(y_validation, y_pred_val_svm)

print('accuracy_val: {}'.format(accuracy_val))

In [None]:
# Save submission file
time_now = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
y_pred_test_svm = gsearch_svm.best_estimator_.predict(X_test)
create_submission(y_pred_test_svm, 'submissions/svm_gs_submission_{}.csv'.format(time_now))

In [None]:
# Save the model to disk
# Only the fitted 'classifier' step is stored, not the 'feature_processing' step,
# so the saved object expects inputs that already went through that step.
model_path = Path('model/svm_grid_search_{}.sav'.format(time_now))
# open() does not create directories; make sure 'model/' exists first.
model_path.parent.mkdir(parents=True, exist_ok=True)
# The context manager closes (and flushes) the file even if dump() raises;
# the bare open(...) passed straight to pickle.dump was never closed.
with model_path.open('wb') as model_file:
    pickle.dump(gsearch_svm.best_estimator_.named_steps['classifier'], model_file)

### Plot feature importances from XGBoost

In [None]:
# 'weight' importance scores from the booster of the tuned XGBoost model.
xgb_booster = rsearch_xgb.best_estimator_.named_steps['classifier'].get_booster()
feature_important = xgb_booster.get_score(importance_type='weight')
keys, values = list(feature_important), list(feature_important.values())

# Feature names as the booster knows them, next to the input column names.
print(xgb_booster.feature_names)

print(X_train.columns)

In [None]:
# Bar chart of the importance scores, highest first.
feat_imp = pd.Series(feature_important).sort_values(ascending=False)
importance_ax = feat_imp.plot.bar(title='Feature Importances')
importance_ax.set_ylabel('Feature Importance Score')

# Ensemble of classifiers

In [None]:
%%time

# Define the list classifiers
classifiers=[('dt', gsearch_tree.best_estimator_), 
             ('xgb', rsearch_xgb.best_estimator_), 
             ('lr', gsearch_logreg.best_estimator_), 
             ('svm', gsearch_svm.best_estimator_),]

# Instantiate a VotingClassifier vc
vc = VotingClassifier(estimators=classifiers, voting='hard', n_jobs=-1) 

# Fit vc to the training set
vc.fit(X_train, y_train)

In [None]:
# Validation
y_pred_train_vc = vc.predict(X_train)
y_pred_val_vc = vc.predict(X_validation)
accuracy_val = accuracy_score(y_validation, y_pred_val_vc)

print('accuracy_val: {}'.format(accuracy_val))

In [None]:
# Log info
# `logger` is not defined in any visible cell, so the calls below raised NameError on
# a fresh Restart & Run All. Create it here.
logging.basicConfig(level=logging.INFO)  # does nothing if the root logger already has handlers
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)  # let INFO records through whatever the root level is

logger.info('VotingClassifier')
# The accuracy is a plain result, not a warning condition, so it is logged at INFO.
logger.info('accuracy_val: {}'.format(accuracy_val))

In [None]:
# Save submission file
time_now = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
submission_path = 'submissions/vc_submission_{}.csv'.format(time_now)
y_pred_test_vc = vc.predict(X_test)
create_submission(y_pred_test_vc, submission_path)