# Importing Data

In [1]:
import numpy as np
import pandas as pd

data = pd.read_csv('./train.csv')
data_test = pd.read_csv('./test.csv')

y = data.Survived
data.drop(['Survived'], axis = 1, inplace = True)

# Simple Data Cleaning and Organization

In [2]:
categorical_cols = [cname for cname in data.columns if
                    data[cname].dtype == object and
                    data [cname].nunique() < 10]

numerical_cols = [cname for cname in data.columns if
                  data[cname].dtype in ['int64', 'float64']]

In [3]:
my_cols = categorical_cols + numerical_cols

X_train = data[my_cols].copy()
X_test = data_test[my_cols].copy()

# Preprocessing and Pipelines 

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

numerical_transformer = SimpleImputer(strategy = 'median', add_indicator = True)

categorical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown = 'ignore'))
])

preprocessor = ColumnTransformer(transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

proc_X_train = preprocessor.fit_transform(X_train)

# Modeling

In [5]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier, XGBRFClassifier

models = []
models.append(('LR', LogisticRegression(solver = 'liblinear', random_state = 0)))
models.append(('SVM', SVC(random_state = 0)))
models.append(('RFC', RandomForestClassifier(random_state = 0)))
models.append(('XGB', XGBClassifier(random_state = 0)))
models.append(('XGBRF', XGBRFClassifier(random_state = 0)))

In [6]:
for name, model in models:
    kfold = model_selection.KFold(n_splits = 10)
    cv_results = model_selection.cross_val_score(model, proc_X_train, y, cv = kfold, scoring = 'accuracy')
    print(name, ': ', cv_results.mean())

LR :  0.7912484394506867
SVM :  0.6465543071161048
RFC :  0.8137578027465666
XGB :  0.7924594257178528
XGBRF :  0.8204868913857679


# XGBoost Random Forest Classifier

In [7]:
from sklearn.model_selection import GridSearchCV

xgbrfc = XGBRFClassifier(random_state = 0)
params = [{'n_estimators': [140, 150, 160, 170],
          'subsample': [0.9, 0.8, 0.7, 0.6],
          'colsample_bynode': [0.5, 0.75, 0.1]}]

gs_xgbrfc = GridSearchCV(xgbrfc, 
                         param_grid = params, 
                         scoring = 'accuracy', 
                         cv = 5, 
                         verbose  = 1)

gs_xgbrfc.fit(proc_X_train, y)
print('Best Params: ', gs_xgbrfc.best_params_)
print('Best Score: ', gs_xgbrfc.best_score_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Params:  {'colsample_bynode': 0.75, 'n_estimators': 140, 'subsample': 0.8}
Best Score:  0.8226853304877283


In [8]:
xgbrfc = XGBRFClassifier(n_estimators = 140, 
                         subsample = 0.8, 
                         colsample_bynode = 0.75, 
                         random_state = 0)

# Prediction

In [9]:
classifier = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('model', xgbrfc)
])

classifier.fit(X_train, y)

predictions = classifier.predict(X_test)

In [10]:
output = pd.DataFrame({'PassengerId' : X_test.PassengerId,
                      'Survived' : predictions})
output.to_csv('submission.csv', index = False)