In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
%matplotlib inline
sns.set() # enable seaborn style

In [2]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [3]:
# Custom Transform Class for Imputation
from sklearn.base import BaseEstimator, TransformerMixin
class MyTransformer(TransformerMixin, BaseEstimator):
    '''Impute missing ``Age`` values with a group mean learned in fit().

    Groups are keyed by the ``(Pclass, gender_male)`` columns. Inheriting
    from BaseEstimator supplies get_params()/set_params(), so the object can
    be cloned and used like any other scikit-learn pipeline step.
    '''

    def fit(self, X, y=None):
        '''
        Compute and save the mean age per (Pclass, gender_male) on the
        training data.

        Parameters
        ----------
        X : DataFrame with 'Pclass', 'gender_male' and 'Age' columns.
        y : ignored; accepted (and optional) for scikit-learn API
            compatibility.

        Returns
        -------
        self
        '''
        self.mean_age_by_pclass_sex = X.groupby(['Pclass', 'gender_male'])['Age'].mean()

        # fallback for groups that are absent from the training data, or
        # whose ages are all missing (their group mean is NaN)
        self.overall_mean_age = X['Age'].mean()

        # return an instance of MyTransformer
        return self

    def transform(self, X):
        '''
        Return a copy of X in which missing 'Age' values are replaced.

        transform() is applied to both train and test data. Each missing
        age is looked up by the row's (Pclass, gender_male) pair in the
        means saved by fit(); if that pair has no usable mean, the overall
        training mean age is used instead of raising KeyError or leaving NaN.

        Parameters
        ----------
        X : DataFrame with 'Pclass', 'gender_male' and 'Age' columns.

        Returns
        -------
        DataFrame : a copy of X; the input frame is not modified.
        '''
        missing_age_idx = X['Age'].isnull()

        # always work on a copy so the caller's frame is left untouched
        X = X.copy()
        if not missing_age_idx.any():
            return X

        # one (pclass, sex) tuple per row whose age is missing
        group_keys = X.loc[missing_age_idx, ['Pclass', 'gender_male']]
        index_tuples = list(group_keys.itertuples(index=False, name=None))

        # plain dict lookup: {(pclass, sex): mean age}; NaN means are dropped
        # so that those groups fall through to the overall mean
        group_means = self.mean_age_by_pclass_sex.dropna().to_dict()
        imputed_age_values = [group_means.get(key, self.overall_mean_age)
                              for key in index_tuples]

        # replace the missing values with the imputed age values
        X.loc[missing_age_idx, 'Age'] = imputed_age_values
        return X

In [4]:
# load the complete labeled dataset
all_data = pd.read_csv('./data/train.csv')

# discard string columns with too many distinct values or too many nulls
drop_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']
all_data = all_data.drop(columns=drop_cols)

# keep only the rows whose Embarked value is present
rows_to_keep = all_data['Embarked'].notnull()
all_data = all_data.loc[rows_to_keep]

# one-hot encode Sex and Embarked (first level of each is dropped),
# append the indicator columns, then remove the original string columns
gender = pd.get_dummies(all_data['Sex'], prefix='gender', drop_first=True)
port = pd.get_dummies(all_data['Embarked'], prefix='port', drop_first=True)
all_data = (
    pd.concat([all_data, gender, port], axis=1)
    .drop(columns=['Embarked', 'Sex'])
)

# feature extraction: no siblings/spouses aboard
all_data['sibsp_zero'] = all_data['SibSp'].eq(0)

# feature extraction: no parents/children aboard
all_data['parch_zero'] = all_data['Parch'].eq(0)

# feature extraction: travelling alone (both of the flags above are set)
all_data['alone'] = all_data['sibsp_zero'] & all_data['parch_zero']

# feature extraction: quantize fare
def quantize_fare(fare, low_cutoff=45, high_cutoff=80):
    '''Map a fare onto one of three ordinal buckets.

    Parameters
    ----------
    fare : number
        The fare to bucket.
    low_cutoff : number, default 45
        Fares strictly below this value go to bucket 0.
    high_cutoff : number, default 80
        Fares from ``low_cutoff`` up to (but excluding) this value go to
        bucket 1; anything else goes to bucket 2.

    Returns
    -------
    int : 0, 1 or 2.

    Note: a NaN fare fails both ``<`` comparisons and therefore lands in
    bucket 2; impute missing fares beforehand if that is not wanted.
    '''
    if fare < low_cutoff:
        return 0
    if fare < high_cutoff:
        return 1
    return 2
    
# bucket the fare, then store the boolean indicator features as integers
all_data['fare_quantized'] = all_data['Fare'].apply(quantize_fare)
all_data = all_data.astype({
    'sibsp_zero': 'int',
    'parch_zero': 'int',
    'alone': 'int',
})

# split into the feature matrix X and the target vector y
y = all_data['Survived']
X = all_data.drop(columns='Survived')

# shuffled, stratified k-fold splitter with a fixed seed for repeatability
k_folds = 10
random_seed = 5
crossvalidation = StratifiedKFold(
    n_splits=k_folds,
    shuffle=True,
    random_state=random_seed,
)

# preprocessing objects used by the modelling cells
standard_scaler = StandardScaler()
my_transformer = MyTransformer()

In [5]:
# summary statistics of the 0/1 target; the mean is the share of positive labels
y.describe()

count    889.000000
mean       0.382452
std        0.486260
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        1.000000
Name: Survived, dtype: float64

In [6]:
# column dtypes and non-null counts; Age is the only feature with missing values
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 12 columns):
Pclass            889 non-null int64
Age               712 non-null float64
SibSp             889 non-null int64
Parch             889 non-null int64
Fare              889 non-null float64
gender_male       889 non-null uint8
port_Q            889 non-null uint8
port_S            889 non-null uint8
sibsp_zero        889 non-null int64
parch_zero        889 non-null int64
alone             889 non-null int64
fare_quantized    889 non-null int64
dtypes: float64(2), int64(7), uint8(3)
memory usage: 72.1 KB


In [22]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# gradient-boosted tree classifier exposed through the scikit-learn wrapper
xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,
    random_state=5,
)

# age imputation followed by the classifier, evaluated as a single estimator
classifier = make_pipeline(my_transformer, xgb_clf)

scores = cross_val_score(
    classifier,
    X,
    y,
    scoring='accuracy',
    cv=crossvalidation,
    n_jobs=7,
)

# accuracy of each fold, followed by the mean over all folds
print(scores.round(3))
print(round(scores.mean(), 3))

[0.775 0.854 0.809 0.843 0.854 0.843 0.831 0.787 0.831 0.841]
0.827


In [29]:
# alternative: cross-validate with XGBoost's native API instead of the
# scikit-learn wrapper.
# NOTE: X is passed in as-is here (no MyTransformer step), so missing Age
# values reach XGBoost unimputed -- presumably handled as "missing" by
# DMatrix; confirm against the XGBoost docs.
data = xgb.DMatrix(data=X, label=y)

# Create the parameter dictionary: params
# "n_estimators" is an argument of the scikit-learn wrapper only; the native
# API does not use it, so it is left out. The number of boosting rounds is
# set by num_boost_round below.
params = {"objective": "binary:logistic", "random_state": 5}

# Perform cross-validation: cv_results
# NOTE(review): unlike the StratifiedKFold used for the pipeline, the
# `stratified` option is not set here, and only 10 boosting rounds are run,
# so these numbers are not directly comparable with the 100-estimator
# pipeline.
cv_results = xgb.cv(dtrain=data, params=params, nfold=10, num_boost_round=10,
                    metrics="error", as_pandas=True, seed=5)

In [30]:
# one row per boosting round: mean/std of the train and test error metric
print(cv_results)

   test-error-mean  test-error-std  train-error-mean  train-error-std
0         0.191292        0.039013          0.138734         0.004732
1         0.193539        0.048984          0.130860         0.005836
2         0.198034        0.043321          0.125235         0.005260
3         0.196935        0.044960          0.120736         0.006025
4         0.195799        0.039147          0.118736         0.005803
5         0.196935        0.046342          0.116737         0.005534
6         0.187908        0.041162          0.115487         0.006174
7         0.187908        0.037971          0.113862         0.005154
8         0.185648        0.035909          0.112361         0.005490
9         0.186772        0.035530          0.109612         0.005544
