In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Load the raw training set.
data = pd.read_csv("../../data/raw/train.csv")

# Drop the rows where 'Embarked' is missing. Selecting them by content
# instead of by hard-coded index labels keeps this step correct if the
# CSV is re-exported, re-ordered or extended.
data = data.dropna(subset=['Embarked'])

In [None]:
## generate features
# Interaction of the two categoricals: "<Sex>_<Embarked>" per row.
# .map(str) turns missing values into the literal string 'nan'.
interaction = data['Sex'].map(str) + '_' + data['Embarked'].map(str)

# assign() names the new column explicitly (no reliance on the unnamed Series
# ending up as column 0) and overwrites an existing 'Sex_Embarked' column, so
# re-running this cell does not create duplicate columns.
data = data.assign(Sex_Embarked=interaction)

In [None]:
## split and preprocess data
from sklearn.model_selection import train_test_split

# Predictors are every column except the target; the target is 'Survived'.
X = data.drop(columns='Survived')
y = data['Survived']

# 80/20 train/validation split with a fixed seed for reproducibility.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [None]:
## select features
# Columns removed from the candidate feature set. Un-comment an entry to
# exclude that column as well.
excluded_columns = [
    'Name',          # cat
    'Ticket',        # cat
    'Cabin',         # cat
    'PassengerId',
    # 'SibSp',
    # 'Parch',
    'Embarked',      # cat
    # 'Fare',
    # 'Age',
    # 'Pclass',
    # 'Sex',         # cat
    'Sex_Embarked',  # cat, new
]

# Index.drop keeps the remaining labels in their original order.
features = X_train.columns.drop(excluded_columns)

print(features)

In [None]:
## pipeline for preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Split the selected features by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical.
feature_dtypes = X_train[features].dtypes
is_categorical = feature_dtypes == "object"
is_numerical = (feature_dtypes == 'int64') | (feature_dtypes == 'float64')
categorical_cols = list(feature_dtypes.index[is_categorical])
numerical_cols = list(feature_dtypes.index[is_numerical])

# Numerical columns: impute with the SimpleImputer default strategy and
# append missing-value indicator columns.
numerical_transformer = SimpleImputer(add_indicator=True)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_imputer = SimpleImputer(strategy='most_frequent')
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
categorical_transformer = Pipeline(steps=[
    ('imputer', categorical_imputer),
    ('onehot', categorical_encoder),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [None]:
## define model 
from xgboost import XGBClassifier

# XGBoost models are trained outside the sklearn Pipeline: it was not worked
# out how to pass early_stopping_rounds and eval_set through a Pipeline.
usePipeline = False

model = XGBClassifier(
    n_estimators=1000,
    learning_rate=0.05,
    random_state=0,
)

In [None]:
## define pipeline and train model
# Only runs when the pipeline route is enabled via usePipeline.
if usePipeline:
    pipeline_steps = [
        ('preprocessor', preprocessor),
        ('model', model),
    ]
    pipe = Pipeline(steps=pipeline_steps)

    # Fit preprocessing + model on the training split, then predict the
    # validation split with the fitted pipeline.
    pipe.fit(X_train[features], y_train)
    preds = pipe.predict(X_valid[features])

In [None]:
## feature selector to find the important features
from sklearn.feature_selection import SelectKBest, f_classif

# Fit the preprocessor on the training features and transform them
# (this leaves `preprocessor` in a fitted state).
X_train_select = preprocessor.fit_transform(X_train[features])

# Keep the 5 columns that score best under f_classif.
selector = SelectKBest(score_func=f_classif, k=5)
X_new = selector.fit_transform(X_train_select, y_train)

# Map the reduced matrix back to the preprocessed column layout so the kept
# columns can be located by position; rows follow the training index.
restored = selector.inverse_transform(X_new)
selected_features = pd.DataFrame(data=restored, index=X_train.index)

selected_features.head()

In [None]:
## Preprocessing and model training outside of pipeline
if not usePipeline:
    import inspect

    # NOTE(review): this patience equals the n_estimators=1000 the model is
    # defined with in this notebook, so training never actually stops early;
    # the setting only makes XGBoost record the best iteration. Consider a
    # smaller value. Also note that the validation split is used both to pick
    # the best iteration and to report the score, which makes the reported
    # score optimistic.
    early_stopping_rounds = 1000

    # Fit the preprocessor on the training split only, then apply the fitted
    # transformation to the validation split.
    X_train_transformed = preprocessor.fit_transform(X_train[features])
    X_valid_transformed = preprocessor.transform(X_valid[features])

    fit_kwargs = {
        'eval_set': [(X_valid_transformed, y_valid)],
        'verbose': False,
    }
    # Older XGBoost releases take early_stopping_rounds as a fit() keyword;
    # newer ones only accept it as an estimator parameter. Support both so the
    # cell runs regardless of the installed version.
    if 'early_stopping_rounds' in inspect.signature(model.fit).parameters:
        fit_kwargs['early_stopping_rounds'] = early_stopping_rounds
    else:
        model.set_params(early_stopping_rounds=early_stopping_rounds)

    # Train model:
    model.fit(X_train_transformed, y_train, **fit_kwargs)
    preds = model.predict(X_valid_transformed)

    print("Best iteration: {}".format(model.best_iteration))
    # best_iteration is 0-based, so the number of boosting rounds is one more.
    # This replaces the `best_ntree_limit` attribute, which newer XGBoost
    # releases no longer provide.
    print("Best #estimators: {}".format(model.best_iteration + 1))

In [None]:
## evaluate model
from sklearn import metrics

# Share of validation rows whose predicted label matches the true label.
score = metrics.accuracy_score(y_true=y_valid, y_pred=preds)

print(f"Validation score: {score: .4f}")

In [None]:
# Wrap the predictions in a DataFrame (rebinding `preds`) so pandas can
# summarise their distribution.
preds = pd.DataFrame(data=preds)

preds.describe()