In [3]:
import pandas as pd

In [4]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [14]:
X = train.drop('Survived', axis=1)
y = train['Survived']

In [35]:
X.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

<h3>1.</h3>
<pre>Add columns:
    multiple_cabin: '0' if passenger has only 1 cabin, '1' if more than 1
    numeric_ticket: '0' if numeric, '1' otherwise
    name_title: ex. 'Mr', 'Miss' ...
    norm_sibsp: logNormalized 'Fare'
Convert to str: 'Pclass'
Dummy columns: 'Pclass','Sex','Age','SibSp','Parch','norm_fare', 'cabin_adv','cabin_multiple','numeric_ticket','name_title','train_test']
</pre>

<!-- columns to add
training['multiple_cabin'] = all_data.Cabin.apply(lambda x: 0 if pd.isna(x) else len(x.split(' ')))
training['numeric_ticket'] = training.Ticket.apply(lambda x: 1 if x.isnumeric() else 0)
training['name_title'] = training.Name.apply(lambda x: x.split(',')[1].split('.')[0].strip())-->

In [1]:
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.pipeline import TransformerMixin

In [13]:
class DummyEncoder(TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns
    def transform(self, X, y=None, **kwargs):
        return pd.get_dummies(X, columns=self.columns)
    def fit(self, X, y=None, **kwargs):
        return self
    
    
class CustomTransformer(BaseEstimator):
    def __init__(self):
        pass
    def fit(self, documents, y=None):
        return self
    def transform(self, X):
        x_dataset = X.copy() # to avoid changes to the original dataset
        x_dataset['multiple_cabin'] = x_dataset.Cabin.apply(lambda x: 0 if pd.isna(x) else len(x.split(' ')))
        x_dataset['numeric_ticket'] = x_dataset.Ticket.apply(lambda x: 1 if x.isnumeric() else 0)
        x_dataset['norm_sibsp'] = np.log(x_dataset.SibSp+1)
        x_dataset['Pclass'] = x_dataset.Pclass.astype(str)
        x_dataset = DummyEncoder().transform(x_dataset[['Pclass','Sex','Age',
                                       'SibSp','Parch','Fare',
                                       'multiple_cabin',
                                       'numeric_ticket',]])
        return x_dataset

<h3>2. Pre-processing steps</h3>
<pre>1) Impute columns: 'Age', 'Fare' (there are no NaN values in 'Fare' column in training.csv, however there in test.csv)
</pre>

In [6]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [7]:
pre_process = ColumnTransformer(remainder='passthrough', 
                                transformers=[('impute_fare', SimpleImputer(strategy='median'), ['Age', 'Fare'])])

<h3>Pipeline</h3>
<pre>Steps:
1) Transform columns
2) pre-processing
3) Train Model
</pre>

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression

In [10]:
best_params = {'penalty': 'l2', 'C': 1.0, 'solver': 'newton-cg'}

In [11]:
modelPipeline = Pipeline(steps=[('transform_columns', CustomTransformer()),
                                ('pre_processing', pre_process),
                                ('classifier', LogisticRegression(**best_params))
                               ])

In [15]:
modelPipeline.fit(X, y)

Pipeline(steps=[('transform_columns', CustomTransformer()),
                ('pre_processing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('impute_fare',
                                                  SimpleImputer(strategy='median'),
                                                  ['Age', 'Fare'])])),
                ('classifier', LogisticRegression(solver='newton-cg'))])

In [16]:
modelPipeline.score(X, y)

0.797979797979798

In [17]:
test = pd.read_csv('test.csv')

In [18]:
modelPipeline.predict(test)

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

<h3>Cross-Validation</h3>


In [62]:
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold

Split the data into 5 folds and then fit it into the Pipeline

In [65]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in kfold.split(X, y):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y[train_index], y[test_index]
    
    modelPipeline.fit(X_train, y_train)
    
    print(modelPipeline.score(X_test, y_test))

0.8156424581005587
0.7808988764044944
0.8314606741573034
0.7640449438202247
0.7921348314606742


#### hyper-parameter tuning

In [16]:
#split tha into training set (to fit the model), 
#validation set (to select the params of the model) 
#test set(to evaluate the performance of selected params)
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, random_state=0)
X_train, X_valid, y_train, y_valid = train_test_split(X_trainval, y_trainval, random_state=1)

solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalties = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
best_score = 0

for penalty in penalties:
    for solver in solvers:
        for C in c_values:
            logReg = LogisticRegression(penalty=penalty, 
                                        C=C,
                                        solver=solver, max_iter=4000)
            scores = cross_val_score(logReg, X_trainval, y_trainval, cv=5)
            score = np.mean(scores)
            
            if score > best_score:
                best_score = score
                best_parameters = {'penalty': penalty, 'C': C, 'solver': solver}

logReg = LogisticRegression(**best_parameters)
logReg.fit(X_trainval, y_trainval)
test_score = logReg.score(X_test, y_test)

print("Best score on validation set: {:.2f}".format(best_score))
print("Best parameters: ", best_parameters)
print("Test set score with best parameters: {:.2f}".format(test_score))

<pre>
Best score on validation set: 0.85
Best parameters:  {'penalty': 'l2', 'C': 1.0, 'solver': 'newton-cg'}
Test set score with best parameters: 0.78
</pre>