In [26]:
import pandas as pd

In [193]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [140]:
X_train = train.drop('Survived', axis=1)
y_train = train['Survived']

In [16]:
X_train.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

<h3>1.</h3>
<pre>Add columns:
    multiple_cabin: '0' if passenger has only 1 cabin, '1' if more than 1
    numeric_ticket: '0' if numeric, '1' otherwise
    norm_sibsp: logNormalized 'Fare'
Convert to str: 'Pclass'
Dummy columns: 'Pclass','Sex','Age','SibSp','Parch','Fare','multiple_cabin','numeric_ticket']
</pre>

<!-- columns to add
training['multiple_cabin'] = all_data.Cabin.apply(lambda x: 0 if pd.isna(x) else len(x.split(' ')))
training['numeric_ticket'] = training.Ticket.apply(lambda x: 1 if x.isnumeric() else 0)
training['name_title'] = training.Name.apply(lambda x: x.split(',')[1].split('.')[0].strip())-->

In [49]:
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.pipeline import TransformerMixin

In [186]:
class DummyEncoder(TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns
    def transform(self, X, y=None, **kwargs):
        return pd.get_dummies(X, columns=self.columns)
    def fit(self, X, y=None, **kwargs):
        return self
    
    
class CustomTransformer(BaseEstimator):
    def __init__(self):
        pass
    def fit(self, documents, y=None):
        return self
    def transform(self, X):
        x_dataset = X.copy() # to avoid changes to the original dataset
        x_dataset['multiple_cabin'] = x_dataset.Cabin.apply(lambda x: 0 if pd.isna(x) else len(x.split(' ')))
        x_dataset['numeric_ticket'] = x_dataset.Ticket.apply(lambda x: 1 if x.isnumeric() else 0)
        x_dataset['norm_sibsp'] = np.log(x_dataset.SibSp+1)
        x_dataset['Pclass'] = x_dataset.Pclass.astype(str)
        x_dataset = DummyEncoder().transform(x_dataset[['Pclass','Sex','Age',
                                       'SibSp','Parch','Fare',
                                       'multiple_cabin',
                                       'numeric_ticket',]])
        return x_dataset

<h3>2. Pre-processing steps</h3>
<pre>1) Impute columns: 'Age', 'Fare' (there are no NaN values in 'Fare' column in training.csv, however there in test.csv)
</pre>

In [48]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [188]:
pre_process = ColumnTransformer(remainder='passthrough', 
                                transformers=[('impute_fare', SimpleImputer(strategy='median'), ['Age', 'Fare'])])

<h3>Pipeline</h3>
<pre>Steps:
1) Transform columns
2) pre-processing
3) Train Model
</pre>

In [54]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression

In [189]:
modelPipeline = Pipeline(steps=[('transform_columns', CustomTransformer()),
                                ('pre_processing', pre_process),
                                ('log_reg', LogisticRegression(random_state=1))
                               ])

In [191]:
modelPipeline.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('transform_columns', CustomTransformer()),
                ('pre_processing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('impute_fare',
                                                  SimpleImputer(strategy='median'),
                                                  ['Age', 'Fare'])])),
                ('log_reg', LogisticRegression(random_state=1))])

In [192]:
modelPipeline.score(X_train, y_train)

0.8002244668911336

In [194]:
test = pd.read_csv('test.csv')

In [195]:
modelPipeline.predict(test)

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,