In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import fetch_openml
import ssl
import certifi

import warnings

warnings.filterwarnings('ignore')

ssl._create_default_https_context = ssl._create_unverified_context
dataset = fetch_openml(data_id=40945, as_frame=True)

df = dataset.frame

In [32]:
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,num_in_party
0,1.0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO",Large
1,1.0,1,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON",Small
2,1.0,0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",Small
3,1.0,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON",Small
4,1.0,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",Small


In [2]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

def ohe(df, col):
    le = LabelEncoder()
    a = le.fit_transform(df[col]).reshape(-1, 1)
    columns = [ f'{col}_{name}' for name in le.classes_ ]
    encoder = OneHotEncoder(sparse=False, categories='auto')
    result = pd.DataFrame(encoder.fit_transform(a), columns=columns)
    return result

In [3]:
class NumInPartyTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        super().__init__()
        
    def fit(self, X, Y=None):
        return self
    
    def transform(self, X):
        X['num_in_party'] = X['parch'] + X['sibsp']
        X['num_in_party'] = np.where(X['num_in_party'].isin([1]), 'Alone', 
                             np.where(X['num_in_party'].isin([2, 3, 4]), 'Small',
                             np.where(X['num_in_party'].isin([5, 6, 7]), 'Medium', 'Large')))
        result = ohe(X, 'num_in_party')
        return pd.concat([X, result], axis=1).drop(['parch', 'sibsp', 'num_in_party'], axis=1)

In [4]:
class AgeTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, buckets=10):
        super().__init__()
        self.__buckets = buckets
        
    def fit(self, X, Y=None):
        return self
    
    def transform(self, X):
        X['age'] = X.groupby(['sex', 'pclass'])['age'].apply(lambda val : val.fillna(val.mean()))

        age_labels = [f'age_{x}' for x in range(self.__buckets) ]
        X['age_bucket'] = pd.qcut(X['age'], self.__buckets, labels=age_labels)
        age_buckets = ohe(X, 'age_bucket')
        return pd.concat([X, age_buckets], axis=1).drop(['age', 'age_bucket'],axis=1)
        

In [5]:
class CabinTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        super().__init__()
        
    def fit(self, X, Y=None):
        return self
    
    def transform(self, X):
        X['deck'] = X['cabin'].apply(lambda cabin : cabin[0] if pd.notnull(cabin) else 'X')
        deck_encoder = LabelEncoder()
        X['deck'] = deck_encoder.fit_transform(X['deck'])
        return X.drop(['cabin'], axis=1)


In [6]:
class ColumnDropper(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns):
        self.__columns = columns
        
    def fit(self, X, Y=None):
        return self
    
    def transform(self, X):
        return X.drop(self.__columns, axis=1)

In [7]:
class FareTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, buckets=10):
        super().__init__()
        self.__buckets = buckets
        
    def fit(self, X, Y=None):
        return self
    
    def transform(self, X):
        X['fare'] = X['fare'].fillna(0)

        fare_labels = [f'fare_{i}' for i in range(self.__buckets)]
        X['fare_bucket'] = pd.qcut(X['fare'], self.__buckets, labels=fare_labels)
        fare_buckets = ohe(X, 'fare_bucket')
        return pd.concat([X, fare_buckets], axis=1).drop(['fare', 'fare_bucket'],axis=1)

In [8]:
class EmbarkedTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        super().__init__()
        
    def fit(self, X, Y=None):
        return self
    
    def transform(self, X):
        X['embarked'] = X['embarked'].fillna('S').astype('object')
        embarked_encoder = LabelEncoder()
        X['embarked'] = embarked_encoder.fit_transform(X['embarked'])
        return X

In [12]:
class SexTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        super().__init__()
        
    def fit(self, X, Y=None):
        return self
    
    def transform(self, X):
        le = LabelEncoder()
        X['sex'] = le.fit_transform(X['sex'])
        return X

In [20]:
data_pipe = Pipeline([
    ('num_in_party', NumInPartyTransformer()),
    ('age', AgeTransformer(7)),
    ('cabin', CabinTransformer()),
    ('fare', FareTransformer()),
    ('embarked', EmbarkedTransformer()),
    ('sex', SexTransformer()),
    ('cleanup', ColumnDropper(columns=['name', 'ticket', 'boat', 'home.dest', 'body', 'survived'])),
])

In [21]:
df2 = data_pipe.fit_transform(df)

df2.head()

Unnamed: 0,pclass,sex,embarked,num_in_party_Alone,num_in_party_Large,num_in_party_Medium,num_in_party_Small,age_bucket_age_0,age_bucket_age_1,age_bucket_age_2,...,fare_bucket_fare_0,fare_bucket_fare_1,fare_bucket_fare_2,fare_bucket_fare_3,fare_bucket_fare_4,fare_bucket_fare_5,fare_bucket_fare_6,fare_bucket_fare_7,fare_bucket_fare_8,fare_bucket_fare_9
0,1.0,0,2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,1,2,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0,2,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,1,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0,2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [22]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 25 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   pclass               1309 non-null   float64
 1   sex                  1309 non-null   int64  
 2   embarked             1309 non-null   int64  
 3   num_in_party_Alone   1309 non-null   float64
 4   num_in_party_Large   1309 non-null   float64
 5   num_in_party_Medium  1309 non-null   float64
 6   num_in_party_Small   1309 non-null   float64
 7   age_bucket_age_0     1309 non-null   float64
 8   age_bucket_age_1     1309 non-null   float64
 9   age_bucket_age_2     1309 non-null   float64
 10  age_bucket_age_3     1309 non-null   float64
 11  age_bucket_age_4     1309 non-null   float64
 12  age_bucket_age_5     1309 non-null   float64
 13  age_bucket_age_6     1309 non-null   float64
 14  deck                 1309 non-null   int64  
 15  fare_bucket_fare_0   1309 non-null   f

<h2>Train and store the model</h2>

In [23]:
import pickle
from sklearn.linear_model import LogisticRegression

features = df2.copy()
labels = df['survived']

model = LogisticRegression()
model.fit(features, labels)

LogisticRegression()

In [24]:
with open('model.pic', 'wb') as out_file:
    pickle.dump(model, out_file)
    pickle.dump(data_pipe, out_file)

<h2>Dumb programmer starts here</h2>

In [25]:
with open('model.pic', 'rb') as in_file:
    model = pickle.load(in_file)
    data_pipe = pickle.load(in_file)

In [26]:
df2 = data_pipe.fit_transform(df)



In [30]:
result = model.predict(df2)

In [31]:
result

array(['1', '1', '1', ..., '0', '0', '0'], dtype=object)