In [1]:
import os
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score

In [2]:
# Directory containing the Kaggle Titanic CSV files.
# Set the KAGGLE_TITANIC_DATA_DIR environment variable to point the notebook
# at another location; when it is unset the original hard-coded path is used.
KAGGLE_TITANIC_DATA_DIR = os.environ.get(
    'KAGGLE_TITANIC_DATA_DIR',
    'D:\\Kaggle\\Titanic\\data',
)

# Raw training data, loaded from train.csv in the data directory.
titanic_train_df = pd.read_csv(os.path.join(KAGGLE_TITANIC_DATA_DIR, 'train.csv'))

In [3]:
# Preview the first rows of the raw training frame.
titanic_train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Summarise column dtypes and non-null counts to see which columns have missing values.
titanic_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [5]:
# Label vector: the 'Survived' column as a NumPy array.
# Series.get_values() was deprecated in pandas 0.25 and removed in 1.0;
# .values returns the same ndarray on every pandas version.
y_train = titanic_train_df['Survived'].values

In [6]:
def extract_title(name: str) -> str:
    """Return the text between the first comma and the next period in ``name``.

    For a name such as ``"Braund, Mr. Owen Harris"`` this yields ``"Mr"``.
    Surrounding whitespace is stripped from the result.

    Raises:
        ValueError: if ``name`` has no comma, or no period at or after the comma.
    """
    comma_at = name.index(',')
    # Start the period search at the comma so dots earlier in the string are ignored.
    period_at = name.index('.', comma_at)
    between = name[comma_at + 1:period_at]
    return between.strip()

In [7]:
class NameToTitle(BaseEstimator, TransformerMixin):
    """Transformer that adds a 'Title' column derived from the 'Name' column.

    Nothing is learned from the data; ``fit`` exists only to satisfy the
    scikit-learn transformer interface.
    """

    def fit(self, X, y=None):
        """No-op fit; returns ``self`` unchanged."""
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a new DataFrame equal to ``X`` plus a 'Title' column.

        'Title' holds ``extract_title(name)`` for every value of ``X['Name']``.
        An existing 'Title' column is overwritten in the returned frame.
        """
        # assign() builds a new frame, so the caller's DataFrame is not modified.
        return X.assign(Title=X['Name'].apply(extract_title))

In [8]:
class GenericMissingValueFiller(BaseEstimator, TransformerMixin):
    """Transformer that replaces missing values in one column with a constant.

    Parameters:
        attr_name: label of the column to fill.
        value: replacement used for every missing entry in that column.
    """

    def __init__(self, attr_name, value):
        self.attr_name = attr_name
        self.value = value

    def fit(self, X, y=None):
        """No-op fit; the fill value is fixed at construction time."""
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with missing ``attr_name`` values set to ``value``."""
        # Work on a copy and assign the filled column back explicitly.
        # The chained `X[col].fillna(..., inplace=True)` form mutates the
        # caller's frame and has no effect at all under pandas Copy-on-Write.
        X_filled = X.copy()
        X_filled[self.attr_name] = X_filled[self.attr_name].fillna(self.value)
        return X_filled

In [9]:
class MostCommonValueFiller(BaseEstimator, TransformerMixin):
    """Transformer that fills missing values in one column with its most frequent value.

    Parameters:
        attr_name: label of the column to fill.

    Attributes:
        most_common_value: the most frequent non-null value seen by ``fit``
            (``None`` until ``fit`` has been called).
    """

    def __init__(self, attr_name):
        self.attr_name = attr_name
        self.most_common_value = None

    def fit(self, X, y=None):
        """Learn the most frequent non-null value of ``attr_name`` in ``X``.

        Raises:
            IndexError: if the column contains no non-null values.
        """
        # value_counts() ignores NaN and sorts by descending frequency,
        # so the first index label is the most common value.
        self.most_common_value = X[self.attr_name].value_counts().index[0]
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with missing ``attr_name`` values filled."""
        # Assign on a copy instead of chained `fillna(..., inplace=True)`,
        # which mutates the caller's frame and is a no-op under pandas
        # Copy-on-Write.
        X_filled = X.copy()
        X_filled[self.attr_name] = X_filled[self.attr_name].fillna(self.most_common_value)
        return X_filled

In [10]:
class AgeMissingValueFiller(BaseEstimator, TransformerMixin):
    """Fill missing 'Age' values with the mean age of the row's 'Title' group.

    The per-title means are learned in ``fit`` and reused in ``transform``, so
    data transformed later is filled with statistics from the fitting data.
    The mean is truncated to a whole number before being written.

    Requires 'Title' and 'Age' columns in the input frame.

    Attributes (set by ``fit``):
        title_age_means_: Series of mean 'Age' indexed by 'Title'.
        overall_age_mean_: mean 'Age' over all rows; used when a title is
            unknown or its group has no known age.
    """

    def fit(self, X, y=None):
        """Learn the mean age per title and the overall mean age from ``X``."""
        self.title_age_means_ = X.groupby('Title')['Age'].mean()
        self.overall_age_mean_ = X['Age'].mean()
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with missing 'Age' values filled in."""
        if hasattr(self, 'title_age_means_'):
            title_means = self.title_age_means_
            fallback_age = self.overall_age_mean_
        else:
            # Backward compatibility: this transformer used to be stateless,
            # so calling transform() without fit() derives the statistics
            # from the frame being transformed.
            title_means = X.groupby('Title')['Age'].mean()
            fallback_age = X['Age'].mean()

        X_filled = X.copy()
        # Boolean mask of rows whose Age is missing.
        missing = X_filled['Age'].isnull()
        if missing.any():
            # Look up each row's title mean; unseen titles and titles whose
            # mean is NaN fall back to the overall mean instead of raising.
            fill_ages = (
                X_filled.loc[missing, 'Title']
                .map(title_means)
                .fillna(fallback_age)
            )
            # Floor division drops the fractional part; for non-negative ages
            # this matches the int() truncation used before, and it leaves NaN
            # as NaN rather than raising.
            X_filled.loc[missing, 'Age'] = fill_ages // 1
        return X_filled

In [11]:
# Columns treated as categorical (one-hot encoded downstream).
cat_attributes = [
    'Pclass',
    'Title',
    'Sex',
    'Cabin',
    'Embarked',
]

# Columns treated as numeric (standardised downstream).
num_attributes = [
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]

In [12]:
# Stage 1: DataFrame-level feature engineering and imputation.
# 'name_to_title' must run before 'age_fillna', which groups rows by 'Title'.
pipeline = Pipeline([
    ('name_to_title', NameToTitle()),
    ('cabin_fillna', GenericMissingValueFiller(attr_name='Cabin', value='General')),
    ('age_fillna', AgeMissingValueFiller()),
    ('embarked_fillna', MostCommonValueFiller(attr_name='Embarked')),
])

In [13]:
# Stage 2: scale the numeric columns and one-hot encode the categorical ones.
# Columns not listed in either group are dropped (ColumnTransformer default).
col_transformer = ColumnTransformer([
    ('num_attrs', StandardScaler(), num_attributes),
    # handle_unknown='ignore' encodes categories not seen during fit (e.g. a
    # new Cabin or Title value) as all zeros instead of raising at transform
    # time. The encoding of the fitted data is unchanged.
    ('cat_attrs', OneHotEncoder(handle_unknown='ignore'), cat_attributes),
])

In [14]:
# End-to-end preprocessing: DataFrame clean-up first, then column-wise encoding.
full_pipeline = Pipeline([
    ('initial_staging', pipeline),
    ('final_staging', col_transformer),
])

In [15]:
# Fit the preprocessing on the training data and build the feature matrix.
# A copy is passed so that pipeline steps which write into their input cannot
# alter the raw frame displayed in the earlier cells.
X_train = full_pipeline.fit_transform(titanic_train_df.copy())

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [16]:
from sklearn.ensemble import RandomForestClassifier

# Baseline 1: random forest, seeded for repeatable results.
rf_clf = RandomForestClassifier(random_state=42)

# 3-fold cross-validated accuracy on the preprocessed training matrix.
# NOTE(review): X_train was produced by fitting the preprocessing on the whole
# training set, so every validation fold has influenced the scaler/encoder.
cross_val_score(
    estimator=rf_clf,
    X=X_train,
    y=y_train,
    cv=3,
    scoring='accuracy',
)



array([0.81481481, 0.82154882, 0.8013468 ])

In [17]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline 2: k-nearest neighbours with the library's default settings.
knn_clf = KNeighborsClassifier()

# 3-fold cross-validated accuracy on the preprocessed training matrix.
cross_val_score(
    estimator=knn_clf,
    X=X_train,
    y=y_train,
    cv=3,
    scoring='accuracy',
)

array([0.8013468 , 0.82154882, 0.82154882])