In [6]:
import pandas as pd

train = pd.read_csv('train.csv')
train.tail()

# age에 NaN값이 존재

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [7]:
# 전처리(모든 age값의 평균으로 채워 넣기)
from sklearn.base import BaseEstimator, TransformerMixin

class AgeTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X['Age'].fillna(X['Age'].mean(), inplace=True)
        X['Age'] = X['Age'].astype('int')
        
        return X

In [8]:
age_transform = AgeTransformer()
age_transform.fit_transform(train).tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32,0,0,370376,7.75,,Q


In [9]:
# 전처리(Name값을 first name만 표시)
from sklearn.base import BaseEstimator, TransformerMixin

class NameTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.name_ls = []
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        for name_idx in range(len(X['Name'])):
            self.name_ls.append(X['Name'][name_idx].split(',')[0])
        X['Name'] = self.name_ls
        
        return X

In [10]:
name_transform = NameTransformer()
name_transform.fit_transform(train).tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,Montvila,male,27,0,0,211536,13.0,,S
887,888,1,1,Graham,female,19,0,0,112053,30.0,B42,S
888,889,0,3,Johnston,female,29,1,2,W./C. 6607,23.45,,S
889,890,1,1,Behr,male,26,0,0,111369,30.0,C148,C
890,891,0,3,Dooley,male,32,0,0,370376,7.75,,Q


In [11]:
# 위의 Age 변환기와 Name 변환기를 연결하는 Pipeline
from sklearn.pipeline import Pipeline

titanic_pipeline = Pipeline([
    ('age_transform', AgeTransformer()),
    ('name_transform', NameTransformer()),
])

In [13]:
trans_train = titanic_pipeline.fit_transform(train)
trans_train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,Montvila,male,27,0,0,211536,13.0,,S
887,888,1,1,Graham,female,19,0,0,112053,30.0,B42,S
888,889,0,3,Johnston,female,29,1,2,W./C. 6607,23.45,,S
889,890,1,1,Behr,male,26,0,0,111369,30.0,C148,C
890,891,0,3,Dooley,male,32,0,0,370376,7.75,,Q
