In [12]:
import warnings
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin


In [3]:
#impute missing values: column mean for non-object columns, 'missing' for object columns
class DataFrameImputer(BaseEstimator, TransformerMixin):
    """Fill missing values column by column with values learned in ``fit``.

    Object-dtype columns are filled with the literal string ``'missing'``;
    every other column is filled with that column's mean, computed on the
    frame passed to ``fit``.
    """

    def __init__(self):
        # Column name -> fill value; populated by fit().
        self.impute_dict={}
        # Columns seen by the most recent fit().
        self.feature_name=[]

    def fit(self,x,y=None):
        """Learn one fill value per column of ``x``.

        Parameters
        ----------
        x : pandas.DataFrame
            Frame whose columns define the fill values.
        y : ignored
            Accepted so the transformer can sit inside a Pipeline.

        Returns
        -------
        self
        """
        # Build a fresh dict so re-fitting on a frame with different columns
        # does not keep fill values from an earlier fit.
        impute_dict={}
        for col in x.columns:
            if x[col].dtype=='O':
                impute_dict[col]='missing'
            else:
                impute_dict[col]=x[col].mean()
        self.impute_dict=impute_dict
        self.feature_name=x.columns
        return self

    def transform(self,x):
        """Return a copy of ``x`` with missing values replaced by the learned fill values."""
        return x.fillna(self.impute_dict)

    def get_feature_names(self):
        """Return the column labels seen during ``fit``."""
        # The attribute is stored as ``feature_name`` (singular); the original
        # getter read ``feature_names`` and raised AttributeError.
        return self.feature_name

In [3]:
# Toy frame with one missing numeric value and one missing string value.
d = pd.DataFrame(
    dict(
        age=[20, 15, np.nan, 30, 35, 40],
        city=['delhi', np.nan, 'pune', 'delhi', 'hyderabad', 'agra'],
    )
)

In [4]:
# Show the sample frame before imputation.
d

Unnamed: 0,age,city
0,20.0,delhi
1,15.0,
2,,pune
3,30.0,delhi
4,35.0,hyderabad
5,40.0,agra


In [5]:
# Create an unfitted imputer; fill values are learned later by fit().
a=DataFrameImputer()

In [6]:
# Learn the per-column fill values from d (fit returns the imputer itself).
a.fit(d)

DataFrameImputer()

In [7]:
# Apply the learned fill values to d; the filled frame is displayed, d is not reassigned.
a.transform(d)

Unnamed: 0,age,city
0,20.0,delhi
1,15.0,missing
2,28.0,pune
3,30.0,delhi
4,35.0,hyderabad
5,40.0,agra


In [19]:
# Second frame, not used for fitting, with its own missing values.
test = pd.DataFrame(
    dict(
        age=[20, 19, np.nan],
        city=['delhi', np.nan, 'latur'],
    )
)

In [20]:
# Fill the new frame using the values learned from d, not statistics of test itself.
a.transform(test)

Unnamed: 0,age,city
0,20.0,delhi
1,19.0,missing
2,28.0,latur


In [4]:
class VarSelector(BaseEstimator, TransformerMixin):
    """Select a fixed subset of columns from a DataFrame.

    Parameters
    ----------
    var_names : column label or list of labels
        Passed directly to ``X[...]`` in ``transform``.
    """

    def __init__(self, var_names):
        # BaseEstimator.get_params() looks up each constructor argument by its
        # own name, so ``var_names`` must exist as an attribute; without it
        # repr(), clone() and set_params() raise AttributeError.
        self.var_names=var_names
        # Kept for existing code that reads ``feature_names`` directly.
        self.feature_names=var_names

    def fit(self, x,y=None):
        """Nothing is learned; re-sync ``feature_names`` in case ``set_params`` changed ``var_names``."""
        self.feature_names=self.var_names
        return self

    def transform(self,X):
        """Return ``X`` restricted to the selected columns."""
        return X[self.feature_names]

    def get_feature_names(self):
        """Return the selected column labels."""
        return self.feature_names

In [5]:
class convert_to_numeric(BaseEstimator, TransformerMixin):
    """Convert every column of a DataFrame to a numeric dtype.

    Values that cannot be parsed become NaN (``errors='coerce'``).
    """

    def __init__(self):
        # Column labels seen during fit().
        self.feature_names=[]

    def fit(self, x,y=None):
        """Record the column labels of ``x``; nothing else is learned."""
        self.feature_names=x.columns
        return self

    def transform(self,X):
        """Return a numeric copy of ``X``.

        The original version assigned the converted columns back into ``X``
        itself, silently changing the caller's frame (and triggering
        SettingWithCopyWarning when ``X`` was a slice). Work on a copy instead.
        """
        converted=X.copy()
        for col in converted.columns:
            converted[col]=pd.to_numeric(converted[col],errors='coerce')
        return converted

    def get_feature_names(self):
        """Return the column labels seen during ``fit``."""
        return self.feature_names

In [6]:
class custom_fico(BaseEstimator, TransformerMixin):
    """Turn the ``'FICO.Range'`` column into a single numeric ``fico`` column.

    Each value is split on ``'-'`` and the first two parts are averaged.

    Parameters
    ----------
    var_names : mapping
        Must support ``var_names['fico']``; that entry is what
        ``get_feature_names`` reports.
        NOTE(review): presumably a dict of output names per transformer; confirm against the caller.
    """

    def __init__(self, var_names):
        # BaseEstimator.get_params() looks up each constructor argument by its
        # own name; without this attribute repr(), clone() and set_params()
        # raise AttributeError.
        self.var_names=var_names
        self.feature_names=var_names['fico']

    def fit(self, x,y=None):
        """Nothing is learned; re-sync ``feature_names`` in case ``set_params`` changed ``var_names``."""
        self.feature_names=self.var_names['fico']
        return self

    def transform(self,X):
        """Return a one-column DataFrame ``fico`` holding the midpoint of each range."""
        # expand=True yields one column per '-'-separated part; columns 0 and 1
        # are the two bounds that get averaged.
        k=X['FICO.Range'].str.split('-',expand=True).astype(float)
        fico=0.5*(k[0]+k[1])
        return pd.DataFrame({'fico':fico})

    def get_feature_names(self):
        """Return the ``'fico'`` entry of ``var_names``."""
        return self.feature_names

In [7]:
class string_clean(BaseEstimator, TransformerMixin):
    """Replace a substring in every (string) column of a DataFrame.

    Parameters
    ----------
    replace_it : str, default ''
        Pattern to search for.
    reaplce_with : str, default ''
        Replacement text. The misspelled parameter name is part of the public
        interface and is kept so existing callers keep working.
    """

    def __init__(self, replace_it='', reaplce_with=''):
        self.replace_it=replace_it
        self.reaplce_with=reaplce_with
        # Column labels seen during fit().
        self.feature_names=[]

    def fit(self, x,y=None):
        """Record the column labels of ``x``; nothing else is learned."""
        self.feature_names=x.columns
        return self

    def transform(self,X):
        """Return a copy of ``X`` with the replacement applied to each column.

        Whether ``replace_it`` is treated as a regex or a literal follows the
        default of ``Series.str.replace`` in the installed pandas version.
        """
        # Work on a copy so the caller's frame is not modified in place.
        cleaned=X.copy()
        for col in cleaned.columns:
            # The original read ``self.replace_with``, an attribute that is
            # never set, and raised AttributeError on every call.
            cleaned[col]=cleaned[col].str.replace(self.replace_it, self.reaplce_with)
        return cleaned

    def get_feature_names(self):
        """Return the column labels seen during ``fit``."""
        return self.feature_names

In [8]:
class get_dummies_Pipe(BaseEstimator, TransformerMixin):
    """Create 0/1 dummy columns for the categories learned in ``fit``.

    Parameters
    ----------
    freq_cutoff : int, default 0
        Categories whose count is at or below this value get no dummy column.
        If no category of a column falls at or below the cutoff, the last
        entry of ``value_counts()`` is dropped instead, so one level is always
        left without a dummy.
    """

    def __init__(self, freq_cutoff=0):
        self.freq_cutoff=freq_cutoff
        # Column name -> categories that receive a dummy column; set by fit().
        self.var_cat_dict={}
        # Names of the generated dummy columns; set by fit().
        self.feature_names=[]

    def fit(self, x,y=None):
        """Learn, for every column of ``x``, which categories get a dummy column.

        Returns
        -------
        self
        """
        # Build fresh containers so a re-fit does not duplicate feature names
        # or keep columns from a previous fit.
        var_cat_dict={}
        for col in x.columns:
            # The original indexed ``x[cols]``, an undefined name (NameError).
            k=x[col].value_counts()
            if (k<=self.freq_cutoff).sum()==0:
                # Nothing is rare enough to drop: leave out the last category.
                cats=k.index[:-1]
            else:
                cats=k.index[k>self.freq_cutoff]
            var_cat_dict[col]=cats

        self.var_cat_dict=var_cat_dict
        # f-string formatting also works for non-string category values.
        self.feature_names=[
            f'{col}_{cat}'
            for col,cats in var_cat_dict.items()
            for cat in cats
        ]
        return self

    def transform(self,x,y=None):
        """Return a copy of ``x`` with each fitted column replaced by its dummy columns.

        Columns that were not present during ``fit`` are passed through unchanged.
        """
        dummy_data=x.copy()
        for col,cats in self.var_cat_dict.items():
            for cat in cats:
                dummy_data[f'{col}_{cat}']=(dummy_data[col]==cat).astype(int)
            del dummy_data[col]
        return dummy_data

    def get_feature_names(self):
        """Return the names of the dummy columns created by ``transform``."""
        return self.feature_names