ML real world problems are varied and may require some tailor to suit the need. In this tutorial, we exaplore an simple example to create a simple estimator [readme](http://scikit-learn.org/dev/developers/contributing.html#rolling-your-own-estimator).   
1. At first, we need to choose one of these: __Classifier, Clusterring, Regressor and Transformer__. The classifier is self-explanatory, we give some input X and get the class of which it probably belongs (e.g. Naive Bayes Classifier). An example of Regressor is e.g. Linear Regression which get input X and get estimations of variable Y. Another example, Transformer, is for transforming the data -- it takes X and returns changed X. An example of this might be PCA.   
2. After that we need to decide which one suits to our needs our subclass __BaseEstimator__ and an appropriate class for your type (one of ClassifierMixin, RegressorMixin, ClusterMixin, TransformerMixin).  
 [ExtraReading](http://danielhnyk.cz/creating-your-own-estimator-scikit-learn/)  
 A custom sklearn  estimator consists of at least three methods.

An  __init__ initialization method   
A __fit__ method to train the estimator   
A __predict__ method to perform a prediction on unseen data

# Example, inherit from the classes BaseEstimator, ClassifierMixin
class RidgeClassifier(BaseEstimator, ClassifierMixin):      
       
    def __init__(self,param1,param2):      
        self.param1 = param1      
        self.param2 = param2      
    def fit(self, X, y=None):      
        # do something   
        return self   
    def predict(self, X_test):   
        # do something   
        return y_pred   

In [6]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin

In [5]:
bc = load_breast_cancer()
new_feature_names = ['_'.join(e.split()) for e in bc.feature_names]

X = pd.DataFrame(bc.data, columns =new_feature_names)
y =bc.target

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state =7, stratify =y)


Create a new class

In [9]:
class RidgeClassifier(BaseEstimator, ClassifierMixin):
    """A classifier made from Ridge Regression"""
    def __init__(self, alpha=0):
        self.alpha = alpha
    def fit(self,X,y = None) :
        # passalong the alpha parameter to theinernal ridge estimator and perform a fit using it
        self.ridge_regressor = Ridge(alpha =self.alpha)
        self.ridge_regressor.fit(X,y)
          # save the seen class labels
        self.class_labels =np.unique(y)   
        return self
    
    def predict(self,X_test):
        # store the results of the internal ridge regressor estimator
        results = self.ridge_regressor.predict(X_test)
        # find the nearest class labels
        
        return np.array([self.class_labels[np.abs(self.class_labels - x).argmin()] for x in results])
        

Apply a new classifier

In [10]:
r_classifier = RidgeClassifier(1.5)
r_classifier.fit(X_train, y_train)
r_classifier.score(X_test, y_test)

0.95744680851063835

Excute hyper parameters

In [16]:
from sklearn.model_selection import GridSearchCV
param_grid = {'alpha':[0,0.5,1.0,1.5,2.0]}
gs_rc = GridSearchCV(RidgeClassifier(), param_grid,cv =3).fit(X_train,y_train)

gs_rc.cv_results_

{'mean_fit_time': array([ 0.00199978,  0.00200105,  0.00266838,  0.00133467,  0.00166138]),
 'mean_score_time': array([ 0.00149965,  0.00133332,  0.00100048,  0.00066646,  0.00134802]),
 'mean_test_score': array([ 0.94750656,  0.95800525,  0.96062992,  0.96062992,  0.96062992]),
 'mean_train_score': array([ 0.96060881,  0.95142756,  0.95272442,  0.95403675,  0.95403675]),
 'param_alpha': masked_array(data = [0 0.5 1.0 1.5 2.0],
              mask = [False False False False False],
        fill_value = ?),
 'params': [{'alpha': 0},
  {'alpha': 0.5},
  {'alpha': 1.0},
  {'alpha': 1.5},
  {'alpha': 2.0}],
 'rank_test_score': array([5, 4, 1, 1, 1]),
 'split0_test_score': array([ 0.953125,  0.96875 ,  0.96875 ,  0.96875 ,  0.96875 ]),
 'split0_train_score': array([ 0.9486166 ,  0.94466403,  0.94071146,  0.94071146,  0.94071146]),
 'split1_test_score': array([ 0.94488189,  0.96062992,  0.96850394,  0.96850394,  0.96850394]),
 'split1_train_score': array([ 0.96850394,  0.95275591,  0.95275591

In [17]:
r_classifier.score(X_test,y_test)

0.95744680851063835

# execute Logistic Regression

In [18]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train,y_train)
lr.score(X_test, y_test)

0.9521276595744681

## Build a new classifier to work with sklearn from other packages
 The following problem is credited from [Scikitlearn_cookbook 2 edition]. There we use __general estimating equation__ (GEE) from [this](http://www.statsmodels.org/dev/gee.html)

In [23]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import Ridge
from sklearn.base import BaseEstimator, ClassifierMixin

In [30]:
class GEEClassifier(BaseEstimator, ClassifierMixin):
    """ A Classifier made from statsmodels' Generalized Estimating Equations at http://www.statsmodels.org/dev/gee.html"""
    def __init__(self, group_by_feature):
        self.group_by_feature = group_by_feature
        
    def fit(self,X,y =None):
        self.fam = sm.families.Poisson()
        self.ind = sm.cov_struct.Exchangeable()
        def expand_X(X,y,desired_group):
            X_plus = X.copy()
            X_plus['y'] = y
            X_plus[desired_group+'group'] = (X_plus[desired_group] *10)//10
            return X_plus
        self.class_labels = np.unique(y)
        dataframe_feature_names = X.columns
        not_group_by_features =[x for x in dataframe_feature_names if x !=self.group_by_feature]
        formula_in ='y~'+'+'.join(not_group_by_features)
        data = expand_X(X,y, self.group_by_feature)
        self.mod = smf.gee(formula_in,
                          self.group_by_feature +"_group",
                          data,
                          cov_struct = self.ind,
                          family = self.fam)
        self.res = self.mod.fit()
        return self
    
    def predict(self,X_test):
        results = self.res.predict(X_test)
        return np.array([self.class_labels[np.abs(self.class_labels -x).argmin()] for x in results])
    
    def print_fit_summary(self):
        print(res.summary())
        return self

In [31]:
gee_classifier = GEEClassifier('mean_concavity')
gee_classifier.fit(X_train, y_train)
gee_classifier.score(X_test,y_test)

KeyError: 'mean_concavity_group'

In [32]:
X_train.head()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,...,worst_radius,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension
85,18.46,18.52,121.1,1075.0,0.09874,0.1053,0.1335,0.08795,0.2132,0.06022,...,22.93,27.68,152.2,1603.0,0.1398,0.2089,0.3157,0.1642,0.3695,0.08579
316,12.18,14.08,77.25,461.4,0.07734,0.03212,0.01123,0.005051,0.1673,0.05649,...,12.85,16.47,81.6,513.1,0.1001,0.05332,0.04116,0.01852,0.2293,0.06037
350,11.66,17.07,73.7,421.0,0.07561,0.0363,0.008306,0.01162,0.1671,0.05731,...,13.28,19.74,83.61,542.5,0.09958,0.06476,0.03046,0.04262,0.2731,0.06825
62,14.25,22.15,96.42,645.7,0.1049,0.2008,0.2135,0.08653,0.1949,0.07292,...,17.67,29.51,119.1,959.5,0.164,0.6247,0.6922,0.1785,0.2844,0.1132
153,11.15,13.08,70.87,381.9,0.09754,0.05113,0.01982,0.01786,0.183,0.06105,...,11.99,16.3,76.25,440.8,0.1341,0.08971,0.07116,0.05506,0.2859,0.06772


** Problem:** Create MeanClassifier. [from](http://danielhnyk.cz/creating-your-own-estimator-scikit-learn/)

In [3]:
import pandas as pd
dat = {'first_name': ['Bob','Ellen', np.nan, 'Anna', 'John'], 
        'last_name': ['George', 'William', np.nan,'Bush',  'Conan'], 
        'age': [18, 17, np.nan, 17, 18], 
        'sex': ['f', 'm',np.nan,  'm', 'f'], 
        'midterm': [ 89,np.nan, np.nan, 87, 90],
        'Final': [91, np.nan,94, np.nan, 94]}
df = pd.DataFrame(dat, columns = ['first_name', 'last_name', 'age', 'sex', 'midterm', 'Final'])
df

Unnamed: 0,first_name,last_name,age,sex,midterm,Final
0,Bob,George,18.0,f,89.0,91.0
1,Ellen,William,17.0,m,,
2,,,,,,94.0
3,Anna,Bush,17.0,m,87.0,
4,John,Conan,18.0,f,90.0,94.0


In [4]:
# scikit learn impute only process with numeric value, the following solution can solve this problem with category from
# https://stackoverflow.com/questions/25239958/impute-categorical-missing-values-in-scikit-learn

import numpy as np

from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.
        Columns of dtype object are imputed with the most frequent value in column.
        Columns of other types are imputed with mean of column.
        """
    def fit(self, X, y=None):
        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

#alternative, use data = [['a', 1, 2], []'b', 1, 1], ['b', 2, 2],[np.nan, np.nan, np.nan]]

df_pre = DataFrameImputer().fit_transform(df)
print('before...')
print(df)
print('after...')
print(df_pre)

before...
  first_name last_name   age  sex  midterm  Final
0        Bob    George  18.0    f     89.0   91.0
1      Ellen   William  17.0    m      NaN    NaN
2        NaN       NaN   NaN  NaN      NaN   94.0
3       Anna      Bush  17.0    m     87.0    NaN
4       John     Conan  18.0    f     90.0   94.0
after...
  first_name last_name   age sex    midterm  Final
0        Bob    George  18.0   f  89.000000   91.0
1      Ellen   William  17.0   m  88.666667   93.0
2       Anna    George  17.5   f  88.666667   94.0
3       Anna      Bush  17.0   m  87.000000   93.0
4       John     Conan  18.0   f  90.000000   94.0
