Estimators
- Predictors
  - Regressors
  - Classifiers
- Transformers

Scikit-learn has a base class called BaseEstimator that all estimators inherit.

Depending on its type, each model also inherits one of these mixin classes:
- RegressorMixin
- ClassifierMixin
- TransformerMixin
We will use these in custom estimators.

Metrics for regression:
- MSE
- RMSE
- MAE
- R2 (quantifies how this model's MSE compares to that of a naive model that always predicts the mean; a value < 0 indicates a bad model, i.e. one that does worse than that baseline).

Metrics for classification:
- Accuracy, precision, recall

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

  import pandas.util.testing as tm


In [2]:
# Mount Google Drive inside the Colab runtime so files under
# /content/gdrive can be read like local files.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# Load the 50_Startups spreadsheet from the mounted Drive and preview the first rows.
df=pd.read_excel('/content/gdrive/My Drive/Colab Notebooks/ML DL show/50_Startups.xlsx')
df.head()
# Other quick ways to inspect a DataFrame:
# df.tail()              -> last 5 rows
# df.head(3), df.tail(2) -> first 3 rows / last 2 rows
# df.sample(n)           -> n randomly selected rows

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [16]:
# Show the last five rows; note the zeros in the spend columns at the bottom.
df.tail()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
45,1000.23,124153.04,1903.93,New York,64926.08
46,1315.46,115816.21,297114.46,Florida,49490.75
47,0.0,135426.92,0.0,California,42559.73
48,542.05,51743.15,0.0,New York,35673.41
49,0.0,116983.8,45173.06,California,14681.4


# Custom Transformer

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class OutlierReplacer(BaseEstimator, TransformerMixin):
    """Replace extreme values in each column with percentile bounds.

    ``fit`` learns, for every column, the ``q_lower``-th and ``q_upper``-th
    percentiles. ``transform`` then raises values below the lower bound up to
    that bound and lowers values above the upper bound down to that bound.

    Both pandas DataFrames and NumPy arrays are accepted; the output has the
    same container type as the input.

    Parameters
    ----------
    q_lower : float
        Percentile (0-100) used as the per-column lower bound.
    q_upper : float
        Percentile (0-100) used as the per-column upper bound.
    """

    def __init__(self, q_lower, q_upper):
        self.q_lower = q_lower
        self.q_upper = q_upper

    def fit(self, X, y=None):
        """Compute the per-column percentile bounds from ``X``.

        ``y`` is ignored; it is accepted only so the transformer fits the
        scikit-learn ``fit(X, y)`` calling convention.

        Returns
        -------
        self
            The fitted transformer, as scikit-learn transformers do.
        """
        self.upper = np.percentile(X, self.q_upper, axis=0)
        self.lower = np.percentile(X, self.q_lower, axis=0)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with out-of-bound values replaced.

        The input is never modified. Comparisons with NaN are False, so
        missing values are left as they are.
        """
        if isinstance(X, pd.DataFrame):
            Xt = X.copy()
            below = (X < self.lower).values  # value is less than self.lower
            above = (X > self.upper).values  # value is more than self.upper
            for i in range(X.shape[-1]):
                Xt.iloc[below[:, i], i] = self.lower[i]
                Xt.iloc[above[:, i], i] = self.upper[i]
            return Xt

        # Array input (e.g. the output of a previous pipeline step): the
        # bounds broadcast across rows, so no per-column loop is needed.
        values = np.asarray(X)
        Xt = np.where(values < self.lower, self.lower, values)
        return np.where(values > self.upper, self.upper, Xt)

In [None]:
# List the column labels of the dataset.
df.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit'], dtype='object')

In [None]:
# 5th percentile of each numeric column ('State' holds text, so it is dropped first).
np.percentile(df.drop(columns=['State']), 5, axis=0)

array([  748.231 , 73613.302 ,   856.7685, 45678.689 ])

In [None]:
# 95th percentile of each numeric column ('State' holds text, so it is dropped first).
np.percentile(df.drop(columns=['State']), 95, axis=0)

array([149360.415, 155763.844, 396803.826, 187383.61 ])

In [None]:
# Replace outliers in every column except 'State' with the 5th / 95th
# percentile of that column, then look at the last four rows.
replacer = OutlierReplacer(q_lower=5, q_upper=95)
Xt = replacer.fit_transform(df.drop(columns=['State']))
Xt.iloc[-4:]

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
46,1315.46,115816.21,297114.46,49490.75
47,748.231,135426.92,856.7685,45678.689
48,748.231,73613.302,856.7685,45678.689
49,748.231,116983.8,45173.06,45678.689


In [None]:
from sklearn.compose import make_column_transformer

# The first three columns are the numeric spend features; only those are
# passed through OutlierReplacer, the remaining columns are kept unchanged.
num_cols = list(df.columns[:3])
column_trans3 = make_column_transformer(
    (OutlierReplacer(5, 95), num_cols),
    remainder='passthrough',
)
column_trans3.fit_transform(df)

array([[149360.41499999998, 136897.8, 396803.8259999999, 'New York',
        192261.83],
       [149360.41499999998, 151377.59, 396803.8259999999, 'California',
        191792.06],
       [149360.41499999998, 101145.55, 396803.8259999999, 'Florida',
        191050.39],
       [144372.41, 118671.85, 383199.62, 'New York', 182901.99],
       [142107.34, 91391.77, 366168.42, 'Florida', 166187.94],
       [131876.9, 99814.71, 362861.36, 'New York', 156991.12],
       [134615.46, 147198.87, 127716.82, 'California', 156122.51],
       [130298.13, 145530.06, 323876.68, 'Florida', 155752.6],
       [120542.52, 148718.95, 311613.29, 'New York', 152211.77],
       [123334.88, 108679.17, 304981.62, 'California', 149759.96],
       [101913.08, 110594.11, 229160.95, 'Florida', 146121.95],
       [100671.96, 91790.61, 249744.55, 'California', 144259.4],
       [93863.75, 127320.38, 249839.44, 'Florida', 141585.52],
       [91992.39, 135495.07, 252664.93, 'California', 134307.35],
       [119943.24, 

## Missing value custom imputer

In [4]:
# Work on a copy so df itself stays intact, then blank out one cell per row in
# the last four rows (column positions 0, 1, 2 and 4) to simulate missing data.
df_new = df.copy()
for row, col in zip(range(46, 50), (0, 1, 2, 4)):
    df_new.iloc[row, col] = np.nan
df_new.tail()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
45,1000.23,124153.04,1903.93,New York,64926.08
46,,115816.21,297114.46,Florida,49490.75
47,0.0,,0.0,California,42559.73
48,542.05,51743.15,,New York,35673.41
49,0.0,116983.8,45173.06,California,


In [5]:
# Mean of each numeric feature, skipping NaNs. DataFrame.mean() is called
# directly because np.mean(df) forwards axis=None, which newer pandas
# versions reduce over both axes to a single scalar.
df_new.drop(['State'], axis=1).mean().values

array([ 75199.2922449 , 121057.24612245, 215331.73244898, 113998.99102041])

In [None]:
# Boolean mask marking the missing cells in the numeric columns.
df_new.loc[
    :,
    ['R&D Spend', 'Administration', 'Marketing Spend', 'Profit'],
].isna()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
5,False,False,False,False
6,False,False,False,False
7,False,False,False,False
8,False,False,False,False
9,False,False,False,False


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class MeanMedianImputer(BaseEstimator, TransformerMixin):
    """Fill missing values with each column's mean or median.

    ``fit`` computes one statistic per column, ignoring missing values.
    ``transform`` writes that statistic into every missing cell of the
    corresponding column.

    Both pandas DataFrames and NumPy arrays are accepted; the output has the
    same container type as the input.

    Parameters
    ----------
    strategy : {'mean', 'median'}
        Statistic used as the fill value.
    """

    def __init__(self, strategy):
        self.strategy = strategy

    def fit(self, X, y=None):
        """Compute the per-column fill values from ``X``.

        ``y`` is ignored; it is accepted only so the transformer fits the
        scikit-learn ``fit(X, y)`` calling convention.

        Raises
        ------
        ValueError
            If ``strategy`` is neither ``'mean'`` nor ``'median'``.
        """
        if self.strategy not in ('mean', 'median'):
            raise ValueError(
                "strategy must be 'mean' or 'median', got {!r}".format(self.strategy)
            )

        # pandas reductions skip NaN by default. np.median would instead
        # return NaN for any column with a missing value, and it returns a
        # plain array that has no `.values` attribute.
        frame = pd.DataFrame(X)
        if self.strategy == 'mean':
            stats = frame.mean(axis=0)
        else:
            stats = frame.median(axis=0)
        self.val = stats.values
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing cells replaced by the fitted values.

        The input is never modified.
        """
        if isinstance(X, pd.DataFrame):
            Xt = X.copy()
            missing = X.isna().values
            for i in range(X.shape[-1]):
                Xt.iloc[missing[:, i], i] = self.val[i]
            return Xt

        # Array input: the fill values broadcast across rows.
        values = np.asarray(X, dtype=float)
        return np.where(np.isnan(values), self.val, values)



In [None]:
from sklearn.compose import make_column_transformer

In [None]:
# Impute the numeric columns with their mean and pass the remaining column
# through unchanged; show the last five rows of the result.
column_trans4 = make_column_transformer(
    (
        MeanMedianImputer(strategy='mean'),
        ['R&D Spend', 'Administration', 'Marketing Spend', 'Profit'],
    ),
    remainder='passthrough',
)
column_trans4.fit_transform(df_new)[-5:]

array([[1000.23, 124153.04, 1903.93, 64926.08, 'New York'],
       [75199.29224489795, 115816.21, 297114.46, 49490.75, 'Florida'],
       [0.0, 121057.24612244894, 0.0, 42559.73, 'California'],
       [542.05, 51743.15, 215331.73244897963, 35673.41, 'New York'],
       [0.0, 116983.8, 45173.06, 113998.99102040817, 'California']],
      dtype=object)

In [None]:
# df_new still shows its NaNs: the imputed values live only in the array
# returned by fit_transform, not in the input frame.
df_new.tail()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
45,1000.23,124153.04,1903.93,New York,64926.08
46,,115816.21,297114.46,Florida,49490.75
47,0.0,,0.0,California,42559.73
48,542.05,51743.15,,New York,35673.41
49,0.0,116983.8,45173.06,California,


In [None]:
# The original df is unchanged too, since df_new was created from df.copy().
df.tail()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
45,1000.23,124153.04,1903.93,New York,64926.08
46,1315.46,115816.21,297114.46,Florida,49490.75
47,0.0,135426.92,0.0,California,42559.73
48,542.05,51743.15,0.0,New York,35673.41
49,0.0,116983.8,45173.06,California,14681.4


## Custom Regressor

In [7]:
from sklearn.base import BaseEstimator, RegressorMixin

class MeanRegressor(BaseEstimator, RegressorMixin):
    """Baseline regressor that predicts the mean of the training targets."""

    def __init__(self):
        """Create the regressor; it has no hyperparameters."""

    def fit(self, X, y):
        """Store the mean of ``y``. ``X`` is accepted but not used."""
        target_mean = np.mean(y)
        self.y_mean = target_mean
        return self

    def predict(self, X):
        """Return the stored mean once for every row of ``X``."""
        n_samples = X.shape[0]
        return self.y_mean * np.ones(n_samples)

In [8]:
# Separate the target column ('Profit') from the remaining feature columns.
y = df['Profit']
X = df.drop(columns=['Profit'])

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [12]:
# Fit the baseline on the training split (fit returns the estimator itself)
# and predict for the held-out rows: every prediction is the training mean.
reg = MeanRegressor().fit(X_train, y_train)
reg.predict(X_test)

array([110888.21575758, 110888.21575758, 110888.21575758, 110888.21575758,
       110888.21575758, 110888.21575758, 110888.21575758, 110888.21575758,
       110888.21575758, 110888.21575758, 110888.21575758, 110888.21575758,
       110888.21575758, 110888.21575758, 110888.21575758, 110888.21575758,
       110888.21575758])

In [15]:
# score() comes from RegressorMixin and reports R^2 on the given data.
# A mean-only model is the naive baseline R^2 compares against, so the
# result is close to zero (here slightly negative on the test split).
reg.score(X_test,y_test)

-0.007686370775541906