# PYTHON PLUMBING 101
*'What be we all but not simple plumbers, plumbing the depths of our knowledge.'* 

__-Benjamin Franklin__

Pipelines look more complicated than they actually are. Let's take a look at how to build our own!

<img src='images/refinery_pipes.jpg'/>

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from src.pipeline_classes import Featurizer, Imputer, Standardizer, Dummifier
import src.model as model
import pickle

In [2]:
# unzip data.zip to inflate it into a .json file
!unzip data/data.zip
# move file from the working directory to the data subdirectory
!mv data.json data/training_data.json

Archive:  data/data.zip
  inflating: data.json               


In [3]:
# load raw training data
X, y = model.load('data/training_data.json')

In [36]:
# look at data
X.head(2)

Unnamed: 0,body_length,channels,country,currency,delivery_method,description,email_domain,event_created,event_end,event_published,...,ticket_types,user_age,user_created,user_type,venue_address,venue_country,venue_latitude,venue_longitude,venue_name,venue_state
0,3852,5,US,USD,0.0,"<p><a href=""http://s432.photobucket.com/albums...",gmail.com,1262739706,1265630400,1263110000.0,...,"[{'event_id': 527017, 'cost': 25.0, 'availabil...",36,1259613950,1,717 Washington Avenue,US,25.777471,-80.133433,INK Nightclub - South Beach,FL
1,3499,0,US,USD,1.0,"<p>Join us for a quick, one-night, community-b...",ruf.org,1293832670,1296288000,1293833000.0,...,"[{'event_id': 786878, 'cost': 35.0, 'availabil...",149,1280942776,3,,US,32.776566,-79.930922,"The Charleston, SC area",SC


In [37]:
y.head(3)

0     True
1    False
2    False
Name: fraud, dtype: bool

## Let's build a pipeline and fit it!

## STEP 1: Create Pipeline

<img src='images/step1.png'/>

In [48]:
# Raw columns handed to the featurizer, in the order they should be kept.
featurizer_cols = [
    'body_length', 'channels', 'country', 'currency', 'description',
    'email_domain', 'event_created', 'event_end', 'event_published',
    'event_start', 'fb_published', 'has_analytics', 'has_header',
    'has_logo', 'listed', 'name', 'name_length', 'object_id', 'org_desc',
    'org_facebook', 'org_name', 'org_twitter', 'payee_name', 'payout_type',
    'previous_payouts', 'sale_duration', 'show_map', 'ticket_types',
    'user_age', 'user_created', 'user_type', 'venue_address',
    'venue_country', 'venue_latitude', 'venue_longitude', 'venue_name',
    'venue_state',
]

# Column -> kind for the imputer: 'cont' (continuous) or 'cat' (categorical).
imputer_col_types = {
    'body_length': 'cont',
    'channels': 'cat',
    'country': 'cat',
    'currency': 'cat',
    'fb_published': 'cat',
    'has_analytics': 'cat',
    'has_header': 'cat',
    'has_logo': 'cat',
    'listed': 'cat',
    'name_length': 'cont',
    'payout_type': 'cat',
    'sale_duration': 'cont',
    'show_map': 'cat',
    'user_age': 'cont',
    'user_type': 'cat',
    'event_duration': 'cont',
    'has_payee_name': 'cat',
    'has_previous_payouts': 'cat',
    'has_payout_type': 'cat',
    'has_facebook': 'cat',
    'has_twitter': 'cat',
}

# Categorical columns that get expanded into dummy (indicator) columns.
dummy_cols = [
    'channels', 'country', 'currency', 'fb_published', 'has_analytics',
    'has_header', 'has_logo', 'listed', 'payout_type', 'show_map',
    'user_type', 'has_payee_name', 'has_previous_payouts',
    'has_payout_type', 'has_facebook', 'has_twitter',
]

# Steps run in the order listed; the final step is the estimator.
pipe = Pipeline(steps=[
    ('featurizer', Featurizer(featurizer_cols)),
    ('imputer', Imputer(cols_dict=imputer_col_types)),
    ('dummifier', Dummifier(dummy_cols)),
    ('model', RandomForestClassifier(n_estimators=500, max_depth=25)),
])

## STEP 2: Fit the entire pipeline

<img src='images/step2.png'/>

In [8]:
# fit the classes and train the model
pipe.fit(X, y)

Pipeline(memory=None,
     steps=[('featurizer', Featurizer(cols=['body_length', 'channels', 'country', 'currency', 'description', 'email_domain', 'event_created', 'event_end', 'event_published', 'event_start', 'fb_published', 'has_analytics', 'has_header', 'has_logo', 'listed', 'name', 'name_length', 'object_id', 'org_desc',...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

## STEP 3: Deploy the pipeline

We now have a fit pipeline with a fit model!

<img src='images/step3.png'/>

### We can now do one of two things:
1. We can use the pipeline to transform our data and use the trained model to make predictions, or
2. We can pickle our pipeline object and move it to another machine to be used to transform data and make predictions.

### Option 1: transform and predict

In [9]:
new_data = pd.read_csv('data/new_data.csv')

In [10]:
# lets look at the new raw data:
new_data.head(1)

Unnamed: 0,body_length,channels,country,currency,delivery_method,description,email_domain,event_created,event_end,event_published,...,ticket_types,user_age,user_created,user_type,venue_address,venue_country,venue_latitude,venue_longitude,venue_name,venue_state
0,432,0,US,USD,0.0,"<p><span><span class=""fsl"">LOUD Championship E...",gmail.com,1365694066,1369018800,1365694000.0,...,"[{'event_id': 6225359, 'cost': 20.0, 'availabi...",1155,1265937792,1,905 Atlantic ave.,US,40.68097,-73.962861,Freecandy,NY


<img src='images/step4.png'/>

In [11]:
predictions = pipe.predict_proba(new_data)

In [12]:
# predict_proba returns one column per class; .T[1] selects the column for the
# second class. NOTE(review): y is the boolean 'fraud' flag, so this is
# presumably P(fraud) despite the variable name -- confirm with pipe.classes_.
success_probability = predictions.T[1]

In [13]:
#show first 10 probabilities
success_probability[:10]

array([2.00000000e-02, 1.40000000e-02, 0.00000000e+00, 1.06951872e-05,
       1.21527891e-02, 2.00000000e-03, 1.51649575e-01, 6.00000000e-03,
       2.02040816e-03, 0.00000000e+00])

### Option 2: Pickle, Send, Unpickle, Transform, Predict.

<img src='images/step5.png'/>

Save `pipe` object to a pickle file

In [14]:
output_file = 'data/pickled_pipe.pkl'

In [15]:
with open(output_file, 'wb') as f:
        pickle.dump(pipe, f)

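You can now `move` the pickle file anywhere you want! Just make sure the machine that unpickles it can import the same custom classes (here `src.pipeline_classes`) and has a compatible scikit-learn version installed, because a pickle stores references to the classes, not their code.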

To unpickle the `pipe` object:

In [16]:
input_file = 'data/pickled_pipe.pkl'

In [17]:
# Use a context manager so the file handle is closed once loading finishes.
# NOTE: pickle.load can execute arbitrary code -- only unpickle files that
# come from a source you trust.
with open(input_file, 'rb') as f:
    unpickled_pipe = pickle.load(f)

In [18]:
unpickled_pipe.named_steps

{'featurizer': Featurizer(cols=['body_length', 'channels', 'country', 'currency', 'description', 'email_domain', 'event_created', 'event_end', 'event_published', 'event_start', 'fb_published', 'has_analytics', 'has_header', 'has_logo', 'listed', 'name', 'name_length', 'object_id', 'org_desc', 'org_facebook', 'org_name', 'org... 'venue_address', 'venue_country', 'venue_latitude', 'venue_longitude', 'venue_name', 'venue_state']),
 'imputer': Imputer(cols_dict={'body_length': 'cont', 'channels': 'cat', 'country': 'cat', 'currency': 'cat', 'fb_published': 'cat', 'has_analytics': 'cat', 'has_header': 'cat', 'has_logo': 'cat', 'listed': 'cat', 'name_length': 'cont', 'payout_type': 'cat', 'sale_duration': 'cont', 'show_map': 'cat', 'user_age': 'cont', 'user_type': 'cat', 'event_duration': 'cont', 'has_payee_name': 'cat', 'has_previous_payouts': 'cat', 'has_payout_type': 'cat', 'has_facebook': 'cat', 'has_twitter': 'cat'}),
 'dummifier': Dummifier(cols_to_dummy=['channels', 'country', 'currenc

## SUMMARY- All the steps:

<img src='images/step6.png'/>

### Other Pipeline Patterns

<img src='images/other_pipeline.png'/>

https://medium.com/bigdatarepublic/integrating-pandas-and-scikit-learn-with-pipelines-f70eb6183696

https://github.com/madrury/regression-tools/blob/master/regression_tools/dftransformers.py

## HOW TO BUILD A CUSTOM PIPELINE CLASS

### Basic structure:

In [20]:


# framework on which to build custom sklearn pipeline classes
class PipeClassFramework(BaseEstimator, TransformerMixin):
    """Minimal template for a custom scikit-learn pipeline step.

    Copy this class, rename it, and fill in ``fit`` and ``transform``.

    Parameters
    ----------
    param : any, optional
        Placeholder for whatever configuration the step needs.
    """

    def __init__(self, param=None):
        # Store the argument under the same name it was passed in with;
        # BaseEstimator reads parameters back from attributes of that name.
        self.param = param

    def fit(self, X, y=None):
        """Learn nothing; return self so the call can be chained."""
        return self

    def transform(self, X):
        """Return a copy of X so the caller's data is never modified."""
        return X.copy()

    @staticmethod
    def helper_function(X):
        """Example helper that needs no instance state: return X.shape."""
        return X.shape
    
    

- If applicable, parameters need to be assigned in the `__init__()` method
- The name of `param` within `__init__(self, param)` needs to match the attribute name `self.param` exactly
- The `fit( )` method must return `self`, even if there is no need to fit anything
- The class should inherit traits from `BaseEstimator` and `TransformerMixin`
- The `fit( )` method must take in X and y, even though y may not be necessary. Set `y=None`. 
- The `transform( )` method takes in only X (in addition to `self`).
- It is best practice to make a copy of X (`X = X.copy()`) in the transform to avoid transforming the original dataframe.

### EXAMPLES PIPELINE CLASSES:

In [6]:


class Featurizer1(BaseEstimator, TransformerMixin):
    """Select raw columns, derive indicator/duration features, drop the rest.

    Parameters
    ----------
    cols : list of str, optional
        Columns to keep (in this order) before features are derived.
        When None, every column of the incoming DataFrame is kept.
    """

    def __init__(self, cols=None):
        self.cols = cols

    def fit(self, X, y=None):
        """Nothing is learned from the data; return self."""
        return self

    def transform(self, X):
        """Return a cleaned copy of X with derived features added.

        The input must contain every column referenced below
        (event_end, event_start, payee_name, ... and all of ``drop_list``);
        a missing column raises KeyError.
        """
        # cols=None means "keep everything" instead of crashing in .loc
        cols = list(X.columns) if self.cols is None else self.cols
        # .copy() so the caller's DataFrame is never modified
        df = X.loc[:, cols].copy()

        df['event_duration'] = df['event_end'] - df['event_start']
        df['has_payee_name'] = df['payee_name'].apply(self.is_empty)
        df['has_header'] = df['has_header'].fillna(0)
        df['has_previous_payouts'] = df['previous_payouts'].apply(self.is_empty)
        df['has_payout_type'] = df['payout_type'].apply(self.is_empty)
        df['has_facebook'] = df['org_facebook'].apply(self.is_not_zero)
        df['has_twitter'] = df['org_twitter'].apply(self.is_not_zero)
        df['country'] = df['country'].apply(self.replace_empty_with_none)

        # Raw columns that are not passed on to the later steps.
        drop_list = ['description',
                     'event_created',
                     'event_end',
                     'event_published',
                     'event_start',
                     'name',
                     'object_id',
                     'payee_name',
                     'ticket_types',
                     'user_created',
                     'venue_address',
                     'venue_country',
                     'venue_longitude',
                     'venue_latitude',
                     'venue_name',
                     'venue_state',
                     'previous_payouts',
                     'email_domain',
                     'org_name',
                     'org_twitter',
                     'org_facebook',
                     'org_desc']
        return df.drop(drop_list, axis=1)

    @staticmethod
    def is_not_zero(x):
        """Return 0 when x equals 0, otherwise 1."""
        if x == 0:
            return 0
        return 1

    @staticmethod
    def is_empty(x):
        """Return 0 when x is falsy (empty string/list, 0, None), else 1.

        Despite the name, the result is a "has a value" flag.
        NOTE(review): NaN is truthy, so a NaN cell yields 1 -- confirm that
        is the intended handling of missing values.
        """
        if not x:
            return 0
        return 1

    @staticmethod
    def max_cost(row):
        """Find the highest ticket price from a row in df['ticket_types']

        input: [{'event_id': 527017,
                'cost': 25.0,
                'availability': 1,
                'quantity_total': 800,
                'quantity_sold': 0},
                {'event_id': 527017,
                'cost': 50.0,
                'availability': 1,
                'quantity_total': 100,
                'quantity_sold': 0},
                {'event_id': 527017,
                'cost': 550.0,
                'availability': 1,
                'quantity_total': 20,
                'quantity_sold': 0}]
        output: 550.0

        An empty row returns 0. Not called by ``transform``.
        """
        maximum = 0
        for item in row:
            if item['cost'] >= maximum:
                maximum = item['cost']
        return maximum

    @staticmethod
    def replace_empty_with_none(x):
        """Return the string 'None' for a falsy x, otherwise x unchanged."""
        if not x:
            return 'None'
        return x
        
        

In [7]:


class Imputer1(BaseEstimator, TransformerMixin):
    """Fill missing values with per-column fill values learned in ``fit``.

    Parameters
    ----------
    cols_dict : dict of {str: str}, optional
        Maps a column name to its kind: 'cont' columns are filled with the
        mean seen during ``fit``; 'cat' columns are filled with the
        placeholder string 'None'. Any other kind is ignored.
    """

    def __init__(self, cols_dict=None):
        self.cols_dict = cols_dict

    def fit(self, X, y=None):
        """Save the value to impute into each column listed in cols_dict."""
        df = X
        self.averages = {}
        for col, val in (self.cols_dict or {}).items():
            if val == 'cat':
                self.averages[col] = 'None'
            elif val == 'cont':
                self.averages[col] = df.loc[:, col].mean()
        return self

    def transform(self, X):
        """Return a copy of X with NaNs replaced by the saved fill values.

        Columns that were not listed in cols_dict are passed through
        untouched instead of raising KeyError.
        """
        df = X.copy()
        for col in df.columns:
            if col in self.averages:
                df[col] = df[col].fillna(self.averages[col])
        return df
    
    

In [8]:


class Dummifier1(BaseEstimator, TransformerMixin):
    """Replace categorical columns with 0/1 dummy (indicator) columns.

    Parameters
    ----------
    cols_to_dummy : list of str, optional
        Columns to expand. Each is removed from the output and replaced by
        one ``{col}_{level}`` column per level seen during ``fit``, except
        the last level, which is dropped as the reference level.
    """

    def __init__(self, cols_to_dummy=None):
        self.cols_to_dummy = cols_to_dummy
        self.unique_items = {}

    def fit(self, X, y=None):
        """Remember the distinct values of every column to dummify."""
        df = X
        # Start fresh so a refit never keeps levels from an earlier fit.
        self.unique_items = {}
        for col in self.cols_to_dummy or []:
            self.unique_items[col] = df[col].unique()
        return self

    def transform(self, X):
        """Return a copy of X with the dummified columns appended.

        Levels not seen during ``fit`` produce all-zero rows, so the output
        always has the same columns as it did for the training data.
        """
        df = X.copy()
        cols = self.cols_to_dummy or []
        dummies = {}
        for col in cols:
            # None levels get no dummy column of their own.
            levels = [item for item in self.unique_items[col]
                      if item is not None]
            # Drop the last level of *this* column only; slicing the level
            # list (rather than the combined frame) can never remove a dummy
            # that belongs to a previous column.
            for item in levels[:-1]:
                dummies[f'{col}_{item}'] = (df[col] == item).astype(int)
        dummy_df = pd.DataFrame(dummies, index=df.index)
        df = df.drop(cols, axis=1)
        return pd.concat([df, dummy_df], axis=1)
    
    

## Assignment:
### Let's Build our own Standardizer Class!

In our example above, we are missing a Standardizer object in our pipeline that learns how to standardize given columns of a DataFrame, then standardizes them. 

First, let's run our raw data through the previous classes to see what each step gives us.

In [100]:
# get the raw data again:
X, y = model.load('data/training_data.json')

In [112]:
# Lets take a peek at X:
X.head(2)

Unnamed: 0,body_length,channels,country,currency,delivery_method,description,email_domain,event_created,event_end,event_published,...,ticket_types,user_age,user_created,user_type,venue_address,venue_country,venue_latitude,venue_longitude,venue_name,venue_state
0,3852,5,US,USD,0.0,"<p><a href=""http://s432.photobucket.com/albums...",gmail.com,1262739706,1265630400,1263110000.0,...,"[{'event_id': 527017, 'cost': 25.0, 'availabil...",36,1259613950,1,717 Washington Avenue,US,25.777471,-80.133433,INK Nightclub - South Beach,FL
1,3499,0,US,USD,1.0,"<p>Join us for a quick, one-night, community-b...",ruf.org,1293832670,1296288000,1293833000.0,...,"[{'event_id': 786878, 'cost': 35.0, 'availabil...",149,1280942776,3,,US,32.776566,-79.930922,"The Charleston, SC area",SC


### Featurize

In [9]:
# create a featurizer object. The params are the columns and the order in which we wish to save them into a DataFrame:
cols = ['body_length', 
          'channels', 
          'country', 
          'currency', 
          'description', 
          'email_domain', 
          'event_created', 
          'event_end',
          'event_published', 
          'event_start', 
          'fb_published', 
          'has_analytics',
          'has_header', 
          'has_logo', 
          'listed', 
          'name', 
          'name_length', 
          'object_id',
          'org_desc', 
          'org_facebook', 
          'org_name', 
          'org_twitter', 
          'payee_name',
          'payout_type', 
          'previous_payouts', 
          'sale_duration', 
          'show_map',
          'ticket_types', 
          'user_age', 
          'user_created', 
          'user_type',
          'venue_address', 
          'venue_country', 
          'venue_latitude', 
          'venue_longitude',
          'venue_name', 
          'venue_state']
featurizer = Featurizer1(cols=cols)

In [10]:
# fit() in this class is a no-op (it just returns self), so we can call transform() directly
X_featurized = featurizer.transform(X)

In [11]:
# Lets take a peek:
X_featurized.head(2)

Unnamed: 0,body_length,channels,country,currency,fb_published,has_analytics,has_header,has_logo,listed,name_length,...,sale_duration,show_map,user_age,user_type,event_duration,has_payee_name,has_previous_payouts,has_payout_type,has_facebook,has_twitter
0,3852,5,US,USD,0,0,1.0,0,y,60,...,29.0,1,36,1,36000,0,0,0,0,0
1,3499,0,US,USD,0,0,0.0,1,n,27,...,28.0,0,149,3,32400,1,1,1,0,1


### Impute

In [12]:
# Create an imputer object. The params are dictionaries identifying which columns are categorical and which are continuous:
cols_dict = {'body_length':'cont', 
             'channels':'cat', 
             'country':'cat', 
             'currency':'cat', 
             'fb_published':'cat', 
             'has_analytics':'cat', 
             'has_header':'cat', 
             'has_logo':'cat', 
             'listed':'cat',
             'name_length':'cont', 
             'payout_type':'cat', 
             'sale_duration':'cont', 
             'show_map':'cat', 
             'user_age':'cont',
             'user_type':'cat', 
             'event_duration':'cont', 
             'has_payee_name':'cat', 
             'has_previous_payouts':'cat',
             'has_payout_type':'cat', 
             'has_facebook':'cat', 
             'has_twitter':'cat'}
imputer = Imputer1(cols_dict=cols_dict)

In [13]:
# Fit the imputer with the featurized X data
# What is the fit method returning?
imputer.fit(X_featurized)

Imputer1(cols_dict={'body_length': 'cont', 'channels': 'cat', 'country': 'cat', 'currency': 'cat', 'fb_published': 'cat', 'has_analytics': 'cat', 'has_header': 'cat', 'has_logo': 'cat', 'listed': 'cat', 'name_length': 'cont', 'payout_type': 'cat', 'sale_duration': 'cont', 'show_map': 'cat', 'user_age': 'cont', 'user_type': 'cat', 'event_duration': 'cont', 'has_payee_name': 'cat', 'has_previous_payouts': 'cat', 'has_payout_type': 'cat', 'has_facebook': 'cat', 'has_twitter': 'cat'})

In [14]:
# Transform the featurized data
X_imputed = imputer.transform(X_featurized)

In [15]:
# Lets take a peek:
X_imputed.head(2)

Unnamed: 0,body_length,channels,country,currency,fb_published,has_analytics,has_header,has_logo,listed,name_length,...,sale_duration,show_map,user_age,user_type,event_duration,has_payee_name,has_previous_payouts,has_payout_type,has_facebook,has_twitter
0,3852,5,US,USD,0,0,1.0,0,y,60,...,29.0,1,36,1,36000,0,0,0,0,0
1,3499,0,US,USD,0,0,0.0,1,n,27,...,28.0,0,149,3,32400,1,1,1,0,1


### Dummify

In [16]:
# create a dummifier object. The params are the categorical columns:
cols_to_dummy = ['channels', 
                 'country', 
                 'currency', 
                 'fb_published', 
                 'has_analytics', 
                 'has_header', 
                 'has_logo', 
                 'listed',
                 'payout_type', 
                 'show_map', 
                 'user_type', 
                 'has_payee_name', 
                 'has_previous_payouts',
                 'has_payout_type', 
                 'has_facebook', 
                 'has_twitter']
dummifier = Dummifier1(cols_to_dummy=cols_to_dummy)

In [17]:
# fit the dummifier object with the imputed data:
dummifier.fit(X_imputed)

Dummifier1(cols_to_dummy=['channels', 'country', 'currency', 'fb_published', 'has_analytics', 'has_header', 'has_logo', 'listed', 'payout_type', 'show_map', 'user_type', 'has_payee_name', 'has_previous_payouts', 'has_payout_type', 'has_facebook', 'has_twitter'])

In [18]:
# transform the imputed data
X_dummified = dummifier.transform(X_imputed)

In [19]:
# lets take a peek
X_dummified.head(2)

Unnamed: 0,body_length,name_length,sale_duration,user_age,event_duration,channels_5,channels_0,channels_8,channels_6,channels_11,...,user_type_1,user_type_3,user_type_4,user_type_5,user_type_103,has_payee_name_0,has_previous_payouts_0,has_payout_type_0,has_facebook_0,has_twitter_0
0,3852,60,29.0,36,36000,1,0,0,0,0,...,1,0,0,0,0,1,1,1,1,1
1,3499,27,28.0,149,32400,0,1,0,0,0,...,0,1,0,0,0,0,0,0,1,0


### Standardize
Use the X_dummified DataFrame to make your own class object that standardizes the continuous features of the DataFrame.
The continuous columns are: 
- body_length
- name_length 
- sale_duration 
- user_age 
- event_duration

Let's build a class called `Standardizer1` whose only parameter is the list of columns we wish to standardize. The `fit()` method takes in a DataFrame and saves the `mean` of each of those columns in a dictionary.

`self.means = {column: mean}`

The `fit()` method will also assign the `standard deviation` of each column to a dictionary. 

`self.deviations = {column: deviation}`

The `transform()` method will then standardize each column according to the saved mean and standard deviations of each column.

In [24]:
continous_cols = ['body_length', 
                  'name_length', 
                  'sale_duration', 
                  'user_age', 
                  'event_duration']

In [30]:
"""
Instructions:
    The class should inherit traits from BaseEstimator and TransformerMixin (both imported from sklearn.base)
    If applicable, parameters need to be assigned in the __init__() method
    The name of param within the __init__(self, param) needs to match the name self.param exactly
    The fit method must return itself, even if there is no need to fit anything
    The fit( ) method must take in X and y, even though y may not be necessary. Set y=None.
    The transform( ) method take in only X.
    It is best practice to make a copy of X (X = X.copy()) in the tranform to avoid transforming the original dataframe.
"""

class Standardizer1():
    """Standardize continuous columns"""
    # Exercise stub: the method bodies are intentionally empty.
    # The cells that use this class call
    # Standardizer1(continuous_cols=...), .fit(X_dummified) and
    # .transform(X_dummified), so the finished signatures must accept those.
    def __init__(self,):
        # TODO: accept the list of columns to standardize and store it on self
        pass

    def fit(self,):
        # TODO: save each column's mean and standard deviation, then return self
        pass
    
    def transform(self,):
        # TODO: return a standardized copy of the input using the saved values
        pass
    
    

### After finishing the class, use it below

In [None]:
standardizer = Standardizer1(continuous_cols=continous_cols)

In [None]:
standardizer.fit(X_dummified)

In [None]:
X_standardized = standardizer.transform(X_dummified)

Next, run the new data through your crude pipeline:

In [None]:
new_data = pd.read_csv('data/new_data.csv')

In [None]:
new_data_featurized = featurizer.transform(new_data)

In [None]:
new_data_imputed = imputer.transform(new_data_featurized)

In [None]:
new_data_dummified = dummifier.transform(new_data_imputed)

In [None]:
new_data_standardized = standardizer.transform(new_data_dummified)

In [23]:
# answer:

class Standardizer(BaseEstimator, TransformerMixin):
    """Standardize continuous columns to zero mean and unit deviation.

    Parameters
    ----------
    continuous_cols : list of str, optional
        Columns to standardize; all other columns pass through unchanged.
    """

    def __init__(self, continuous_cols=None):
        self.continuous_cols = continuous_cols

    def fit(self, X, y=None):
        """Save the mean and standard deviation of each listed column."""
        df = X
        self.means = {}
        self.deviations = {}
        for col in self.continuous_cols or []:
            self.means[col] = df[col].mean()
            self.deviations[col] = df[col].std()
        return self

    def transform(self, X):
        """Return a copy of X with each listed column set to (x - mean) / std.

        The mean and deviation are the ones saved by ``fit``, so new data is
        scaled exactly like the training data.
        """
        df = X.copy()
        for col in self.continuous_cols or []:
            # A constant training column has deviation 0; divide by 1 instead
            # so the result is the centered value rather than inf/NaN.
            scale = self.deviations[col] or 1.0
            df[col] = (df[col] - self.means[col]) / scale
        return df
    
    