In [1]:
import pyodbc
import pandas as pd
import numpy as np
import joblib
from datetime import datetime, timedelta
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin

pd.options.mode.chained_assignment = None  # default='warn'

# Introduction to Python Class

In [2]:
class Kendaraan:
    """A vehicle that tracks its type, its current owner and the ownership count."""

    def __init__(self, tipe, owner, owner_ke):
        """Store the vehicle type, the current owner and which owner number this is."""
        self.tipe_kendaraan = tipe
        self.owner = owner
        self.owner_ke = owner_ke

    def jual(self, new_owner):
        """Hand the vehicle over to ``new_owner`` and print a one-line sale record.

        The ownership counter ``owner_ke`` goes up by one on every sale.
        """
        previous_owner, self.owner = self.owner, new_owner
        self.owner_ke = self.owner_ke + 1
        message = "{0} dijual dari {1} ke {2}".format(
            self.tipe_kendaraan, previous_owner, new_owner
        )
        print(message)

In [3]:
class Mobil(Kendaraan):
    """A car: a ``Kendaraan`` of type "Mobil" with a plate, maker, brand and position."""

    def __init__(self, merk, pabrik, num_plate, yang_punya, pemilik_ke, initial_position=0):
        """Create a car.

        merk / pabrik    : brand name and manufacturer name (stored as ``brand`` / ``oem``)
        num_plate        : licence plate
        yang_punya       : current owner, forwarded to ``Kendaraan`` as ``owner``
        pemilik_ke       : ownership count, forwarded to ``Kendaraan`` as ``owner_ke``
        initial_position : starting position, 0 unless given
        """
        Kendaraan.__init__(self, "Mobil", yang_punya, pemilik_ke)
        self.brand = merk
        self.oem = pabrik
        self.num_plate = num_plate
        self.position = initial_position

    def maju(self, distance):
        """Move forward by ``distance``, printing the position before and after."""
        print("----- Mobil {0}{1} maju".format(self.oem, self.brand))
        print("---------- initial position: {}".format(self.position))
        self.position = self.position + distance
        print("---------- Now position: {}".format(self.position))

    def mundur(self, distance):
        """Move backward by ``distance``, printing the position before and after."""
        print("----- Mobil {0}{1} mundur".format(self.oem, self.brand))
        print("---------- initial position: {}".format(self.position))
        new_position = self.position - distance
        self.position = new_position
        print("---------- Now position: {}".format(self.position))

In [4]:
# Create two cars; both belong to "Ridho" as first owner and start at the default position 0.
mobil_1 = Mobil(pabrik="Toyota", merk="Avanza", num_plate="B123ABC", yang_punya="Ridho", pemilik_ke=1)
mobil_2 = Mobil(pabrik="Suzuki", merk="APV", num_plate="B5678XYZ", yang_punya="Ridho", pemilik_ke=1)

In [5]:
# Move mobil_1 forward twice; the position is stored on the object, so it accumulates (0 -> 5 -> 15).
mobil_1.maju(5)
mobil_1.maju(10)

----- Mobil ToyotaAvanza maju
---------- initial position: 0
---------- Now position: 5
----- Mobil ToyotaAvanza maju
---------- initial position: 5
---------- Now position: 15


In [6]:
# jual() is inherited from Kendaraan: it swaps the owner, increments owner_ke and prints the sale.
mobil_1.jual(new_owner="Andin")

Mobil dijual dari Ridho ke Andin


# Getting Started with Real Data (Creating a Sklearn Pipeline)

In [7]:
# mark_timestamp() is not defined in any earlier cell, so a fresh kernel would fail
# with NameError here. Define it next to its first use so Restart & Run All works.
def mark_timestamp():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")


# Load the VHMS trend workbook from the local data folder and log when it finished.
vhms_trend_hd785 = pd.read_excel('local_data/vhms_trend_hd785.xlsx')
print("VHMS Trend ",mark_timestamp())

VHMS Trend  2019-09-09 15:06:06


In [9]:
# Columns kept for the rest of this notebook.
my_parameter = ["UNIT_SRL_NUM", "SMR", "ENG_SPEED_MX", "BLOWBY_PRESS_MX", "LF_EXH_TEMP_MX", "RF_EXH_TEMP_MX"]
# Selecting with a list of column names yields a frame holding only those columns.
vhms_trend_hd785_simple = vhms_trend_hd785[my_parameter]
# Preview at most the first 100 rows.
vhms_trend_hd785_simple.head(100)

Unnamed: 0,UNIT_SRL_NUM,SMR,ENG_SPEED_MX,BLOWBY_PRESS_MX,LF_EXH_TEMP_MX,RF_EXH_TEMP_MX
0,10490,322.6,2357,3.69,688,669
1,10490,462.8,2432,3.87,692,673
2,10490,1043.7,2344,4.73,688,671
3,10490,2065.1,2425,4.75,684,667
4,1119,864.6,2573,3.30,723,724
5,1119,964.7,2475,3.51,740,738
6,30002,25.3,2469,2.29,699,662
7,30002,426.3,2380,2.68,727,697
8,30002,686.7,2517,2.71,733,702
9,30002,807.0,2483,2.83,718,692


## Creating Transformer Class

Kita mau bikin suatu pipeline pemrosesan data yang tahapannya adalah,
1. Memastikan tipe-tipe data setiap kolom benar
2. Mengganti data yang anomali dengan nilai batas yang diizinkan, yaitu mean ± 2 standar deviasi (anomali: lebih dari 2 standar deviasi dari mean)
3. Melakukan prediksi

In [124]:
class EnsureDataTypes:
    """Transformer that casts DataFrame columns to the dtypes listed in a schema.

    The schema is a list of ``(column_name, dtype)`` tuples, e.g.
    ``[("UNIT_SRL_NUM", str), ("SMR", int)]``. Columns that are not listed
    are passed through untouched.
    """

    def __init__(self, skema):
        """
        skema: list of tuples [(col_name_1, dtype_1), (col_name_2, dtype_2), ...]
        """
        self.schema = skema

    def fit(self, X, y=None):
        """Nothing to learn; present so the object can be used as a pipeline step."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with every schema column cast to its dtype.

        Raises KeyError if a schema column is missing from ``X``.
        """
        # Work on a copy so the caller's frame is not silently modified.
        converted = X.copy()
        # Use the schema stored on this instance, not a module-level variable
        # that merely happens to share the name.
        for col, tp in self.schema:
            converted[col] = converted[col].astype(tp)
        return converted
    
class AnomalyImputation:
    """Transformer that clamps outliers to the largest allowed deviation from the mean.

    Init arguments
        features    : list of column names to be treated
        z_threshold : number of standard deviations beyond which a value is
                      considered an anomaly (default 2)

    After ``fit`` the per-feature statistics are available as ``self.mean``
    and ``self.std`` (pandas Series indexed by feature name).
    """

    def __init__(self, features, z_threshold=2):
        self.features = features
        self.threshold = z_threshold

    def fit(self, X, y=None):
        """Learn the mean and standard deviation of every configured feature."""
        # Read the feature list from the instance, not from a module-level
        # variable that merely happens to share the name.
        self.mean = X[self.features].mean()
        self.std = X[self.features].std()
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose anomalous values are replaced by the limit.

        A value with z-score above ``threshold`` becomes ``mean + threshold*std``;
        a value with z-score below ``-threshold`` becomes ``mean - threshold*std``.
        Rows whose z-score is NaN (missing value, or a zero/NaN std) are left as is.
        """
        # Work on a copy so the caller's frame is not silently modified.
        clamped = X.copy()
        for f in self.features:
            mean = self.mean[f]
            std = self.std[f]
            z = (clamped[f] - mean) / std
            # Boolean masks address rows by position, so frames with repeated
            # index labels only get the truly anomalous rows replaced.
            clamped.loc[z > self.threshold, f] = mean + self.threshold * std
            clamped.loc[z < -self.threshold, f] = mean - self.threshold * std
        return clamped

class DeriveFeatures():
    """Transformer that runs a list of feature-engineering functions on a DataFrame.

    Init arguments:
        functions: list of callables, applied in order. Each one receives the
                   frame and may either modify it in place or return a new frame.
    """

    def __init__(self, functions):
        self.functions = functions

    def fit(self, X, y=None):
        """Nothing to learn; present so the object can be used as a pipeline step."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` after applying every configured function in order.

        An empty function list returns an unchanged copy.
        """
        # Work on a copy so the caller's frame is not silently modified.
        derived = X.copy()
        for func in self.functions:
            result = func(derived)
            # Keep the returned frame when there is one, so functions that build
            # a new frame (e.g. with DataFrame.assign) are not silently ignored.
            if result is not None:
                derived = result
        return derived

In [201]:
def feature_engineering_function_1(X):
    """Add column FITUR_BARU_1 (SMR times BLOWBY_PRESS_MX) to X in place and return X."""
    smr = X["SMR"]
    blowby = X["BLOWBY_PRESS_MX"]
    X["FITUR_BARU_1"] = smr * blowby
    return X

def feature_engineering_function_2(X):
    """Add column FITUR_BARU_2 (SMR divided by BLOWBY_PRESS_MX) to X in place and return X."""
    smr = X["SMR"]
    blowby = X["BLOWBY_PRESS_MX"]
    X["FITUR_BARU_2"] = smr / blowby
    return X

In [204]:
# Target dtype per column, as (column_name, dtype) pairs consumed by EnsureDataTypes.
# Note that casting SMR to int drops its fractional part.
schema  = [
    ("UNIT_SRL_NUM", str),
    ("SMR", int),
    ("ENG_SPEED_MX", float),
    ("BLOWBY_PRESS_MX", np.double)
]
# Columns handed to AnomalyImputation. FITUR_BARU_1 and FITUR_BARU_2 do not exist in the
# raw data; they are created by the feature-engineering step, which therefore has to run
# before the anomaly step.
features = ["ENG_SPEED_MX", "BLOWBY_PRESS_MX", "LF_EXH_TEMP_MX", "RF_EXH_TEMP_MX", "FITUR_BARU_1", "FITUR_BARU_2"]

# Steps run in the order listed; each step receives the output of the previous one.
data_preprocessing_pipeline = Pipeline([
    # first step is to ensure data type as defined in schema
    ("ensure-data-type", EnsureDataTypes(schema)),
    # second step is to do feature engineering. Functions to be applied is defined above
    ("feature-engineering", DeriveFeatures([feature_engineering_function_1, feature_engineering_function_2])),
    # Last step is to replace anomaly data
    ("anomaly-imputator", AnomalyImputation(features=features))
])

In [205]:
# Fit all steps in order on the simplified VHMS frame.
# NOTE(review): the scoring section loads 'belajar_data_preprocessing_pipeline.pkl', but no
# cell shown here saves the fitted pipeline to that file - confirm where it is written.
data_preprocessing_pipeline.fit(vhms_trend_hd785_simple)

Pipeline(memory=None,
         steps=[('ensure-data-type',
                 <__main__.EnsureDataTypes object at 0x000001B45BD0AE48>),
                ('feature-engineering',
                 <__main__.DeriveFeatures object at 0x000001B45BD0A8D0>),
                ('anomaly-imputator',
                 <__main__.AnomalyImputation object at 0x000001B45BD0AAC8>)],
         verbose=False)

# Scoring Script

In [207]:
# Take the first five rows of the selected columns as a stand-in for new data to score.
data_baru = vhms_trend_hd785[my_parameter].head()
# Load a previously saved pipeline. joblib.load unpickles the file and can execute arbitrary
# code, so only load files from a trusted source.
# NOTE(review): the recorded output below has a single FITUR_BARU column, while the pipeline
# defined in this notebook produces FITUR_BARU_1 and FITUR_BARU_2 - the pickle looks like it
# was saved from an older pipeline version; confirm and re-save.
my_pipeline = joblib.load('belajar_data_preprocessing_pipeline.pkl')
my_pipeline.transform(data_baru)

Unnamed: 0,UNIT_SRL_NUM,SMR,ENG_SPEED_MX,BLOWBY_PRESS_MX,LF_EXH_TEMP_MX,RF_EXH_TEMP_MX,FITUR_BARU
0,10490,322,2357.0,3.69,688.0,669.0,1188.18
1,10490,462,2432.0,3.87,692.0,673.0,1787.94
2,10490,1043,2344.0,4.73,688.0,671.0,4933.39
3,10490,2065,2425.0,4.75,684.0,667.0,9808.75
4,1119,864,2573.0,3.3,723.0,724.0,2851.2


In [208]:
class PapOilDataCleanser():
    """Transformer that filters oil-sample rows for one unit model and component.

    Init arguments:
        unit_model : value that MODL_NUM must equal
        component  : value that COMPONENT must equal
        features   : sequence of extra column names to keep next to the
                     identifying columns
    """

    def __init__(self, unit_model, component, features):
        self.component = component
        self.unit_model = unit_model
        self.features = features

    def fit(self, X, y=None):
        """Nothing to learn; present so the object can be used as a pipeline step."""
        return self

    def transform(self, X, y=None):
        """Return the matching rows with SRL_NUM and SAMPL_DT cast to str.

        Rows are kept when MODL_NUM and COMPONENT match the configured values and
        HRS_KM_OC is positive. The result holds the identifying columns followed
        by the configured feature columns; the input frame is not modified.
        """
        keep = (
            (X['MODL_NUM'] == self.unit_model)
            & (X['COMPONENT'] == self.component)
            & (X['HRS_KM_OC'] > 0)
        )
        # Explicit copy: the columns assigned below then belong to an independent
        # frame, so this does not rely on chained-assignment warnings being disabled.
        selected = X.loc[keep].copy()
        selected['SRL_NUM'] = selected['SRL_NUM'].astype(str)
        selected['SAMPL_DT'] = selected['SAMPL_DT'].astype(str)
        base_columns = ['LAB_NUM', 'SRL_NUM', 'MODL_NUM', 'COMPONENT', 'HRS_KM_OC', 'HRS_KM_TOT', 'SAMPL_DT']
        # list() lets features be any sequence (tuple, Index, ...), not only a list.
        return selected[base_columns + list(self.features)]