In [1]:
import datetime
import numpy as np
import pandas as pd
import typing as t

In [2]:
# Creating the timestamps for the index.
# The range starts at midnight on the first day of the month that contains
# the date 90 days ago and runs up to the current time in 4-hour steps.
current_date = datetime.datetime.now()
previous_90d_date = current_date - datetime.timedelta(days=90)
start_date = datetime.datetime(year=previous_90d_date.year,
                               month=previous_90d_date.month,
                               day=1)

# Lowercase 'h' is the hour alias; the uppercase 'H' spelling is deprecated
# since pandas 2.2 and emits a FutureWarning there.
index = pd.date_range(start=start_date, end=current_date, freq='4h')

In [3]:
# Draw normally distributed readings (mean 10, std 3): one row per timestamp,
# one column per circuit
circuit_columns = ['Circuit 1', 'Circuit 2']
n_rows = len(index)
samples = np.random.normal(loc=10, scale=3, size=(n_rows, len(circuit_columns)))

# Wrap the readings in a time-indexed dataframe
data = pd.DataFrame(data=samples, index=index, columns=circuit_columns)

# Blank out a random 10% of the rows (without repetition) to mimic gaps
drop_pct = 0.1
drop_index = np.random.choice(n_rows, size=int(n_rows * drop_pct), replace=False)
data.iloc[drop_index] = np.nan

# Preview the first rows of the resulting dataframe
data.head()

Unnamed: 0,Circuit 1,Circuit 2
2022-03-01 00:00:00,8.26321,8.527027
2022-03-01 04:00:00,,
2022-03-01 08:00:00,9.571511,15.055005
2022-03-01 12:00:00,12.140461,9.569622
2022-03-01 16:00:00,10.647762,6.075545


In [4]:
def calculate_monthly_cap_index(df, spec_limits=None):
    """Compute per-month capability statistics for every column of ``df``.

    Rows are grouped by business-month-start (``'BMS'``) and, for each column,
    the following statistics are produced per group:

    * ``mean`` / ``std`` -- pandas group mean and standard deviation,
    * ``ppi``  -- ``(mean - LSL) / std / 3``,
    * ``pps``  -- ``(USL - mean) / std / 3``,
    * ``PPK``  -- the smaller of ``ppi`` and ``pps``.

    Note: ``ppi``/``pps`` use ``np.std`` (default ``ddof=0``), whereas the
    reported ``std`` column is the pandas group std (default ``ddof=1``), so
    the indices cannot be reproduced exactly from the ``mean``/``std`` columns.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a datetime-like index (required by ``pd.Grouper(freq=...)``)
        and one numeric column per circuit.
    spec_limits : dict, optional
        Mapping with the keys ``'LSL'`` and ``'USL'`` applied to every column.
        Defaults to ``{'LSL': 6.0, 'USL': 12.0}``.

    Returns
    -------
    pd.DataFrame
        One row per month; columns are ``(circuit, statistic)`` pairs, with the
        ``(circuit, 'PPK')`` columns appended after all other statistics.
    """
    if spec_limits is None:
        spec_limits = {'LSL': 6.0, 'USL': 12.0}
    lower_limit, upper_limit = spec_limits['LSL'], spec_limits['USL']

    # The function names become the column labels produced by ``agg``,
    # so they must stay ``ppi`` and ``pps``.
    def ppi(x):
        return (np.mean(x) - lower_limit) / np.std(x) / 3

    def pps(x):
        return (upper_limit - np.mean(x)) / np.std(x) / 3

    # Grouping by business month start frequency
    groups = df.groupby(pd.Grouper(freq='BMS'))
    monthly_ppk = groups.agg(['mean', 'std', ppi, pps])

    # ``get_level_values`` repeats each circuit once per statistic; iterate
    # over the unique names so every PPK column is computed exactly once.
    for circ in monthly_ppk.columns.get_level_values(level=0).unique():
        index_columns = [(circ, 'ppi'), (circ, 'pps')]
        monthly_ppk[(circ, 'PPK')] = monthly_ppk[index_columns].min(axis=1)

    return monthly_ppk

In [5]:
# Compute the monthly capability indices for the two-circuit sample frame
monthly_ppk = calculate_monthly_cap_index(data)

# Display the resulting table
monthly_ppk

Unnamed: 0_level_0,Circuit 1,Circuit 1,Circuit 1,Circuit 1,Circuit 2,Circuit 2,Circuit 2,Circuit 2,Circuit 1,Circuit 2
Unnamed: 0_level_1,mean,std,ppi,pps,mean,std,ppi,pps,PPK,PPK
2022-03-01,10.282588,3.356735,0.426592,0.171073,9.851812,2.653365,0.485391,0.270706,0.171073,0.270706
2022-04-01,10.288696,2.885641,0.496897,0.198275,9.818278,2.979286,0.428488,0.244833,0.198275,0.244833
2022-05-02,9.850892,3.070798,0.419269,0.233986,9.57318,3.168519,0.377035,0.256073,0.233986,0.256073
2022-06-01,10.013688,3.128666,0.430257,0.212927,10.461816,2.9448,0.508158,0.175184,0.212927,0.175184


## 1. Creating a class for the production process data

In [6]:
class ProcessData():
    """Process measurements of one plant together with their specification limits.

    Parameters
    ----------
    plant_name : str
        Label of the plant; stored unchanged.
    circuit_names : str or sequence of str
        One circuit name, or several. A single (non-iterable or string) value is
        wrapped into a one-element list; any other iterable is copied to a list.
    specifications_limits : dict
        Mapping ``circuit name -> {'LLS': lower, 'ULS': upper}``. Its keys must
        match ``circuit_names`` exactly.
    data : pd.DataFrame, optional
        Measurements, stored unchanged. When omitted, random sample data is
        generated with one column per circuit.

    Raises
    ------
    OSError
        If a circuit has no specification limits, or if limits are given for a
        circuit that is not listed in ``circuit_names``.
    """

    def __init__(self,
                 plant_name: str,
                 circuit_names: t.Union[str, t.Sequence[str]],
                 specifications_limits: dict,
                 data: t.Optional[pd.DataFrame] = None):
        self.plant_name = plant_name

        self.data = data
        self.specifications_limits = specifications_limits

        # A bare string is itself iterable, so it must be special-cased before
        # accepting general iterables (lists, tuples, ...).
        if isinstance(circuit_names, str) or not hasattr(circuit_names, '__iter__'):
            self.circuit_names = [circuit_names]
        else:
            self.circuit_names = list(circuit_names)

        # Validate before generating data: sample generation reads the limits.
        self._check_for_specifications_limits()

        if data is None:
            self.data = self._create_sample_data()

    def _check_for_specifications_limits(self):
        """Ensure circuit names and specification-limit keys are the same set.

        Raises
        ------
        OSError
            With a single message string naming the offending circuit(s).
        """
        expected = set(self.circuit_names)
        provided = set(self.specifications_limits.keys())

        missing = expected - provided
        extra = provided - expected

        # The message is built as ONE string: OSError given two positional
        # arguments would interpret them as (errno, strerror).
        if missing:
            raise OSError(
                "There are missing values for specification limits for the circuit(s): "
                + ", ".join(sorted(map(str, missing)))
            )
        if extra:
            raise OSError(
                "There are extra values of specification limits for the circuit(s): "
                + ", ".join(sorted(map(str, extra)))
            )

    def _create_sample_data(self) -> pd.DataFrame:
        """Generate random 4-hourly sample measurements for every circuit.

        The index starts on the first day of the month containing the date
        90 days ago and ends at the current time. For each circuit the values
        are drawn from a normal distribution whose mean is the midpoint of the
        circuit's limits plus a random offset in [0, 15) and whose standard
        deviation is random in [0, 15); 10% of the points are then set to NaN.

        Returns
        -------
        pd.DataFrame
            One column per circuit, indexed by timestamp.
        """
        # Creating the timestamps for the index
        current_date = datetime.datetime.now()
        previous_90d_date = current_date - datetime.timedelta(days=90)
        start_date = datetime.datetime(year=previous_90d_date.year,
                                       month=previous_90d_date.month,
                                       day=1)
        # Lowercase 'h': the uppercase hour alias is deprecated since pandas 2.2.
        index = pd.date_range(start=start_date, end=current_date, freq='4h')

        n_points = len(index)
        drop_pct = 0.1
        n_dropped = int(n_points * drop_pct)

        # Collect the columns first and build the frame once, instead of
        # concatenating a growing dataframe inside the loop.
        columns = {}
        for circ in self.circuit_names:
            limits = self.specifications_limits[circ]
            lls, uls = limits['LLS'], limits['ULS']

            # Generating the sample points
            mu = (lls + uls) / 2 + np.random.uniform(0.0, 15.0)
            sigma = np.random.uniform(0.0, 15.0)
            samples = np.random.normal(loc=mu, scale=sigma, size=n_points)

            # Randomly drop some samples
            drop_index = np.random.choice(n_points, size=n_dropped, replace=False)
            samples[drop_index] = np.nan

            columns[circ] = samples

        return pd.DataFrame(columns, index=index)


In [7]:
# Single-circuit example: the circuit name is passed as a plain string (the
# class wraps it into a list) and no `data` is given, so sample data is generated.
data_plant_A = ProcessData(
    plant_name='Plant A',
    circuit_names = 'Circuit 1',
    specifications_limits = {'Circuit 1': {'LLS': 60.0, 'ULS': 70.0}}
)

In [8]:
# Inspect the generated sample data for the single circuit
data_plant_A.data

Unnamed: 0,Circuit 1
2022-03-01 00:00:00,66.738577
2022-03-01 04:00:00,67.118998
2022-03-01 08:00:00,64.533634
2022-03-01 12:00:00,54.723189
2022-03-01 16:00:00,54.392932
...,...
2022-06-15 04:00:00,57.164190
2022-06-15 08:00:00,65.488112
2022-06-15 12:00:00,61.297293
2022-06-15 16:00:00,66.263416


In [9]:
# Multi-circuit example: `data_plant_A` is rebound to a new object with three
# circuits, each with its own lower ('LLS') and upper ('ULS') limit.
data_plant_A = ProcessData(
    plant_name='Plant A',
    circuit_names = ['Circuit 1', 'Circuit 2', 'Circuit 3'],
    specifications_limits = {'Circuit 1': {'LLS': 60.0, 'ULS': 70.0},
                             'Circuit 2': {'LLS': 40.0, 'ULS': 70.0},
                             'Circuit 3': {'LLS': 85.0, 'ULS': 90.0}}
)

In [10]:
# Inspect the generated sample data: one column per circuit
data_plant_A.data

Unnamed: 0,Circuit 1,Circuit 2,Circuit 3
2022-03-01 00:00:00,68.835712,97.103731,101.705497
2022-03-01 04:00:00,77.581664,81.259132,99.218057
2022-03-01 08:00:00,70.332194,52.585725,101.434448
2022-03-01 12:00:00,72.475250,82.247074,99.858709
2022-03-01 16:00:00,57.302515,79.004953,99.238456
...,...,...,...
2022-06-15 04:00:00,65.994770,52.756130,
2022-06-15 08:00:00,73.633695,45.768161,100.977074
2022-06-15 12:00:00,,71.337220,98.719620
2022-06-15 16:00:00,71.741556,74.959901,101.457982


Reformulating the function that calculates the Ppk so that it works with a `ProcessData` object

In [11]:
def calculate_cap_index_ppk(process_data_obj, freq='BMS'):
    """Compute capability statistics per time period for every circuit.

    The object's ``data`` is grouped with ``pd.Grouper(freq=freq)`` and, for
    each name in ``circuit_names``, the following statistics are computed per
    group using that circuit's own limits:

    * ``count`` / ``mean`` / ``std`` -- pandas group aggregations,
    * ``ppi``  -- ``(mean - LLS) / std / 3``,
    * ``pps``  -- ``(ULS - mean) / std / 3``,
    * ``PPK``  -- the smaller of ``ppi`` and ``pps``.

    Note: ``ppi``/``pps`` use ``np.std`` (default ``ddof=0``), whereas the
    reported ``std`` column is the pandas group std (default ``ddof=1``).

    Parameters
    ----------
    process_data_obj
        Object exposing ``data`` (DataFrame with a datetime-like index),
        ``circuit_names`` (iterable of column names) and
        ``specifications_limits`` (``circuit -> {'LLS': ..., 'ULS': ...}``).
    freq : str, default 'BMS'
        Grouping frequency passed to ``pd.Grouper`` (business month start by
        default).

    Returns
    -------
    pd.DataFrame
        One row per period; columns are keyed by ``(circuit, statistic)``
        tuples, circuit by circuit in the order of ``circuit_names``.
    """
    groups = process_data_obj.data.groupby(pd.Grouper(freq=freq))

    def make_index_functions(lower_limit, upper_limit):
        # Bind the limits explicitly for each circuit instead of relying on a
        # loop variable captured by closure. The inner names are kept because
        # ``agg`` uses them as the column labels.
        def ppi(x):
            return (np.mean(x) - lower_limit) / np.std(x) / 3

        def pps(x):
            return (upper_limit - np.mean(x)) / np.std(x) / 3

        return ppi, pps

    # Seed frame carrying the group labels as index; it is kept as the first
    # piece of the concatenation so the output layout stays as before.
    pieces = [pd.DataFrame(index=list(groups.groups.keys()))]

    for circ in process_data_obj.circuit_names:
        spec_limits = process_data_obj.specifications_limits[circ]
        ppi, pps = make_index_functions(spec_limits['LLS'], spec_limits['ULS'])

        circuit_stats = groups[[circ]].agg(['count', 'mean', 'std', ppi, pps])
        circuit_stats[(circ, 'PPK')] = circuit_stats[[(circ, 'ppi'), (circ, 'pps')]].min(axis=1)

        pieces.append(circuit_stats)

    # Single concatenation instead of growing the frame inside the loop
    return pd.concat(pieces, axis=1, ignore_index=False)

In [12]:
# Monthly indices: relies on the function's default freq='BMS'
monthly_ppk = calculate_cap_index_ppk(data_plant_A)

In [13]:
# Display the monthly capability table
monthly_ppk

Unnamed: 0,"(Circuit 1, count)","(Circuit 1, mean)","(Circuit 1, std)","(Circuit 1, ppi)","(Circuit 1, pps)","(Circuit 1, PPK)","(Circuit 2, count)","(Circuit 2, mean)","(Circuit 2, std)","(Circuit 2, ppi)","(Circuit 2, pps)","(Circuit 2, PPK)","(Circuit 3, count)","(Circuit 3, mean)","(Circuit 3, std)","(Circuit 3, ppi)","(Circuit 3, pps)","(Circuit 3, PPK)"
2022-03-01,172,67.996334,5.988911,0.446363,0.111846,0.111846,168,66.12226,13.952004,0.625964,0.092922,0.092922,167,101.747499,2.203207,2.541426,-1.782678,-1.782678
2022-04-01,165,68.512968,6.382107,0.44598,0.077903,0.077903,166,67.891075,14.799015,0.63012,0.047645,0.047645,167,101.431123,2.219575,2.47503,-1.721877,-1.721877
2022-05-02,159,68.152805,5.821597,0.468289,0.106101,0.106101,161,66.065823,15.810482,0.551262,0.083203,0.083203,167,102.050939,2.15908,2.640356,-1.866101,-1.866101
2022-06-01,82,68.644528,6.382761,0.45423,0.071224,0.071224,83,65.397352,15.071277,0.565131,0.102416,0.102416,77,102.13531,2.326112,2.471603,-1.750401,-1.750401


In [14]:
# Same calculation, grouped with freq='D' instead of the default
daily_ppk = calculate_cap_index_ppk(data_plant_A, freq='D')

In [15]:
# Display the daily capability table
daily_ppk

Unnamed: 0,"(Circuit 1, count)","(Circuit 1, mean)","(Circuit 1, std)","(Circuit 1, ppi)","(Circuit 1, pps)","(Circuit 1, PPK)","(Circuit 2, count)","(Circuit 2, mean)","(Circuit 2, std)","(Circuit 2, ppi)","(Circuit 2, pps)","(Circuit 2, PPK)","(Circuit 3, count)","(Circuit 3, mean)","(Circuit 3, std)","(Circuit 3, ppi)","(Circuit 3, pps)","(Circuit 3, PPK)"
2022-03-01,6,70.268173,7.094966,0.528460,-0.013802,-0.013802,6,73.500678,18.822765,0.649890,-0.067911,-0.067911,6,100.793720,1.633097,3.531359,-2.413396,-2.413396
2022-03-02,6,69.031879,4.183001,0.788423,0.084511,0.084511,4,71.685141,6.979331,1.747391,-0.092933,-0.092933,5,100.970516,1.657956,3.589878,-2.465970,-2.465970
2022-03-03,6,65.272462,6.274012,0.306858,0.275143,0.275143,5,65.919325,14.085073,0.685801,0.107971,0.107971,6,103.935258,2.499498,2.766227,-2.035784,-2.035784
2022-03-04,5,65.063560,6.856043,0.275243,0.268333,0.268333,6,69.945450,11.277023,0.969629,0.001766,0.001766,4,100.545460,1.856472,3.223022,-2.186378,-2.186378
2022-03-05,6,65.979093,5.233071,0.417204,0.280567,0.280567,6,63.831562,13.081105,0.665239,0.172187,0.172187,6,102.129234,1.436073,4.355427,-3.084084,-3.084084
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-06-11,5,68.274573,6.564625,0.469753,0.097954,0.097954,6,72.189580,16.268704,0.722490,-0.049145,-0.049145,6,101.542168,2.039297,2.961975,-2.066695,-2.066695
2022-06-12,6,66.338374,4.611986,0.501833,0.289905,0.289905,6,70.666819,16.924528,0.661640,-0.014387,-0.014387,5,101.411042,3.347544,1.827022,-1.270378,-1.270378
2022-06-13,6,67.467397,10.424098,0.261577,0.088715,0.088715,6,73.250267,5.623820,2.158903,-0.211036,-0.211036,4,101.968989,3.750117,1.741644,-1.228459,-1.228459
2022-06-14,6,72.702139,6.934480,0.668856,-0.142286,-0.142286,5,63.389332,12.312744,0.707940,0.200089,0.200089,6,103.184014,0.893599,7.430473,-5.387340,-5.387340


In [21]:
# Scratch check: view of the limit values stored for 'Circuit 1'
data_plant_A.specifications_limits['Circuit 1'].values()

dict_values([60.0, 70.0])

In [22]:
import datetime

In [26]:
# Scratch check: constructing a plain date with keyword arguments
datetime.date(year=2020, month=12, day=1)

datetime.date(2020, 12, 1)