In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import warnings
import numpy as np
import math

In [2]:
# Render matplotlib figures inside the notebook output.
%matplotlib inline

In [3]:
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation/FutureWarnings that may
# point at real problems in later cells — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [4]:
# Directory prefix (with trailing slash) that the input files are read from.
path = './data/'

In [5]:
# Candidate encodings, tried in this order when reading a CSV file.
encodings = ['cp949', 'euc-kr', 'utf-8']

def get_df(name):
    """Read the CSV file ``path + name``, trying each entry of ``encodings``.

    The first encoding that decodes the file without a ``UnicodeDecodeError``
    wins and its DataFrame is returned.

    Args:
        name: File name relative to the module-level ``path`` prefix.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        ValueError: If none of the candidate encodings can decode the file.
            (Previously this case fell through to ``return df`` with ``df``
            unbound and surfaced as an unrelated UnboundLocalError.)
    """
    file_path = path + f'{name}'
    last_error = None
    for encoding in encodings:
        try:
            return pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError as exc:
            # Remember the failure and fall through to the next candidate.
            last_error = exc
    raise ValueError(
        f'Could not decode {file_path!r} with any of the encodings {encodings}'
    ) from last_error

## 데이터 확인

In [6]:
# Load the time-stamped process log (one row per TAG_MIN timestamp).
data = get_df('data.csv')

In [7]:
# Preview the first rows; column names are still the original Korean headers here.
data.head()

Unnamed: 0,TAG_MIN,배정번호,건조 1존 OP,건조 2존 OP,건조로 온도 1 Zone,건조로 온도 2 Zone,세정기,소입1존 OP,소입2존 OP,소입3존 OP,...,소입로 CP 값,소입로 CP 모니터 값,소입로 온도 1 Zone,소입로 온도 2 Zone,소입로 온도 3 Zone,소입로 온도 4 Zone,솔트 컨베이어 온도 1 Zone,솔트 컨베이어 온도 2 Zone,솔트조 온도 1 Zone,솔트조 온도 2 Zone
0,2022-01-03 11:22:07,102410,75.6648,30.0155,,,68.4386,72.8403,59.7862,51.7169,...,0.450421,,,859.854,,,,,,
1,2022-01-03 11:22:08,102410,75.6706,32.2732,,,68.4386,78.4415,61.6286,50.4453,...,0.450356,1.14626e-10,860.338,859.78,860.044,859.786,294.658,272.538,328.734,328.865
2,2022-01-03 11:22:09,102410,75.6776,32.1592,98.8533,99.146,68.4386,78.1099,61.5414,52.0196,...,0.450341,1.1452e-10,860.338,859.78,859.981,859.724,294.658,272.538,328.734,328.805
3,2022-01-03 11:22:11,102410,75.8656,30.8312,98.7918,99.17675,68.4999,77.50725,60.6663,52.69425,...,0.450201,1.14467e-10,860.338,859.842,859.95,859.599,294.719,272.538,328.674,328.865
4,2022-01-03 11:22:12,102410,73.6468,29.5274,98.7918,99.2075,68.4386,76.0262,61.1634,51.6915,...,0.450235,1.14536e-10,860.351,859.791,859.991,859.731,294.721,272.599,328.74,328.808


In [8]:
# Korean header -> short English code for the process-log columns.
# Suffix convention used by the codes: _OP = "OP" columns, _TEMP = temperature
# columns, Z<n> = zone number.
column_names_1 = {
    '배정번호': 'AN',
    # drying section
    '건조 1존 OP': 'DZ1_OP',
    '건조 2존 OP': 'DZ2_OP',
    '건조로 온도 1 Zone': 'DZ1_TEMP',
    '건조로 온도 2 Zone': 'DZ2_TEMP',
    # washer
    '세정기': 'CLEAN',
    # quenching section
    '소입1존 OP': 'HDZ1_OP',
    '소입2존 OP': 'HDZ2_OP',
    '소입3존 OP': 'HDZ3_OP',
    '소입4존 OP': 'HDZ4_OP',
    '소입로 CP 값': 'HDZ_CP',
    '소입로 CP 모니터 값': 'HDZ_CPM',
    '소입로 온도 1 Zone': 'HDZ1_TEMP',
    '소입로 온도 2 Zone': 'HDZ2_TEMP',
    '소입로 온도 3 Zone': 'HDZ3_TEMP',
    '소입로 온도 4 Zone': 'HDZ4_TEMP',
    # salt conveyor and salt bath
    '솔트 컨베이어 온도 1 Zone': 'SCZ1_TEMP',
    '솔트 컨베이어 온도 2 Zone': 'SCZ2_TEMP',
    '솔트조 온도 1 Zone': 'STZ1_TEMP',
    '솔트조 온도 2 Zone': 'STZ2_TEMP',
}

# Rename in place; headers missing from the mapping are left unchanged.
data.rename(columns=column_names_1, inplace=True)

In [9]:
# Summary statistics per numeric column; the `count` row also exposes columns with missing values.
data.describe()

Unnamed: 0,AN,DZ1_OP,DZ2_OP,DZ1_TEMP,DZ2_TEMP,CLEAN,HDZ1_OP,HDZ2_OP,HDZ3_OP,HDZ4_OP,HDZ_CP,HDZ_CPM,HDZ1_TEMP,HDZ2_TEMP,HDZ3_TEMP,HDZ4_TEMP,SCZ1_TEMP,SCZ2_TEMP,STZ1_TEMP,STZ2_TEMP
count,2939722.0,2939721.0,2939721.0,2939606.0,2939574.0,2939631.0,2935434.0,2939722.0,2939720.0,2939719.0,2939721.0,2939575.0,2939592.0,2939594.0,2939565.0,2939552.0,2939616.0,2939580.0,2939513.0,2939519.0
mean,128442.2,69.89404,20.44708,100.0061,100.0198,67.71864,75.64373,54.86239,53.86029,71.08925,0.4488618,1.14558e-10,859.2077,860.0021,860.0029,860.0062,283.9963,279.9293,331.8062,332.1773
std,12637.04,4.014802,5.217123,0.4360371,0.3623526,1.630768,25.16083,4.429079,2.664304,2.556959,0.01886477,6.012797e-13,3.647667,0.557848,0.3518205,0.4551778,9.51277,6.611579,0.7827379,0.8732977
min,102410.0,47.2532,0.000118883,97.3421,97.8706,60.6244,0.000850055,8.62001,0.0437045,0.0062442,0.00509637,1.11662e-10,840.298,855.929,858.28,857.992,266.23,266.426,328.161,328.073
25%,119448.0,68.4288,18.9176,99.8144,99.8901,66.5694,64.9627,53.3259,52.3891,69.6781,0.4484415,1.142615e-10,857.949,859.776,859.829,859.843,274.754,273.502,331.867,332.178
50%,129889.0,70.5166,21.2931,100.002,100.019,67.6972,82.2104,55.6654,53.8862,71.0454,0.450062,1.14555e-10,859.575,860.022,860.002,860.0,284.586,280.02,332.017,332.423
75%,139116.0,72.3781,23.3884,100.191,100.161,68.9799,95.3666,57.5733,55.4145,72.4771,0.451707,1.14844e-10,860.258,860.249,860.172,860.158,293.343,286.334,332.141,332.626
max,148069.0,87.2995,47.5395,102.469,101.843,71.4901,100.0,77.2709,66.015,87.3907,0.909111,1.32929e-10,877.228,866.034,870.119,882.148,298.53,291.696,332.717,333.179


In [10]:
# Load the quality results workbook (read with pandas' default Excel settings).
quality = pd.read_excel(path+'quality.xlsx')

In [11]:
# Preview the quality table before its columns are renamed.
quality.head()

Unnamed: 0,배정번호,작업일,공정명,설비명,양품수량,불량수량,총수량
0,102410,2022-01-03,열처리,열처리 염욕_1,15160,3,15163
1,102585,2022-01-03,열처리,열처리 염욕_1,29892,10,29902
2,102930,2022-01-04,열처리,열처리 염욕_1,59616,30,59646
3,103142,2022-01-05,열처리,열처리 염욕_1,74730,13,74743
4,103675,2022-01-06,열처리,열처리 염욕_1,14979,2,14981


In [12]:
# Inspect dtypes and non-null counts of the quality table (original headers).
quality.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136 entries, 0 to 135
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   배정번호    136 non-null    int64         
 1   작업일     136 non-null    datetime64[ns]
 2   공정명     136 non-null    object        
 3   설비명     136 non-null    object        
 4   양품수량    136 non-null    int64         
 5   불량수량    136 non-null    int64         
 6   총수량     136 non-null    int64         
dtypes: datetime64[ns](1), int64(4), object(2)
memory usage: 7.6+ KB


In [13]:
# Korean header -> short English code for the quality table.
column_names_2 = {
    '배정번호': 'AN',
    '작업일': 'WD',
    '공정명': 'PN',
    '설비명': 'EN',
    '양품수량': 'GQ',
    '불량수량': 'BQ',
    '총수량': 'TQ',
}

# Rename in place so `quality` shares the 'AN' key name with `data`.
quality.rename(columns=column_names_2, inplace=True)

In [14]:
# Confirm the rename took effect (columns should now be AN, WD, PN, EN, GQ, BQ, TQ).
quality.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136 entries, 0 to 135
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   AN      136 non-null    int64         
 1   WD      136 non-null    datetime64[ns]
 2   PN      136 non-null    object        
 3   EN      136 non-null    object        
 4   GQ      136 non-null    int64         
 5   BQ      136 non-null    int64         
 6   TQ      136 non-null    int64         
dtypes: datetime64[ns](1), int64(4), object(2)
memory usage: 7.6+ KB


In [15]:
# Summary statistics of the numeric quality columns (AN, GQ, BQ, TQ).
quality.describe()

Unnamed: 0,AN,GQ,BQ,TQ
count,136.0,136.0,136.0,136.0
mean,128897.191176,45012.301471,15.117647,45027.419118
std,12403.393924,25548.197992,18.549657,25554.54871
min,102410.0,8412.0,0.0,8414.0
25%,120467.75,29736.0,4.0,29755.0
50%,130199.0,44003.0,8.5,44020.5
75%,138982.5,60120.75,17.0,60135.5
max,148069.0,104740.0,120.0,104761.0


In [16]:
# Count missing values per column of the quality table.
quality.isnull().sum()

AN    0
WD    0
PN    0
EN    0
GQ    0
BQ    0
TQ    0
dtype: int64

In [17]:
# Load the pre-aggregated training table (per-row *_Avg / *_Std features plus a label column).
train = get_df('train.csv')

In [18]:
# Preview the training table before its columns are renamed.
train.head()

Unnamed: 0.1,Unnamed: 0,건조 1존 OP_Avg,건조 1존 OP_Std,건조 2존 OP_Avg,건조 2존 OP_Std,건조로 온도 1 Zone_Avg,건조로 온도 1 Zone_Std,건조로 온도 2 Zone_Avg,건조로 온도 2 Zone_Std,세정기_Avg,...,소입로 온도 4 Zone_Std,솔트 컨베이어 온도 1 Zone_Avg,솔트 컨베이어 온도 1 Zone_Std,솔트 컨베이어 온도 2 Zone_Avg,솔트 컨베이어 온도 2 Zone_Std,솔트조 온도 1 Zone_Avg,솔트조 온도 1 Zone_Std,솔트조 온도 2 Zone_Avg,솔트조 온도 2 Zone_Std,불량단계
0,97,69.497726,3.274577,20.310463,3.490991,99.999143,0.435237,100.001123,0.401438,67.864965,...,0.304168,284.699659,9.60185,280.411936,6.940009,332.111266,0.152253,332.712474,0.153026,안정
1,125,68.7767,3.548587,16.547672,4.161717,100.07776,0.394062,100.107134,0.291589,69.61422,...,0.3037,285.00715,9.239152,280.646734,6.650701,332.123215,0.225985,332.093658,0.329912,위험
2,11,73.502913,2.645737,21.218347,2.218216,100.006615,0.387797,99.992686,0.281373,66.220995,...,0.40205,283.120448,9.426413,279.110908,6.064772,332.182887,0.099322,332.407261,0.10744,안정
3,129,68.062513,3.439085,4.366498,3.727635,100.040387,0.418439,100.139576,0.334492,69.242707,...,0.360325,285.074759,9.475964,280.790056,6.699134,332.277923,0.121808,332.261568,0.184196,안정
4,48,68.820299,3.946638,19.902113,3.765778,100.07193,0.367971,100.097453,0.291788,65.512487,...,0.211795,284.166005,9.847216,279.587268,6.606966,331.943223,0.189118,332.503069,0.359428,안정


In [19]:
# Inspect dtypes and non-null counts of the training table (original headers).
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 38 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             108 non-null    int64  
 1   건조 1존 OP_Avg           108 non-null    float64
 2   건조 1존 OP_Std           108 non-null    float64
 3   건조 2존 OP_Avg           108 non-null    float64
 4   건조 2존 OP_Std           108 non-null    float64
 5   건조로 온도 1 Zone_Avg      108 non-null    float64
 6   건조로 온도 1 Zone_Std      108 non-null    float64
 7   건조로 온도 2 Zone_Avg      108 non-null    float64
 8   건조로 온도 2 Zone_Std      108 non-null    float64
 9   세정기_Avg                108 non-null    float64
 10  세정기_Std                108 non-null    float64
 11  소입1존 OP_Avg            108 non-null    float64
 12  소입1존 OP_Std            108 non-null    float64
 13  소입2존 OP_Avg            108 non-null    float64
 14  소입2존 OP_Std            108 non-null    float64
 15  소입3존 O

In [20]:
# Korean base header -> short English code for the training table.
# NOTE(review): the 'HDZ_CPM' key below ends with a trailing space, unlike the
# corresponding key in column_names_1 — presumably it mirrors the train.csv
# header exactly; confirm against the file.
column_names_3 = {
    'Unnamed: 0' : 'UNNAMED',
    '건조 1존 OP': 'DZ1_OP',
    '건조 2존 OP': 'DZ2_OP',
    '건조로 온도 1 Zone': 'DZ1_TEMP',
    '건조로 온도 2 Zone': 'DZ2_TEMP',
    '세정기': 'CLEAN',
    '소입1존 OP': 'HDZ1_OP',
    '소입2존 OP': 'HDZ2_OP',
    '소입3존 OP': 'HDZ3_OP',
    '소입4존 OP': 'HDZ4_OP',
    '소입로 CP 값': 'HDZ_CP',
    '소입로 CP 모니터 값 ': 'HDZ_CPM',
    '소입로 온도 1 Zone': 'HDZ1_TEMP',
    '소입로 온도 2 Zone': 'HDZ2_TEMP',
    '소입로 온도 3 Zone': 'HDZ3_TEMP',
    '소입로 온도 4 Zone': 'HDZ4_TEMP',
    '솔트 컨베이어 온도 1 Zone': 'SCZ1_TEMP',
    '솔트 컨베이어 온도 2 Zone': 'SCZ2_TEMP',
    '솔트조 온도 1 Zone': 'STZ1_TEMP',
    '솔트조 온도 2 Zone': 'STZ2_TEMP',
    '불량단계' : 'FS',
}

# For every base name also map its '_Avg' and '_Std' variants, so both plain
# columns (e.g. the label) and aggregated feature columns are renamed.
column_names_combined = {
    f'{korean}{suffix}': f'{code}{suffix}'
    for korean, code in column_names_3.items()
    for suffix in ('', '_Avg', '_Std')
}

train.rename(columns=column_names_combined, inplace=True)

In [21]:
# Confirm the training columns now use the short English codes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 38 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   UNNAMED        108 non-null    int64  
 1   DZ1_OP_Avg     108 non-null    float64
 2   DZ1_OP_Std     108 non-null    float64
 3   DZ2_OP_Avg     108 non-null    float64
 4   DZ2_OP_Std     108 non-null    float64
 5   DZ1_TEMP_Avg   108 non-null    float64
 6   DZ1_TEMP_Std   108 non-null    float64
 7   DZ2_TEMP_Avg   108 non-null    float64
 8   DZ2_TEMP_Std   108 non-null    float64
 9   CLEAN_Avg      108 non-null    float64
 10  CLEAN_Std      108 non-null    float64
 11  HDZ1_OP_Avg    108 non-null    float64
 12  HDZ1_OP_Std    108 non-null    float64
 13  HDZ2_OP_Avg    108 non-null    float64
 14  HDZ2_OP_Std    108 non-null    float64
 15  HDZ3_OP_Avg    108 non-null    float64
 16  HDZ3_OP_Std    108 non-null    float64
 17  HDZ4_OP_Avg    108 non-null    float64
 18  HDZ4_OP_St

## 데이터 전처리

### 날짜 변환

In [22]:
# Parse TAG_MIN into datetime64 so the `.dt` accessor can be used on it.
data['TAG_MIN'] = pd.to_datetime(data['TAG_MIN'])

In [23]:
# Split TAG_MIN into one column per calendar/clock component
# (Year, Month, Day, Hour, Minute, Second — in that order).
for component in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    data[component.capitalize()] = getattr(data['TAG_MIN'].dt, component)

### 결측치 처리

In [24]:
# Count missing values per column before imputation.
data.isnull().sum()

TAG_MIN         0
AN              0
DZ1_OP          1
DZ2_OP          1
DZ1_TEMP      116
DZ2_TEMP      148
CLEAN          91
HDZ1_OP      4288
HDZ2_OP         0
HDZ3_OP         2
HDZ4_OP         3
HDZ_CP          1
HDZ_CPM       147
HDZ1_TEMP     130
HDZ2_TEMP     128
HDZ3_TEMP     157
HDZ4_TEMP     170
SCZ1_TEMP     106
SCZ2_TEMP     142
STZ1_TEMP     209
STZ2_TEMP     203
Year            0
Month           0
Day             0
Hour            0
Minute          0
Second          0
dtype: int64

In [25]:
# Replace each missing value with the mean of its column, computed over the
# whole frame (all AN groups together, not per group or by neighbouring rows).
# NOTE(review): data.mean() is evaluated over every column, including TAG_MIN
# and the derived date parts; only columns that actually contain NaN are affected.
data = data.fillna(data.mean())

In [26]:
# Verify that no missing values remain after the mean fill.
data.isnull().sum()

TAG_MIN      0
AN           0
DZ1_OP       0
DZ2_OP       0
DZ1_TEMP     0
DZ2_TEMP     0
CLEAN        0
HDZ1_OP      0
HDZ2_OP      0
HDZ3_OP      0
HDZ4_OP      0
HDZ_CP       0
HDZ_CPM      0
HDZ1_TEMP    0
HDZ2_TEMP    0
HDZ3_TEMP    0
HDZ4_TEMP    0
SCZ1_TEMP    0
SCZ2_TEMP    0
STZ1_TEMP    0
STZ2_TEMP    0
Year         0
Month        0
Day          0
Hour         0
Minute       0
Second       0
dtype: int64

## 데이터 병합

In [27]:
# Columns that must not be aggregated: the grouping key, the raw timestamp and
# the calendar/clock parts derived from it.
# 'TAG_MIN' is excluded explicitly: it is a datetime column, and whether a
# groupby mean/std silently drops it, raises, or emits extra TAG_MIN statistics
# depends on the pandas version. Only the measurement columns are wanted here.
columns_to_exclude = ['TAG_MIN', 'AN', 'Year', 'Month', 'Day', 'Hour', 'Minute', 'Second']
included_columns = [col for col in data.columns if col not in columns_to_exclude]

# Per-AN mean and standard deviation of every measurement column; the result
# has a two-level column index (measurement, statistic).
data_stat = data.groupby('AN')[included_columns].agg(['mean', 'std'])
data_stat

Unnamed: 0_level_0,DZ1_OP,DZ1_OP,DZ2_OP,DZ2_OP,DZ1_TEMP,DZ1_TEMP,DZ2_TEMP,DZ2_TEMP,CLEAN,CLEAN,...,HDZ4_TEMP,HDZ4_TEMP,SCZ1_TEMP,SCZ1_TEMP,SCZ2_TEMP,SCZ2_TEMP,STZ1_TEMP,STZ1_TEMP,STZ2_TEMP,STZ2_TEMP
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,...,mean,std,mean,std,mean,std,mean,std,mean,std
AN,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
102410,72.252727,3.696537,21.354464,4.348928,99.943477,0.593891,100.061884,0.483452,69.602560,0.845406,...,860.010591,0.553425,282.581496,9.371121,280.148983,6.033861,329.016320,0.126985,329.070890,0.122030
102585,72.235643,3.365000,18.602563,2.859741,99.987431,0.515429,100.065032,0.356142,69.591183,1.064200,...,859.991765,0.480499,282.788156,9.499565,279.772316,7.161542,328.998615,0.101219,328.924151,0.089118
102930,70.720207,3.231776,20.911928,2.582097,99.995593,0.472719,100.021641,0.343024,69.529495,1.097923,...,860.007487,0.418431,283.330852,9.680439,279.308958,6.665152,329.133610,0.121207,329.148768,0.117028
103142,72.424229,2.635245,22.250186,2.402781,100.005056,0.331393,100.009675,0.251783,69.536860,1.064329,...,860.003481,0.296744,282.882341,9.495495,279.241101,6.537439,329.082055,0.100028,329.073201,0.101987
103675,72.774648,4.159221,21.865151,3.622806,99.983502,0.655347,100.043710,0.470749,69.320977,0.991705,...,860.007201,0.571169,283.581648,9.705562,277.544769,5.365901,329.010867,0.096689,329.114543,0.087897
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147292,69.486127,3.123360,4.376536,2.514362,100.019260,0.381383,100.079487,0.273451,69.639903,0.764689,...,859.994795,0.328179,284.629458,9.221712,280.419441,6.696058,332.184151,0.110639,332.210098,0.129176
147546,69.718808,2.662344,17.836625,3.098225,100.024207,0.314547,100.005331,0.259923,70.128396,0.499345,...,859.998661,0.206355,284.236512,9.317357,279.987623,6.612269,332.155742,0.173038,332.162466,0.260713
147982,69.799029,3.164459,17.913929,3.446134,100.028487,0.347958,100.026178,0.323286,69.695840,1.139830,...,860.002396,0.313149,284.190848,9.223516,279.977799,6.537935,332.209259,0.096274,332.125707,0.118017
147996,69.991809,3.564122,16.868628,5.317683,99.990732,0.381893,100.002368,0.378060,69.460694,1.019985,...,859.991821,0.343028,284.781458,9.270749,280.511153,6.783697,332.150655,0.158543,332.088618,0.237818


In [28]:
# Statistic name -> suffix used in the flattened column names.
chg_name = {'mean': '_Avg', 'std': '_Std'}

# Collapse the (measurement, statistic) column pairs into single names such as
# 'DZ1_OP_Avg', then turn the 'AN' index back into a regular column.
data_stat.columns = [f'{measure}{chg_name[stat]}' for measure, stat in data_stat.columns]
data_stat.reset_index(inplace=True)

In [29]:
# Check the flattened per-AN statistics table (AN plus one _Avg/_Std pair per measurement).
data_stat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136 entries, 0 to 135
Data columns (total 39 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   AN             136 non-null    int64  
 1   DZ1_OP_Avg     136 non-null    float64
 2   DZ1_OP_Std     136 non-null    float64
 3   DZ2_OP_Avg     136 non-null    float64
 4   DZ2_OP_Std     136 non-null    float64
 5   DZ1_TEMP_Avg   136 non-null    float64
 6   DZ1_TEMP_Std   136 non-null    float64
 7   DZ2_TEMP_Avg   136 non-null    float64
 8   DZ2_TEMP_Std   136 non-null    float64
 9   CLEAN_Avg      136 non-null    float64
 10  CLEAN_Std      136 non-null    float64
 11  HDZ1_OP_Avg    136 non-null    float64
 12  HDZ1_OP_Std    136 non-null    float64
 13  HDZ2_OP_Avg    136 non-null    float64
 14  HDZ2_OP_Std    136 non-null    float64
 15  HDZ3_OP_Avg    136 non-null    float64
 16  HDZ3_OP_Std    136 non-null    float64
 17  HDZ4_OP_Avg    136 non-null    float64
 18  HDZ4_OP_St

In [30]:
# Attach the per-AN statistics to each quality record. The left join keeps
# every row of `quality`; an AN without statistics would get NaN features.
df_total = pd.merge(quality, data_stat, on='AN', how='left')
df_total.head()

Unnamed: 0,AN,WD,PN,EN,GQ,BQ,TQ,DZ1_OP_Avg,DZ1_OP_Std,DZ2_OP_Avg,...,HDZ4_TEMP_Avg,HDZ4_TEMP_Std,SCZ1_TEMP_Avg,SCZ1_TEMP_Std,SCZ2_TEMP_Avg,SCZ2_TEMP_Std,STZ1_TEMP_Avg,STZ1_TEMP_Std,STZ2_TEMP_Avg,STZ2_TEMP_Std
0,102410,2022-01-03,열처리,열처리 염욕_1,15160,3,15163,72.252727,3.696537,21.354464,...,860.010591,0.553425,282.581496,9.371121,280.148983,6.033861,329.01632,0.126985,329.07089,0.12203
1,102585,2022-01-03,열처리,열처리 염욕_1,29892,10,29902,72.235643,3.365,18.602563,...,859.991765,0.480499,282.788156,9.499565,279.772316,7.161542,328.998615,0.101219,328.924151,0.089118
2,102930,2022-01-04,열처리,열처리 염욕_1,59616,30,59646,70.720207,3.231776,20.911928,...,860.007487,0.418431,283.330852,9.680439,279.308958,6.665152,329.13361,0.121207,329.148768,0.117028
3,103142,2022-01-05,열처리,열처리 염욕_1,74730,13,74743,72.424229,2.635245,22.250186,...,860.003481,0.296744,282.882341,9.495495,279.241101,6.537439,329.082055,0.100028,329.073201,0.101987
4,103675,2022-01-06,열처리,열처리 염욕_1,14979,2,14981,72.774648,4.159221,21.865151,...,860.007201,0.571169,283.581648,9.705562,277.544769,5.365901,329.010867,0.096689,329.114543,0.087897


## 시계열 포함된 품질보증 데이터

In [78]:
# Defect rate in percent (BQ / TQ * 100), rounded to three decimals.
df_total['BQ Rate'] = df_total['BQ'].div(df_total['TQ']).mul(100).round(3)

# Binary label: 0 when the rounded rate is below 0.046 (percent), otherwise 1.
# NOTE(review): the 0.046 cut-off is a magic number whose rationale is not
# shown in this notebook — confirm and consider naming it as a constant.
df_total['DS'] = np.where(df_total['BQ Rate'].lt(0.046), 0, 1)

In [79]:
# Remove identifiers, raw counts and the intermediate rate; 'BQ Rate' must go
# because the DS label was derived directly from it.
df_total.drop(columns=['AN', 'PN', 'EN', 'GQ', 'BQ', 'TQ', 'BQ Rate'], inplace=True)

In [80]:
# Expand the work date WD into Year, Month and Day columns (in that order).
for component in ('year', 'month', 'day'):
    df_total[component.capitalize()] = getattr(df_total['WD'].dt, component)

In [81]:
# The original datetime column is no longer needed once it has been split into parts.
df_total.drop(columns=['WD'], inplace=True)

In [82]:
# Inspect the final modelling table: aggregated features, the DS label and the date parts.
df_total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136 entries, 0 to 135
Data columns (total 42 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   DZ1_OP_Avg     136 non-null    float64
 1   DZ1_OP_Std     136 non-null    float64
 2   DZ2_OP_Avg     136 non-null    float64
 3   DZ2_OP_Std     136 non-null    float64
 4   DZ1_TEMP_Avg   136 non-null    float64
 5   DZ1_TEMP_Std   136 non-null    float64
 6   DZ2_TEMP_Avg   136 non-null    float64
 7   DZ2_TEMP_Std   136 non-null    float64
 8   CLEAN_Avg      136 non-null    float64
 9   CLEAN_Std      136 non-null    float64
 10  HDZ1_OP_Avg    136 non-null    float64
 11  HDZ1_OP_Std    136 non-null    float64
 12  HDZ2_OP_Avg    136 non-null    float64
 13  HDZ2_OP_Std    136 non-null    float64
 14  HDZ3_OP_Avg    136 non-null    float64
 15  HDZ3_OP_Std    136 non-null    float64
 16  HDZ4_OP_Avg    136 non-null    float64
 17  HDZ4_OP_Std    136 non-null    float64
 18  HDZ_CP_Avg

In [83]:
# Class balance of the binary DS label.
df_total['DS'].value_counts()

0    101
1     35
Name: DS, dtype: int64

In [84]:
# Persist the modelling table without the index.
# Note: this writes to the current working directory, not to `path`.
df_total.to_csv('df_total.csv', index=False)

## 품질보증 원인설비 파악

- 함수에 들어가는 데이터는 1분 단위 구간(아래 호출처럼 60행씩 자른 구간)의 데이터로 가정

In [54]:
# Work on a copy so the column drops below do not alter `data`.
df1=data.copy()

In [57]:
# Keep only the measurement columns: drop the timestamp, the AN key and the
# derived date/time parts.
df1.drop(
    columns=['TAG_MIN', 'AN', 'Year', 'Month', 'Day', 'Hour', 'Minute', 'Second'],
    inplace=True,
)

In [59]:
# Confirm that only the float measurement columns remain.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2939722 entries, 0 to 2939721
Data columns (total 19 columns):
 #   Column     Dtype  
---  ------     -----  
 0   DZ1_OP     float64
 1   DZ2_OP     float64
 2   DZ1_TEMP   float64
 3   DZ2_TEMP   float64
 4   CLEAN      float64
 5   HDZ1_OP    float64
 6   HDZ2_OP    float64
 7   HDZ3_OP    float64
 8   HDZ4_OP    float64
 9   HDZ_CP     float64
 10  HDZ_CPM    float64
 11  HDZ1_TEMP  float64
 12  HDZ2_TEMP  float64
 13  HDZ3_TEMP  float64
 14  HDZ4_TEMP  float64
 15  SCZ1_TEMP  float64
 16  SCZ2_TEMP  float64
 17  STZ1_TEMP  float64
 18  STZ2_TEMP  float64
dtypes: float64(19)
memory usage: 426.1 MB


In [177]:
def Bad_Cause(df, threshold=0.4, whisker=1.5):
    """Flag the columns of ``df`` whose share of IQR outliers reaches ``threshold``.

    For each column the 25% and 75% quantiles are computed from ``df`` itself
    (the window under test, not a separate reference period). A value is an
    outlier when it lies more than ``whisker`` times the inter-quartile range
    below the first or above the third quartile. A column is flagged when the
    fraction of its rows that are outliers is at least ``threshold``. A message
    is printed for every flagged column.

    Args:
        df: DataFrame of numeric columns; each column is evaluated independently.
        threshold: Minimum outlier fraction (0-1) for a column to be flagged.
            Defaults to 0.4, the value that was previously hard-coded.
        whisker: Multiplier applied to the inter-quartile range to build the
            fences. Defaults to 1.5, the value that was previously hard-coded.

    Returns:
        A one-row DataFrame (index ``[0]``) with the same columns as ``df``,
        holding 1 for flagged columns and 0 otherwise.
    """
    row_count = len(df)
    outlier_label_dict = {}

    for column in df.columns:
        values = df[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        # Distance from each quartile to its fence (whisker * IQR).
        margin = whisker * (q3 - q1)

        outlier_mask = (values < (q1 - margin)) | (values > (q3 + margin))
        # Guard against a zero-row window instead of dividing 0 by 0.
        outlier_ratio = outlier_mask.sum() / row_count if row_count else 0.0
        outlier_label_dict[column] = int(outlier_ratio >= threshold)

    outlier_label = pd.DataFrame(outlier_label_dict, index=[0])

    for column, flagged in outlier_label_dict.items():
        if flagged:
            print(f'{column}설비에 이상이 감지되었습니다.')

    return outlier_label

In [181]:
# Evaluate the first 60-row window (positions 0-59).
Bad_Cause(df1.iloc[0:60])

Unnamed: 0,DZ1_OP,DZ2_OP,DZ1_TEMP,DZ2_TEMP,CLEAN,HDZ1_OP,HDZ2_OP,HDZ3_OP,HDZ4_OP,HDZ_CP,HDZ_CPM,HDZ1_TEMP,HDZ2_TEMP,HDZ3_TEMP,HDZ4_TEMP,SCZ1_TEMP,SCZ2_TEMP,STZ1_TEMP,STZ2_TEMP
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [182]:
# Evaluate the second 60-row window (positions 60-119).
Bad_Cause(df1.iloc[60:120])

HDZ_CPM설비에 이상이 감지되었습니다.


Unnamed: 0,DZ1_OP,DZ2_OP,DZ1_TEMP,DZ2_TEMP,CLEAN,HDZ1_OP,HDZ2_OP,HDZ3_OP,HDZ4_OP,HDZ_CP,HDZ_CPM,HDZ1_TEMP,HDZ2_TEMP,HDZ3_TEMP,HDZ4_TEMP,SCZ1_TEMP,SCZ2_TEMP,STZ1_TEMP,STZ2_TEMP
0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [183]:
# Evaluate a later 60-row window (positions 360-419).
Bad_Cause(df1.iloc[360:420])

HDZ2_TEMP설비에 이상이 감지되었습니다.


Unnamed: 0,DZ1_OP,DZ2_OP,DZ1_TEMP,DZ2_TEMP,CLEAN,HDZ1_OP,HDZ2_OP,HDZ3_OP,HDZ4_OP,HDZ_CP,HDZ_CPM,HDZ1_TEMP,HDZ2_TEMP,HDZ3_TEMP,HDZ4_TEMP,SCZ1_TEMP,SCZ2_TEMP,STZ1_TEMP,STZ2_TEMP
0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [155]:
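# Code used to choose the outlier_percentage threshold in Bad_Cause above: it counts
# how many 60-row chunks (over the first 1/20 of df1) flag at least one column.
# Kept commented out; the output recorded below appears to come from a run with it enabled.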
# chunk_size = 60
# count = 0
# for i in range(0, int(len(df1)/20), chunk_size):
#     chunk = df1.iloc[i:i+chunk_size]
#     result = Bad_Cause(chunk)
    
#     if result.sum().sum() > 0:
#         count += 1
        
# print(count)
# print((len(df1)/20)/60)

562
2449.7683333333334
