# Washing Machine

---

## 목적: 세탁기에서 생성된 Log Data 분석

## 데이터 셋 정보 (Metadata)

- **washing_machine.csv**: 세탁기에서 생성된 Log 데이터
- **wm_metadata.csv**: 각 컬럼 내용에 대한 설명

---

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline 

# Pandas option
#pd.set_option('max_rows', 30)
#pd.set_option('max_columns', 15)

# 시각화 스타일 설정
# plt.style.use('fivethirtyeight') 
# sns.set_style('whitegrid')

#import warnings
#warnings.filterwarnings('ignore')

from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 8
rcParams['axes.grid'] = True

# 1. 데이터 탐색

### 1.1 CSV 파일 읽어오기

In [3]:
pd.read_csv('data/wm_metadata.csv')

Unnamed: 0,﻿column_name,description
0,category_code,카테고리 코드
1,model_name,모델명
2,regist_country,등록국가명
3,create_dt_utc,세탁종료시간
4,device_id,세탁기 Device 구분을 위한 고유ID(decoded)
5,event_type,세탁기 상태 구분
6,mon_data,인코딩된 세탁기 운전 데이터
7,State,세탁기의 현재 상태 정보
8,Remain_Time_H,세탁기의 현재 남은 시간 정보(Hour)
9,Remain_Time_M,세탁기의 현재 남은 분(Minute)


In [4]:
df = pd.read_csv('data/washing_machine.csv')
df

Unnamed: 0,﻿category_code,model_name,regist_country,create_dt_utc,device_id,event_type,mon_data,State,Remain_Time_H,Remain_Time_M,...,ChildLock,Steam,RemoteStart,TurboShot,InitialBit,PreState,SmartCourse,TCLCount,OPCourse,LoadLevel
0,201,LG_WM_KR,KR,16/11/2017 0:22,A1,WM_STATE,KAALARsBAAADAAAAAAAABAABqB4/AhUB,40,0,11,...,0,0,0,0,0,30,63,2,21,1
1,201,LG_WM_KR,KR,16/11/2017 0:32,A1,WM_STATE,PAAAAAAAAAAAAAAAAAAABAABuSg/AxUA,60,0,0,...,0,0,0,0,0,40,63,3,21,0
2,201,LG_WM_KR,KR,16/11/2017 0:32,A1,WM_WASH_END,PAAAAAAAAAAAAAAAAAAABAABuSg/AxUA,60,0,0,...,0,0,0,0,0,40,63,3,21,0
3,201,LG_WM_KR,KR,16/11/2017 2:48,A1,COMMON_WIFI_ON,HgALAAsBAAAAAAEAAAAABAAAADw/AxUF,30,0,11,...,0,0,0,0,0,60,63,3,21,5
4,201,LG_WM_KR,KR,16/11/2017 2:57,A1,WM_STATE,PAAAAAAAAAAAAAAAAAAABAAAEB4/AxUA,60,0,0,...,0,0,0,0,0,30,63,3,21,0
5,201,LG_WM_KR,KR,16/11/2017 2:57,A1,WM_WASH_END,PAAAAAAAAAAAAAAAAAAABAAAEB4/AxUA,60,0,0,...,0,0,0,0,0,30,63,3,21,0
6,201,LG_WM_KR,KR,16/11/2017 2:58,A1,WM_WASH_BEGIN,KAA5ADkBAAAEAAAHAAAAAAAAEDw/AxUF,40,0,57,...,0,0,0,0,0,60,63,3,21,5
7,201,LG_WM_KR,KR,16/11/2017 2:58,A1,WM_STATE,KAA5ADkBAAAEAAAHAAAAAAAAEDw/AxUF,40,0,57,...,0,0,0,0,0,60,63,3,21,5
8,201,LG_WM_KR,KR,16/11/2017 3:26,A1,WM_STATE,MgAeADkBAAAAAAAHAAAABAAA5ig/AxUF,50,0,30,...,0,0,0,0,0,40,63,3,21,5
9,201,LG_WM_KR,KR,16/11/2017 3:57,A1,WM_STATE,PAAAAAAAAAAAAAAAAAAABAADgTI/AxUA,60,0,0,...,0,0,0,0,0,50,63,3,21,0


### 1.2 데이터 살펴보기

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 32 columns):
﻿category_code    500 non-null int64
model_name        500 non-null object
regist_country    500 non-null object
create_dt_utc     500 non-null object
device_id         500 non-null object
event_type        500 non-null object
mon_data          500 non-null object
State             500 non-null int64
Remain_Time_H     500 non-null int64
Remain_Time_M     500 non-null int64
Initial_Time_H    500 non-null int64
Initial_Time_M    500 non-null int64
APCourse          500 non-null int64
Error             500 non-null int64
SoilLevel         500 non-null int64
SpinSpeed         500 non-null int64
WaterTemp         500 non-null int64
RinseCount        500 non-null int64
DryLevel          500 non-null int64
Reserve_Time_H    500 non-null int64
Reserve_Time_M    500 non-null int64
FreshCare         500 non-null int64
ChildLock         500 non-null int64
Steam             500 non-null int64
R

In [6]:
df.shape

(500, 32)

In [7]:
df.head()

Unnamed: 0,﻿category_code,model_name,regist_country,create_dt_utc,device_id,event_type,mon_data,State,Remain_Time_H,Remain_Time_M,...,ChildLock,Steam,RemoteStart,TurboShot,InitialBit,PreState,SmartCourse,TCLCount,OPCourse,LoadLevel
0,201,LG_WM_KR,KR,16/11/2017 0:22,A1,WM_STATE,KAALARsBAAADAAAAAAAABAABqB4/AhUB,40,0,11,...,0,0,0,0,0,30,63,2,21,1
1,201,LG_WM_KR,KR,16/11/2017 0:32,A1,WM_STATE,PAAAAAAAAAAAAAAAAAAABAABuSg/AxUA,60,0,0,...,0,0,0,0,0,40,63,3,21,0
2,201,LG_WM_KR,KR,16/11/2017 0:32,A1,WM_WASH_END,PAAAAAAAAAAAAAAAAAAABAABuSg/AxUA,60,0,0,...,0,0,0,0,0,40,63,3,21,0
3,201,LG_WM_KR,KR,16/11/2017 2:48,A1,COMMON_WIFI_ON,HgALAAsBAAAAAAEAAAAABAAAADw/AxUF,30,0,11,...,0,0,0,0,0,60,63,3,21,5
4,201,LG_WM_KR,KR,16/11/2017 2:57,A1,WM_STATE,PAAAAAAAAAAAAAAAAAAABAAAEB4/AxUA,60,0,0,...,0,0,0,0,0,30,63,3,21,0


In [8]:
df.columns

Index(['﻿category_code', 'model_name', 'regist_country', 'create_dt_utc',
       'device_id', 'event_type', 'mon_data', 'State', 'Remain_Time_H',
       'Remain_Time_M', 'Initial_Time_H', 'Initial_Time_M', 'APCourse',
       'Error', 'SoilLevel', 'SpinSpeed', 'WaterTemp', 'RinseCount',
       'DryLevel', 'Reserve_Time_H', 'Reserve_Time_M', 'FreshCare',
       'ChildLock', 'Steam', 'RemoteStart', 'TurboShot', 'InitialBit',
       'PreState', 'SmartCourse', 'TCLCount', 'OPCourse', 'LoadLevel'],
      dtype='object')

### 1.3 임의 데이터 생성

> 분석을 위한 임의 변수들을 기존 데이터 프레임에 추가해보자.
>
> 1. 세제량 : 1 ~ 5 임의 생성
> 2. 세탁기 소음 : 40 ~ 70 임의 생성

In [9]:
new_df = pd.DataFrame({'Detergent' : np.random.randint(1, 6, size=len(df)),
                       'NoiseLevel': np.random.randint(40, 71, size=len(df))})
new_df.head()

Unnamed: 0,Detergent,NoiseLevel
0,1,67
1,1,51
2,2,65
3,4,70
4,3,66


### 1.4 DataFrame 결합

In [10]:
df = pd.concat([df, new_df], axis=1)
df

Unnamed: 0,﻿category_code,model_name,regist_country,create_dt_utc,device_id,event_type,mon_data,State,Remain_Time_H,Remain_Time_M,...,RemoteStart,TurboShot,InitialBit,PreState,SmartCourse,TCLCount,OPCourse,LoadLevel,Detergent,NoiseLevel
0,201,LG_WM_KR,KR,16/11/2017 0:22,A1,WM_STATE,KAALARsBAAADAAAAAAAABAABqB4/AhUB,40,0,11,...,0,0,0,30,63,2,21,1,1,67
1,201,LG_WM_KR,KR,16/11/2017 0:32,A1,WM_STATE,PAAAAAAAAAAAAAAAAAAABAABuSg/AxUA,60,0,0,...,0,0,0,40,63,3,21,0,1,51
2,201,LG_WM_KR,KR,16/11/2017 0:32,A1,WM_WASH_END,PAAAAAAAAAAAAAAAAAAABAABuSg/AxUA,60,0,0,...,0,0,0,40,63,3,21,0,2,65
3,201,LG_WM_KR,KR,16/11/2017 2:48,A1,COMMON_WIFI_ON,HgALAAsBAAAAAAEAAAAABAAAADw/AxUF,30,0,11,...,0,0,0,60,63,3,21,5,4,70
4,201,LG_WM_KR,KR,16/11/2017 2:57,A1,WM_STATE,PAAAAAAAAAAAAAAAAAAABAAAEB4/AxUA,60,0,0,...,0,0,0,30,63,3,21,0,3,66
5,201,LG_WM_KR,KR,16/11/2017 2:57,A1,WM_WASH_END,PAAAAAAAAAAAAAAAAAAABAAAEB4/AxUA,60,0,0,...,0,0,0,30,63,3,21,0,4,67
6,201,LG_WM_KR,KR,16/11/2017 2:58,A1,WM_WASH_BEGIN,KAA5ADkBAAAEAAAHAAAAAAAAEDw/AxUF,40,0,57,...,0,0,0,60,63,3,21,5,4,67
7,201,LG_WM_KR,KR,16/11/2017 2:58,A1,WM_STATE,KAA5ADkBAAAEAAAHAAAAAAAAEDw/AxUF,40,0,57,...,0,0,0,60,63,3,21,5,2,55
8,201,LG_WM_KR,KR,16/11/2017 3:26,A1,WM_STATE,MgAeADkBAAAAAAAHAAAABAAA5ig/AxUF,50,0,30,...,0,0,0,40,63,3,21,5,2,46
9,201,LG_WM_KR,KR,16/11/2017 3:57,A1,WM_STATE,PAAAAAAAAAAAAAAAAAAABAADgTI/AxUA,60,0,0,...,0,0,0,50,63,3,21,0,2,68


In [11]:
df.columns

Index(['﻿category_code', 'model_name', 'regist_country', 'create_dt_utc',
       'device_id', 'event_type', 'mon_data', 'State', 'Remain_Time_H',
       'Remain_Time_M', 'Initial_Time_H', 'Initial_Time_M', 'APCourse',
       'Error', 'SoilLevel', 'SpinSpeed', 'WaterTemp', 'RinseCount',
       'DryLevel', 'Reserve_Time_H', 'Reserve_Time_M', 'FreshCare',
       'ChildLock', 'Steam', 'RemoteStart', 'TurboShot', 'InitialBit',
       'PreState', 'SmartCourse', 'TCLCount', 'OPCourse', 'LoadLevel',
       'Detergent', 'NoiseLevel'],
      dtype='object')

In [16]:
df.shape

(500, 34)

### 1.5 컬럼 삭제

In [17]:
df.drop(['category_code', 'model_name', 'regist_country'], axis=1, inplace=True)

ValueError: labels ['category_code'] not contained in axis

In [18]:
df.shape

(500, 34)

In [19]:
df.columns

Index(['﻿category_code', 'model_name', 'regist_country', 'create_dt_utc',
       'device_id', 'event_type', 'mon_data', 'State', 'Remain_Time_H',
       'Remain_Time_M', 'Initial_Time_H', 'Initial_Time_M', 'APCourse',
       'Error', 'SoilLevel', 'SpinSpeed', 'WaterTemp', 'RinseCount',
       'DryLevel', 'Reserve_Time_H', 'Reserve_Time_M', 'FreshCare',
       'ChildLock', 'Steam', 'RemoteStart', 'TurboShot', 'InitialBit',
       'PreState', 'SmartCourse', 'TCLCount', 'OPCourse', 'LoadLevel',
       'Detergent', 'NoiseLevel'],
      dtype='object')

> **Q. 'FreshCare', 'TurboShot', 'InitialBit' 컬럼 제거, DataFrame 확인**

In [20]:
df.drop(['FreshCare', 'TurboShot', 'InitialBit'], axis=1, inplace=True)

In [21]:
df.shape

(500, 31)

In [22]:
df.columns

Index(['﻿category_code', 'model_name', 'regist_country', 'create_dt_utc',
       'device_id', 'event_type', 'mon_data', 'State', 'Remain_Time_H',
       'Remain_Time_M', 'Initial_Time_H', 'Initial_Time_M', 'APCourse',
       'Error', 'SoilLevel', 'SpinSpeed', 'WaterTemp', 'RinseCount',
       'DryLevel', 'Reserve_Time_H', 'Reserve_Time_M', 'ChildLock', 'Steam',
       'RemoteStart', 'PreState', 'SmartCourse', 'TCLCount', 'OPCourse',
       'LoadLevel', 'Detergent', 'NoiseLevel'],
      dtype='object')

### 1.6 Grouping

- **로그 발생 유형 확인**

In [23]:
df['event_type']

0            WM_STATE
1            WM_STATE
2         WM_WASH_END
3      COMMON_WIFI_ON
4            WM_STATE
5         WM_WASH_END
6       WM_WASH_BEGIN
7            WM_STATE
8            WM_STATE
9            WM_STATE
10        WM_WASH_END
11     COMMON_WIFI_ON
12           WM_STATE
13           WM_STATE
14           WM_STATE
15           WM_STATE
16        WM_WASH_END
17     COMMON_WIFI_ON
18           WM_STATE
19           WM_STATE
20        WM_WASH_END
21           WM_STATE
22     COMMON_WIFI_ON
23           WM_STATE
24           WM_STATE
25           WM_STATE
26           WM_STATE
27        WM_WASH_END
28     COMMON_WIFI_ON
29      WM_WASH_BEGIN
            ...      
470     WM_WASH_BEGIN
471          WM_STATE
472          WM_STATE
473          WM_STATE
474          WM_STATE
475          WM_STATE
476       WM_WASH_END
477    COMMON_WIFI_ON
478          WM_STATE
479     WM_WASH_BEGIN
480          WM_PAUSE
481     WM_WASH_BEGIN
482          WM_STATE
483          WM_STATE
484       

In [24]:
df['event_type'].unique()

array(['WM_STATE', 'WM_WASH_END', 'COMMON_WIFI_ON', 'WM_WASH_BEGIN',
       'WM_PAUSE', 'WM_CONTINUE', 'WM_ERROR'], dtype=object)

In [None]:
df['event_type'].value_counts()

### 1.7 일시중지('WM_PAUSE') Log가 가장 많은 기계의 device_id 찾기

In [None]:
df[df['event_type'] == 'WM_PAUSE']

In [None]:
df[df['event_type'] == 'WM_PAUSE']['device_id']

> **Q. 일시중지(WM_PAUSE)인 device_id를 Grouping 해서 개수를 세어보자**

In [None]:
df[df['event_type'] == 'WM_PAUSE']['device_id'].value_counts()

> **Q. 어느 기계(device_id)에 가장 많은 에러('WM_ERROR') Log가 기록되어 있을까?**

In [None]:
df[df['event_type'] == 'WM_ERROR']['device_id'].value_counts()

### 1.8 Log가 많이 생성된 시간대

####  데이터 타입 확인: 'create_dt_utc'

In [None]:
df['create_dt_utc'].head()

In [None]:
df['create_dt_utc'][0]

In [None]:
type(df['create_dt_utc'][0])

#### 데이터 타입 변경: String --> Datetime

In [None]:
df['create_dt_utc'] = pd.to_datetime(df['create_dt_utc'])

In [None]:
df['create_dt_utc']

In [None]:
df['create_dt_utc'][0]

In [None]:
type(df['create_dt_utc'][0])

#### 4시간 단위로 Grouping

In [None]:
grouper = pd.Grouper(key='create_dt_utc', freq='4h')
grouper

In [None]:
gp_4h = df.groupby(grouper)

In [None]:
gp_4h.count()

In [None]:
freq = gp_4h['create_dt_utc'].count()
freq

In [None]:
freq.plot(kind='bar', figsize=(15, 7))

#### local time으로 변경: Series의 Index를 변경

In [None]:
freq.tz_localize('UTC').tz_convert('Asia/Seoul').plot(kind='barh', figsize=(15, 15))

#### local time으로 변경: DataFrame의 컬럼을 변경

In [None]:
import time
from datetime import timedelta

In [None]:
# 컴퓨터에 설정된 타임존과 UTC의 차이 (단위: 초) (UTC - Current Timezone)
time.timezone

In [None]:
df['create_dt_utc'] - timedelta(seconds=time.timezone)

> **Q. 한국 시각으로 변경한 데이터를 create_dt_kst 라는 새로운 컬럼에 넣어보자**

In [None]:
df['create_dt_kst'] =  df['create_dt_utc'] - timedelta(seconds=time.timezone)
df

> **Q. 한국 시각으로 변경된 데이터로 그래프 그리기**

In [None]:
grouper_kst = pd.Grouper(key='create_dt_kst', freq='4h')

In [None]:
freq_kst = df.groupby(grouper_kst)['create_dt_kst'].count()

In [None]:
freq_kst.plot(kind='barh', figsize=(15, 15))

In [None]:
# end of file