<a href="https://colab.research.google.com/github/rootofdata/Outlier_Detection/blob/main/ver1%26ver3_eda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Unsupervised outlier detection for Time series data using LSTM + AE
Features: TMP, HMD, CO, H2S, NO2, TVOC, CO2, NH3, PM10 (9 features)   
Time: 2022-03-23 ~

## 0. Setting

### 구글코랩 환경설정

In [None]:
# Mount Google Drive into the Colab VM at /content/drive so the notebook can
# reach files stored there. Only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Make the project folder the working directory; the relative 'data/*.csv'
# glob used later in the notebook is resolved against it.
%cd '/content/drive/MyDrive/Outlier Detection Paper'

/content/drive/MyDrive/Outlier Detection Paper


In [None]:
# List the working directory to confirm the `data` folder is visible.
!ls

 영석						    'Main Paper Code.ipynb'
 data						    'Main Paper.gdoc'
'Environmental Modelling & Software_template.docx'   Reference
'Jun Hyeok2.ipynb'				     Untitled0.ipynb
'Jun Hyeok.ipynb'


### 필요한 라이브러리 호출

In [None]:
from glob import glob
import os

import pandas as pd
import numpy as np
import datetime
import time

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib

# NOTE(review): 'Malgun Gothic' is a Windows font; presumably it is not
# installed on Colab, in which case matplotlib falls back to its default font
# -- confirm.
matplotlib.rcParams['font.family']='Malgun Gothic'
matplotlib.rcParams['axes.unicode_minus'] = False   # Korean font patch: draw minus signs as ASCII hyphens.

# NOTE(review): this silences every warning for the whole notebook, including
# pandas deprecation and SettingWithCopy warnings raised by later cells.
import warnings
warnings.filterwarnings('ignore')   # Suppress warning messages.

from sklearn.preprocessing import StandardScaler

## 1. Data Load

In [None]:
# Collect the CSV paths under data/ and put them in lexicographic order so the
# positional lookups further down always see the same file first.
filst = glob('data/*.csv')
filst.sort()
filst

['data/1_2022-04-11 10_22_56.csv', 'data/3_2022-04-11 10_23_42.csv']

In [None]:
# Read every CSV listed in `filst`, keeping the same (sorted) order.
dataset_list = []
for file in filst:
  version = pd.read_csv(file)
  dataset_list.append(version)

# Positional lookup relies on the sorted order of `filst`:
# the "1_..." file comes first, the "3_..." file second.
version1 = dataset_list[0]
version3 = dataset_list[1]

# Keep only the columns used downstream. `.copy()` makes each subset an
# independent frame, so later column assignments modify it directly instead of
# acting on a slice of the raw frame (SettingWithCopyWarning, and a silent
# no-op under pandas copy-on-write).
df_ver1 = version1[['created','temperature','humidity','sgp30_tvoc','sgp30_co2','pm_mass_2_5']].copy()
df_ver3 = version3[['time','ch2o','co']].copy()

### Version3 정제

In [None]:
# Build the cleaned frame from the raw `version3` columns in one chain, so the
# cell never assigns into a slice and can be re-run without failing on an
# already-converted `time` column.
df_ver3 = (
    version3[['time', 'ch2o', 'co']]
    .assign(time=lambda frame: pd.to_datetime(frame['time']))            # parse timestamps
    .sort_values('time')                                                 # chronological order; row labels are kept
    .assign(time=lambda frame: frame['time'].dt.to_period(freq='min'))   # truncate to minute resolution
)
df_ver3

Unnamed: 0,time,ch2o,co
0,2022-04-04 10:24,45,42
1,2022-04-04 10:26,45,27
2,2022-04-04 10:28,45,34
3,2022-04-04 10:30,45,30
4,2022-04-04 10:32,45,27
...,...,...,...
5032,2022-04-11 10:14,65,46
5033,2022-04-11 10:16,66,28
5034,2022-04-11 10:18,67,33
5035,2022-04-11 10:20,68,39


In [None]:
# Snap readings stamped on an odd minute back to the preceding even minute so
# every row falls on the 2-minute grid.
# Vectorised: no row-by-row loop, no chained-indexing assignment, and no
# variable named `time` that would shadow the imported `time` module.
# Re-running the cell is a no-op because no odd minutes remain afterwards.
is_odd_minute = df_ver3['time'].dt.minute % 2 == 1
df_ver3['time'] = df_ver3['time'].where(
    ~is_odd_minute,
    df_ver3['time'] - datetime.timedelta(minutes=1),
)

In [None]:
# Regular 2-minute grid covering the version-3 recording window.
# No `tz` is passed: converting to Period drops the time zone anyway, so a
# naive range gives the same minute periods without the suppressed warning.
ver3_grid = pd.date_range(start='2022-04-04 10:24',   # first slot of the range
                          end='2022-04-11 10:22',     # last slot of the range
                          freq='2min')                # one slot every 2 minutes

# One-column frame of minute-resolution periods, used as the left side of the
# merge on 'time'.
ver3_frame = pd.DataFrame({'time': ver3_grid.to_period(freq='min')})
ver3_frame

Unnamed: 0,time
0,2022-04-04 10:24
1,2022-04-04 10:26
2,2022-04-04 10:28
3,2022-04-04 10:30
4,2022-04-04 10:32
...,...
5035,2022-04-11 10:14
5036,2022-04-11 10:16
5037,2022-04-11 10:18
5038,2022-04-11 10:20


In [None]:
# Outer-join the 2-minute grid with the readings on 'time', then order the
# result chronologically. Grid slots with no matching reading get NaN in the
# sensor columns.
new_df_ver3 = (
    ver3_frame
    .merge(df_ver3, on='time', how='outer')
    .sort_values('time')
)
new_df_ver3

Unnamed: 0,time,ch2o,co
0,2022-04-04 10:24,45.0,42.0
1,2022-04-04 10:26,45.0,27.0
2,2022-04-04 10:28,45.0,34.0
3,2022-04-04 10:30,45.0,30.0
4,2022-04-04 10:32,45.0,27.0
...,...,...,...
5036,2022-04-11 10:14,65.0,46.0
5037,2022-04-11 10:16,66.0,28.0
5038,2022-04-11 10:18,67.0,33.0
5039,2022-04-11 10:20,68.0,39.0


In [None]:
# Rows whose CO reading is missing after the merge.
new_df_ver3.loc[new_df_ver3['co'].isnull()]

Unnamed: 0,time,ch2o,co
1421,2022-04-06 09:46,,
1424,2022-04-06 09:50,,
3145,2022-04-08 19:12,,
4849,2022-04-11 04:00,,


In [None]:
# Forward-fill the gaps: each missing reading takes the last observed value.
# `interpolate(method="ffill")` is deprecated in pandas >= 2.1 (the warning was
# hidden by the global filter); `ffill()` is the supported equivalent.
# Rebinding instead of `inplace=True` keeps the step explicit.
new_df_ver3 = new_df_ver3.ffill()

In [None]:
# Number of CO values still missing after the fill (expected: 0).
new_df_ver3['co'].isnull().sum()

0

In [None]:
# Version 3 data after the final cleaning step
new_df_ver3

Unnamed: 0,time,ch2o,co
0,2022-04-04 10:24,45.0,42.0
1,2022-04-04 10:26,45.0,27.0
2,2022-04-04 10:28,45.0,34.0
3,2022-04-04 10:30,45.0,30.0
4,2022-04-04 10:32,45.0,27.0
...,...,...,...
5036,2022-04-11 10:14,65.0,46.0
5037,2022-04-11 10:16,66.0,28.0
5038,2022-04-11 10:18,67.0,33.0
5039,2022-04-11 10:20,68.0,39.0


### Version1 정제

In [None]:
# Preview the first rows of the version-1 subset.
# NOTE(review): the saved output already shows the short column names
# (time, tmp, hmd, ...) that are only assigned in a later cell, so this cell
# appears to have been last run out of order -- re-run top to bottom.
df_ver1.head()

Unnamed: 0,time,tmp,hmd,tvoc,co2,pm2.5
0,2022-04-11 10:15:02,27,32,533,400,5.3
1,2022-04-11 10:05:03,26,32,476,400,5.3
2,2022-04-11 09:55:04,26,32,448,400,6.8
3,2022-04-11 09:45:04,26,30,571,400,5.7
4,2022-04-11 09:35:05,26,30,572,400,5.4


In [None]:
# Column dtypes and non-null counts for the version-1 subset.
df_ver1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2483 entries, 0 to 2482
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   time    2483 non-null   object 
 1   tmp     2483 non-null   int64  
 2   hmd     2483 non-null   int64  
 3   tvoc    2483 non-null   int64  
 4   co2     2483 non-null   int64  
 5   pm2.5   2483 non-null   float64
dtypes: float64(1), int64(4), object(1)
memory usage: 116.5+ KB


In [None]:
# Build the cleaned version-1 frame from the raw `version1` columns in one
# chain. Compared with renaming and mutating `df_ver1` in place, this
#   * maps each raw column to its short name explicitly, not by position,
#   * never assigns into a slice of `version1`,
#   * can be re-run without failing on an already-converted `time` column.
df_ver1 = (
    version1[['created', 'temperature', 'humidity', 'sgp30_tvoc', 'sgp30_co2', 'pm_mass_2_5']]
    .rename(columns={
        'created': 'time',
        'temperature': 'tmp',
        'humidity': 'hmd',
        'sgp30_tvoc': 'tvoc',
        'sgp30_co2': 'co2',
        'pm_mass_2_5': 'pm2.5',
    })
    .assign(time=lambda frame: pd.to_datetime(frame['time']))            # parse timestamps
    .sort_values('time')                                                 # chronological order
    .assign(time=lambda frame: frame['time'].dt.to_period(freq='min'))   # truncate to minute resolution
    .reset_index(drop=True)                                              # labels 0..n-1 in time order
)
df_ver1

Unnamed: 0,time,tmp,hmd,tvoc,co2,pm2.5
0,2022-03-23 18:57,24,25,0,400,5.6
1,2022-03-23 19:07,24,25,0,400,5.9
2,2022-03-23 19:37,24,25,0,400,6.2
3,2022-03-23 19:47,24,25,0,400,5.4
4,2022-03-23 19:57,23,25,0,400,5.1
...,...,...,...,...,...,...
2478,2022-04-11 09:35,26,30,572,400,5.4
2479,2022-04-11 09:45,26,30,571,400,5.7
2480,2022-04-11 09:55,26,32,448,400,6.8
2481,2022-04-11 10:05,26,32,476,400,5.3


In [None]:
# Sanity check: print, one per line and in column order, how many readings of
# each sensor are negative.
for sensor_column in ('tmp', 'hmd', 'tvoc', 'co2', 'pm2.5'):
  print(df_ver1[sensor_column].lt(0).sum())

0
0
0
0
0


In [None]:
# Is the first version-1 reading later than 2022-04-04 00:00?
# `pd.datetime` no longer exists in pandas; `pd.Timestamp` is the replacement.
# The value is converted via its string form, as in the working cell below, so
# this works whether `time` holds Period objects or plain strings.
pd.to_datetime(str(df_ver1['time'][0])) > pd.Timestamp(2022, 4, 4)

SyntaxError: ignored

In [None]:
# Build a timestamp and render it as "YYYY-MM".
# The format string belongs in `strftime`, not in the constructor, where it was
# taken as the microsecond argument and raised a TypeError.
# NOTE(review): intent inferred from the failing call -- confirm.
sample_ts = pd.Timestamp(2022, 4, 4, 10, 20, 30)
sample_ts.strftime("%Y-%m")

TypeError: ignored

In [None]:
# Convert the first `time` entry to a Timestamp by parsing its string form.
first_time_text = str(df_ver1['time'][0])
pd.to_datetime(first_time_text)

Timestamp('2022-03-23 18:57:00')