In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings

import argparse
import logging
import pandas as pd
import random
import os
import numpy as np
from loggers import logger

from process import input_data, preprocess

def set_logger(log_name):
    """Set up the project logger for ``log_name`` and emit a start-up message.

    Builds a ``logger.AutoMLLog`` for the given name, attaches the
    ``'automl_process'`` handler, applies its formats and then logs one
    info-level line through the logger returned by ``addOn()``.

    Args:
        log_name: Name handed to ``logger.AutoMLLog``.

    Returns:
        None.
    """
    automl_log = logger.AutoMLLog(log_name)
    automl_log.set_handler('automl_process')
    automl_log.set_formats()

    # addOn() hands back the object used for the actual logging calls.
    automl_log.addOn().info('logger 세팅')
    
# Logger name shared by the whole notebook: the classes defined below receive
# it and look the logger up with logging.getLogger(log_name).
log_name = 'practice'
set_logger(log_name)

logger 세팅


# **1. 데이터 불러오기**

In [3]:
import pandas as pd
import logging
import numpy as np

class Data_load:
    """Load a CSV file and split its columns into numeric and non-numeric lists.

    Typical use::

        df, var_list, num_var, obj_var = Data_load(path, log_name).read_data()
    """

    # Encodings tried in order. ``None`` lets pandas use its default (UTF-8);
    # cp949 covers CSV files exported by Korean Windows tools.
    _ENCODINGS = (None, 'cp949')

    def __init__(self, path, log_name):
        """Store the CSV location and look up the logger.

        Args:
            path: Location of the CSV file to read.
            log_name: Name of the logger obtained via ``logging.getLogger``.
        """
        self.path = path  # location of the input data
        self.logger = logging.getLogger(log_name)

    def _read_csv_with_fallback(self):
        """Read ``self.path``, retrying with each encoding in ``_ENCODINGS``.

        Only decoding failures trigger a retry; any other error (missing
        file, malformed CSV, ...) propagates immediately so the real cause is
        visible to the caller.

        Raises:
            UnicodeDecodeError: If no encoding in ``_ENCODINGS`` can decode the file.
        """
        last_error = None
        for encoding in self._ENCODINGS:
            try:
                return pd.read_csv(self.path, encoding=encoding)
            except UnicodeDecodeError as err:
                last_error = err

        self.logger.info('데이터 포맷을 맞춰주세요')
        self.logger.error('csv 데이터 불러오기를 실패했습니다')
        raise last_error

    def read_data(self):
        """Read the CSV, classify its columns and shrink its memory footprint.

        Returns:
            tuple: ``(df, var_list, num_var, obj_var)`` where ``df`` is the
            memory-reduced DataFrame, ``var_list`` holds every column name,
            ``num_var`` the float columns followed by the int columns, and
            ``obj_var`` every remaining column in file order.  The lists are
            computed before the dtype reduction.

        Raises:
            UnicodeDecodeError: If the file cannot be decoded.
            Exception: Any other error raised by ``pandas.read_csv``.
        """
        self.logger.info('csv 데이터 불러오기')
        self.logger.info(f'{self.path}')

        df = self._read_csv_with_fallback()

        self.logger.info('변수 분리 시작')
        var_list = df.columns.tolist()  # every column
        # Numeric columns: floats first, then ints (order kept for downstream code).
        num_var = (df.select_dtypes(include='float').columns.tolist()
                   + df.select_dtypes(include='int').columns.tolist())
        numeric = set(num_var)
        obj_var = [x for x in df.columns if x not in numeric]  # everything else

        df = self.reduce_mem_usage(df)

        return df, var_list, num_var, obj_var

    # Reduce the memory used by the data.
    def reduce_mem_usage(self, df):
        """Downcast every column of ``df`` in place to the smallest fitting dtype.

        Integer columns are cast to the first of int8/int16/int32/int64 whose
        range strictly contains the column's min and max; float columns
        likewise to float16/float32, falling back to float64; object columns
        become ``category``.  Columns of any other dtype (bool, datetime,
        existing categoricals, ...) are left untouched.

        NOTE(review): float16 keeps only about three significant decimal
        digits, so float values are rounded by this step; consider making
        float32 the smallest float target if downstream precision matters.

        Args:
            df: DataFrame to shrink; it is modified in place.

        Returns:
            The same DataFrame object, with downcast columns.
        """
        start_mem = df.memory_usage().sum() / 1024**2
        self.logger.info(f'데이터 구성: {df.shape[0]} 행, {df.shape[1]}열')
        self.logger.info(f'Memory usage of dataframe is {start_mem:.2f}MB')

        for col in df.columns:
            col_type = df[col].dtype

            if col_type == object:
                df[col] = df[col].astype('category')
                continue

            type_name = str(col_type)
            # min/max are computed only for int/float columns: calling them on
            # e.g. an unordered categorical raises TypeError.
            if type_name[:3] == 'int':
                c_min, c_max = df[col].min(), df[col].max()
                for candidate in (np.int8, np.int16, np.int32, np.int64):
                    limits = np.iinfo(candidate)
                    if c_min > limits.min and c_max < limits.max:
                        df[col] = df[col].astype(candidate)
                        break
            elif type_name[:5] == 'float':
                c_min, c_max = df[col].min(), df[col].max()
                for candidate in (np.float16, np.float32):
                    limits = np.finfo(candidate)
                    if c_min > limits.min and c_max < limits.max:
                        df[col] = df[col].astype(candidate)
                        break
                else:
                    # No smaller type fits (or the column is all-NaN).
                    df[col] = df[col].astype(np.float64)

        end_mem = df.memory_usage().sum() / 1024**2
        self.logger.info(f'Memory usage after optimization is: {end_mem:.2f}MB')
        self.logger.info(f'Decreased by {100*((start_mem - end_mem)/start_mem):.1f}%')

        return df

In [4]:
# Load the dataset through the packaged process.input_data.Data_load.
# NOTE(review): this uses the imported module, not the Data_load class defined
# in the cell above — confirm the two definitions are kept in sync.
data, var_list, num_var, obj_var = input_data.Data_load('storage/demand_forecast_dataset.csv', log_name).read_data()

csv 데이터 불러오기
storage/demand_forecast_dataset.csv
변수 분리 시작
데이터 구성: 115050 행, 39열
Memory usage of dataframe is 34.23MB
Memory usage after optimization is: 9.46MB
Decreased by 72.4%


In [5]:
# Preview the first 10 rows of the loaded frame.
data.head(10)

Unnamed: 0,sale_dy,str_cd,str_nm,l1_cd,l1_nm,prod_cd,prod_nm,sale_qty,dc_stk_qty,dyoff_dy,...,outsd_forn_visit,tot_visit,ride_pasgr_num,alight_pasgr_num,tot_pop,male_pop,female_pop,search_cnt,click_cnt,disa_subs_dur
0,2019-01-01,110438,G속초조양,106,가공채소/계란,46145000,CJ)양념이잘배는찌개두부300G,10,0,0,...,118614,172832,0,0,0,0,0,26.46875,29,0
1,2019-01-08,110438,G속초조양,106,가공채소/계란,46145000,CJ)양념이잘배는찌개두부300G,5,0,0,...,38373,97771,0,0,0,0,0,38.25,41,0
2,2019-01-15,110438,G속초조양,106,가공채소/계란,46145000,CJ)양념이잘배는찌개두부300G,31,0,0,...,42375,100251,0,0,0,0,0,29.40625,35,0
3,2019-01-22,110438,G속초조양,106,가공채소/계란,46145000,CJ)양념이잘배는찌개두부300G,22,0,0,...,41744,99396,0,0,0,0,0,23.53125,42,0
4,2019-01-29,110438,G속초조양,106,가공채소/계란,46145000,CJ)양념이잘배는찌개두부300G,3,0,0,...,38386,97200,0,0,0,0,0,32.34375,34,0
5,2019-02-05,110438,G속초조양,106,가공채소/계란,46145000,CJ)양념이잘배는찌개두부300G,18,0,0,...,107955,156920,0,0,0,0,0,0.0,19,0
6,2019-02-12,110438,G속초조양,106,가공채소/계란,46145000,CJ)양념이잘배는찌개두부300G,12,0,0,...,37786,97009,0,0,0,0,0,17.640625,44,0
7,2019-02-19,110438,G속초조양,106,가공채소/계란,46145000,CJ)양념이잘배는찌개두부300G,9,0,0,...,29970,80321,0,0,0,0,0,26.46875,48,0
8,2019-02-26,110438,G속초조양,106,가공채소/계란,46145000,CJ)양념이잘배는찌개두부300G,24,0,0,...,49538,109227,0,0,0,0,0,35.28125,35,0
9,2019-03-05,110438,G속초조양,106,가공채소/계란,46145000,CJ)양념이잘배는찌개두부300G,10,0,0,...,30658,90606,0,0,0,0,0,35.28125,35,0


# **2. 데이터 전처리하기**

In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
import matplotlib.pyplot as plt
import joblib
import json
import glob
import logging
from collections import defaultdict
import pickle

#전처리 class
class Preprocessing:
    """Run the whole preprocessing pipeline on a loaded DataFrame.

    Constructing an instance executes, in order:

    1. ``na_preprocess``   - drop sparse columns and rows,
    2. ``tds_preprocess``  - set the target / date / store columns aside,
    3. ``standardize``     - z-score the remaining numeric columns,
    4. ``label_encoder``   - integer-encode the remaining non-numeric columns,
    5. ``get_df``          - re-attach the columns set aside in step 2,
    6. ``ts_preprocess``   - parse dates and add a per-series ``time_idx``.

    The final frame is available as ``self.df``.

    The ``var_list`` / ``num_var`` / ``obj_var`` / ``store_list`` arguments are
    copied, so the caller's lists are never modified and the constructor can be
    re-run with the same inputs.
    """

    # Where the fitted label encoders are pickled by ``label_encoder``.
    LABEL_ENCODER_PATH = 'storage/label_encoder.sav'

    def __init__(self, log_name, data, var_list, num_var, obj_var, target_var, date_var, store_list, unit, anomaly_per=10):
        """Store the configuration and run the pipeline.

        Args:
            log_name: Name of the logger obtained via ``logging.getLogger``.
            data: Input DataFrame.
            var_list: All column names.
            num_var: Numeric column names.
            obj_var: Non-numeric column names.
            target_var: Name of the target column.
            date_var: Name of the date column.
            store_list: Column names identifying one series (store / product).
            unit: Time unit, one of ``'day'``, ``'week'``, ``'month'``.
            anomaly_per: Missing-value percentage at or above which a column
                is dropped.
        """
        self.df = data                                     # data
        # Copies: the lists are edited below and must not leak back to the caller.
        self.var_list = list(var_list)                     # all variables
        self.num_var = list(num_var)                       # numeric variables
        self.obj_var = list(obj_var)                       # non-numeric variables
        self.target_var = target_var                       # target variable
        self.date_var = date_var                           # date variable
        self.store_list = list(store_list)                 # store (product) variables
        self.unit = unit                                   # time unit

        self._anomaly_ratio = int(anomaly_per)             # missing-value threshold in percent
        self._anomaly_percentage = int(anomaly_per) / 100  # same threshold as a fraction

        self.logger = logging.getLogger(log_name)

        # Missing values are handled first.
        self.df = self.na_preprocess(self.df, self._anomaly_ratio)

        # Columns dropped above must also leave the variable lists, otherwise
        # the later column selections raise KeyError.
        kept = set(self.df.columns)
        self.var_list = [col for col in self.var_list if col in kept]
        self.num_var = [col for col in self.num_var if col in kept]
        self.obj_var = [col for col in self.obj_var if col in kept]

        self.df, self.tds_df = self.tds_preprocess(self.df, self.target_var, self.date_var, self.store_list)

        # Standardisation
        self.df = self.standardize(self.df, self.num_var)

        # Label encoding
        self.df = self.label_encoder(self.df, self.obj_var)

        # General preprocessing finished
        self.df = self.get_df()

        # Time-series preprocessing
        self.df = self.ts_preprocess(self.df, self.target_var, self.date_var, self.store_list, self.unit)

    # Inspect and handle missing values.
    def na_preprocess(self, df, anomaly_per):
        """Drop columns and rows that contain too many missing values.

        Columns whose missing percentage is ``anomaly_per`` or more are
        removed; afterwards rows with fewer than 70% non-null values among the
        remaining columns are removed.

        Args:
            df: Input DataFrame (not modified).
            anomaly_per: Column missing-value threshold in percent.

        Returns:
            The filtered DataFrame.

        Raises:
            Exception: Re-raises whatever the filtering raised, after logging it.
        """
        self.logger.info('결측치 처리')

        try:
            # Drop columns with n% or more missing values.
            missing_pct = round(df.isnull().sum() / len(df) * 100, 2)
            tmp_df = df[missing_pct[missing_pct < anomaly_per].index]

            # Keep only rows with at least 70% non-null values.
            idx1 = len(tmp_df.columns) * 0.7
            result = tmp_df.dropna(thresh=idx1, axis=0)

        except Exception:
            self.logger.exception('결측치 처리에 문제가 발생하였습니다')
            raise

        # Report the shape of the filtered frame, not of the input.
        self.logger.info(f'결측치 처리 이후 데이터 구성: {result.shape[0]} 행, {result.shape[1]}열')

        return result

    def tds_preprocess(self, df, target_var, date_var, store_list):
        """Split the target, date and store columns off from the features.

        The separated names are also removed from ``self.num_var`` and
        ``self.obj_var`` so they are neither standardised nor label-encoded.

        Args:
            df: Input DataFrame (not modified).
            target_var: Target column name.
            date_var: Date column name.
            store_list: Store / product column names.

        Returns:
            tuple: ``(features, tds_df)`` - the frame without the separated
            columns, and a frame holding only them.

        Raises:
            Exception: Re-raises any error (e.g. ``KeyError`` for a missing
            column) after logging it.
        """
        self.logger.info('전처리를 위한 target, date, store 분리')

        try:
            key_cols = [target_var, date_var] + list(store_list)
            tds_df = df[key_cols]
            df = df.drop(key_cols, axis=1)

            # Filtering (instead of list.remove) tolerates a key column that is
            # in neither list.
            separated = set(key_cols)
            self.num_var = [col for col in self.num_var if col not in separated]
            self.obj_var = [col for col in self.obj_var if col not in separated]

        except Exception:
            self.logger.exception(' target, date, store 분리 처리에 문제가 발생하였습니다')
            raise

        return df, tds_df

    # Outlier removal step was dropped (230119).

    # Normalisation
    def standardize(self, df, num_var):
        """Z-score the columns in ``num_var`` with ``StandardScaler``.

        Args:
            df: Input DataFrame (not modified).
            num_var: Numeric column names; if empty, ``df`` is returned as is.

        Returns:
            DataFrame with the non-numeric columns first, followed by the
            scaled numeric columns.

        Raises:
            Exception: Re-raises any scaling error after logging it.
        """
        self.logger.info('정규화 진행')
        try:
            if num_var:
                num_cols = list(num_var)
                num_data = df.loc[:, num_cols]
                non_num_data = df.drop(columns=num_cols)

                # Standardise
                scaled = StandardScaler().fit_transform(num_data)
                num_data = pd.DataFrame(scaled, columns=num_data.columns, index=num_data.index)

                tmp_df = pd.concat([non_num_data, num_data], axis=1)
            else:
                tmp_df = df
        except Exception:
            self.logger.exception('정규화 진행 중에 문제가 발생하였습니다')
            raise

        return tmp_df

    # Convert non-numeric variables to numeric codes.
    def label_encoder(self, df, obj_var):
        """Integer-encode the columns in ``obj_var`` and pickle the encoders.

        One ``LabelEncoder`` is fitted per column; the mapping
        ``{column name: encoder}`` is written to ``LABEL_ENCODER_PATH``.

        Args:
            df: Input DataFrame (not modified).
            obj_var: Non-numeric column names; if empty, ``df`` is returned as is.

        Returns:
            DataFrame with the encoded columns first, followed by the others.

        Raises:
            Exception: Re-raises any encoding or file error after logging it.
        """
        self.logger.info('라벨 인코딩 진행')
        try:
            if obj_var:
                obj_cols = list(obj_var)
                obj_data = df.loc[:, obj_cols]
                non_obj_data = df.drop(columns=obj_cols)

                # Encode: one encoder per column, created on first access.
                lbl_en = defaultdict(LabelEncoder)
                obj_data = obj_data.apply(lambda x: lbl_en[x.name].fit_transform(x))

                # Persist the fitted encoders; the context manager closes the file.
                with open(self.LABEL_ENCODER_PATH, 'wb') as encoder_file:
                    pickle.dump(lbl_en, encoder_file)

                tmp_df = pd.concat([obj_data, non_obj_data], axis=1)

            else:
                tmp_df = df

        except Exception:
            self.logger.exception('수치형 변환 중에 문제가 발생하였습니다')
            raise

        return tmp_df

    def get_df(self):
        """Re-attach the separated target/date/store columns to ``self.df``.

        Returns:
            The combined DataFrame (also stored in ``self.df``).
        """
        self.df = pd.concat([self.df, self.tds_df], axis=1)

        self.logger.info('전처리 완료')
        self.logger.info('\n')
        self.logger.info(self.df.head())

        return self.df

    def ts_preprocess(self, data, target_var, date_var, store_list, unit):
        """Prepare the frame for time-series models.

        Store columns are cast to ``str``, the date column is parsed into
        timezone-naive datetimes, and the rows are regrouped per series (one
        series per distinct combination of the store columns, in order of
        first appearance), sorted by date and numbered with a 0-based
        ``time_idx``.  A ``unit`` column with the day / ISO week / month as a
        string category is added when ``unit`` is one of those values.

        When ``store_list`` has a single entry, a constant ``'dummy'`` column
        is added so every series is identified by two columns.

        Args:
            data: Input DataFrame (not modified; a copy is processed).
            target_var: Target column name (unused here).
            date_var: Date column name.
            store_list: Store / product column names.
            unit: ``'day'``, ``'week'`` or ``'month'``.

        Returns:
            The regrouped DataFrame with a fresh 0..n-1 index.

        Raises:
            Exception: Re-raises any error after logging it, including a date
            column that cannot be parsed.
        """
        self.logger.info('시계열용 전처리 진행')
        try:
            data = data.copy()  # keep the caller's frame untouched
            store_list = list(store_list)

            # Store columns must be strings.
            for store_var in store_list:
                data[store_var] = data[store_var].astype(str)

            try:
                parsed = pd.to_datetime(data[date_var], utc=True)
                # tz_localize(None) drops the UTC marker; astype('datetime64[ns]')
                # on a tz-aware series raises on pandas >= 2.
                data[date_var] = parsed.dt.tz_localize(None)
            except Exception:
                # => to be extended as further date formats come up
                self.logger.error('날짜 변수를 확인해주세요')
                raise

            # A single store column gets a constant companion column.
            if len(store_list) == 1:
                store_list = ['dummy'] + store_list
                data['dummy'] = 'dummy'

            # sort=False keeps the series in order of first appearance.
            frames = []
            for _, series in data.groupby(store_list, sort=False):
                series = series.sort_values(date_var).reset_index(drop=True)
                series['time_idx'] = series.index
                frames.append(series)

            if frames:
                df = pd.concat(frames, axis=0, ignore_index=True)
            else:
                df = data.assign(time_idx=pd.Series(dtype='int64'))

            # Additional calendar feature; categories have to be strings.
            if unit == 'day':
                df[unit] = df[date_var].dt.day.astype(str).astype("category")
            elif unit == 'week':
                df[unit] = df[date_var].dt.isocalendar().week.astype(str).astype("category")
            elif unit == 'month':
                df[unit] = df[date_var].dt.month.astype(str).astype("category")

            self.logger.info('시계열 전처리 후 df')
            self.logger.info(df.head())
        except Exception:
            self.logger.exception('시계열용 전처리 진행 중에 문제가 발생하였습니다')
            raise

        return df

In [7]:
# Run the full preprocessing pipeline through the packaged
# process.preprocess.Preprocessing and keep only the resulting frame.
# NOTE(review): this uses the imported module, not the Preprocessing class
# defined in the cell above — confirm the two definitions are kept in sync.
df = preprocess.Preprocessing(log_name, data, var_list, num_var, obj_var, target_var='sale_qty', date_var= 'sale_dy', store_list=['str_cd','prod_cd'], unit='day').df

결측치 처리
결측치 처리 이후 데이터 구성: 115050 행, 39열
전처리를 위한 target, date, store 분리
정규화 진행
라벨 인코딩 진행
전처리 완료


   str_nm  l1_nm  prod_nm  evt_dur_div  avg_tmpr_val  search_cnt     l1_cd  \
0       0      0        0            4     -1.184225    0.228434 -1.160689   
1       0      0        0            4     -1.001111    0.897404 -1.160689   
2       0      0        0            4     -1.001111    0.395233 -1.160689   
3       0      0        0            4     -0.783086    0.061635 -1.160689   
4       0      0        0            4     -0.896505    0.562032 -1.160689   

   dc_stk_qty  dyoff_dy  holidy_cd  ...  alight_pasgr_num   tot_pop  male_pop  \
0   -0.165029       0.0   2.612763  ...         -1.249877 -1.789319  -1.74308   
1   -0.165029       0.0  -0.382737  ...         -1.249877 -1.789319  -1.74308   
2   -0.165029       0.0  -0.382737  ...         -1.249877 -1.789319  -1.74308   
3   -0.165029       0.0  -0.382737  ...         -1.249877 -1.789319  -1.74308   
4   -0.165029       0.0  -0.38

In [9]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
import matplotlib.pyplot as plt
import joblib
import json
import glob
import logging
from datetime import timedelta

#prophet
from prophet import Prophet
from prophet.diagnostics import cross_validation
from prophet.diagnostics import performance_metrics
import optuna
from optuna.samplers import TPESampler

#from . import hpo


class Modeling:
    """Fit one Prophet model per series and produce validation and future forecasts.

    Constructing an instance runs ``fb_fit_predict`` and stores its return
    value (the validation frame) in ``self.val_df``.
    """

    # pandas.DateOffset keyword that advances a date by one `unit`.
    _UNIT_TO_OFFSET_ARG = {'day': 'days', 'week': 'weeks', 'month': 'months'}

    # Prophet parameters used when hyper-parameter optimisation is disabled.
    _DEFAULT_PARAMS = {'changepoint_prior_scale': 1.8, 'changepoint_range': 0.8, 'seasonality_prior_scale': 7.3, 'holidays_prior_scale': 6, 'seasonality_mode': 'multiplicative', 'weekly_seasonality': 5, 'yearly_seasonality': 18}

    def __init__(self, log_name, df, target_var, date_var, store_list, unit, predict_n, HPO):
        """Store the configuration and run the fit / predict loop.

        Args:
            log_name: Name of the logger obtained via ``logging.getLogger``.
            df: Preprocessed DataFrame.
            target_var: Target column name.
            date_var: Date column name.
            store_list: Column names identifying one series (store / product).
            unit: Time unit, one of ``'day'``, ``'week'``, ``'month'``.
            predict_n: Number of periods held out for validation and forecast
                into the future.
            HPO: Whether to run hyper-parameter optimisation.
        """
        self.df = df                                           # data
        self.target_var = target_var                           # target variable
        self.date_var = date_var                               # date variable
        self.store_list = store_list                           # store (product) variables
        self.unit = unit                                       # time unit
        self.predict_n = predict_n                             # forecast horizon
        self.HPO = HPO                                         # hyper-parameter tuning on/off

        self.logger = logging.getLogger(log_name)

        self.val_df = self.fb_fit_predict(self.df, self.target_var, self.date_var, self.store_list, self.unit, self.predict_n, self.HPO)

    @staticmethod
    def _future_dates(last_date, unit, predict_n):
        """Return the ``predict_n`` dates following ``last_date``, one per ``unit``.

        Months are advanced by calendar month (not by a fixed 30 days), so
        monthly series keep their day of month where the month allows it.
        """
        offset_arg = Modeling._UNIT_TO_OFFSET_ARG[unit]
        return [last_date + pd.DateOffset(**{offset_arg: step}) for step in range(1, predict_n + 1)]

    def fb_fit_predict(self, df, target_var, date_var, store_list, unit, predict_n, HPO):
        """Fit Prophet per series, validate on the last ``predict_n`` rows and forecast.

        For every distinct combination of the ``store_list`` columns (in order
        of first appearance) the series is split into a training part and the
        last ``predict_n`` rows; a model fitted on the training part predicts
        the held-out rows and the ``predict_n`` periods after the series' last
        row.  Rows of each series are assumed to be in chronological order.

        Side effects: writes ``train_df.csv``, ``val_df.csv`` and
        ``pred_df.csv`` to the current working directory.

        Args:
            df: Preprocessed DataFrame (not modified).
            target_var: Target column name.
            date_var: Date column name.
            store_list: Store / product column names; a single name is paired
                with a constant ``'dummy'`` column.
            unit: ``'day'``, ``'week'`` or ``'month'``.
            predict_n: Positive number of periods to hold out and to forecast.
            HPO: When truthy, parameters come from ``hpo.HyperOptimization``.

        Returns:
            DataFrame with the validation predictions (``ds``, ``yhat``), the
            actual values and the store columns of every series.

        Raises:
            ValueError: If ``unit`` is not supported or ``predict_n`` < 1.
        """
        self.logger.info('fbprophet 데이터 준비')
        self.logger.debug(f'store_list: {store_list}, HPO: {HPO}')

        # Fail before any model is fitted instead of part-way through the loop.
        if unit not in self._UNIT_TO_OFFSET_ARG:
            raise ValueError(f"unit must be one of {sorted(self._UNIT_TO_OFFSET_ARG)}, got {unit!r}")
        if predict_n < 1:
            raise ValueError(f'predict_n must be a positive integer, got {predict_n!r}')

        if len(store_list) == 1:
            store_list = ['dummy'] + list(store_list)
            df = df.assign(dummy='dummy')  # new frame: the caller's df stays untouched

        train_parts, val_parts, pred_parts = [], [], []

        # sort=False keeps the series in order of first appearance.
        for store_values, series_df in df.groupby(list(store_list), sort=False, observed=True):
            target = series_df[target_var]
            # assign() returns a new frame, so the added columns never write
            # into a slice of df.
            fb_df = series_df.assign(
                ds=series_df[date_var],
                y=target,
                cap=np.max(target.values),
                floor=np.min(target.values),
            )

            fb_train = fb_df.iloc[:-predict_n, :]
            fb_var = fb_df.iloc[-predict_n:, :]

            if HPO:
                self.logger.info('fb HPO 진행')
                # NOTE(review): `hpo` must be in scope here; its import at the
                # top of this cell is commented out — confirm where it comes from.
                parameters = hpo.HyperOptimization(train = fb_train, valid = fb_var, model = 'fb').best_params
                self.logger.info(f'fb HPO 진행 후 parameters: {parameters}')
            else:
                parameters = dict(self._DEFAULT_PARAMS)

            # Fit on the training part and predict the held-out rows.
            m = Prophet(**parameters)
            m.fit(fb_train[['y','ds','cap','floor']], algorithm='Newton')
            val_preds = m.predict(fb_var[['ds','cap','floor']])[['ds','yhat']]
            val_real = fb_var[['y', date_var]]
            val_preds_df = pd.merge(val_preds, val_real, left_on='ds', right_on=date_var, how='inner')

            # Future dates start right after the last row of the series.
            last_date = fb_df[date_var].iloc[-1]
            test_df = pd.DataFrame({'ds': self._future_dates(last_date, unit, predict_n)})
            test_df['cap'] = fb_df['cap'].values[0]
            test_df['floor'] = fb_df['floor'].values[0]
            preds = m.predict(test_df[['ds','cap','floor']])[['ds','yhat']].copy()

            # Tag every output with the series identifiers.
            train = fb_train[['y','ds']].copy()
            for store_col, store_value in zip(store_list, store_values):
                train[store_col] = store_value
                val_preds_df[store_col] = store_value
                preds[store_col] = store_value

            train_parts.append(train)
            val_parts.append(val_preds_df)
            pred_parts.append(preds)

        # One concat per output instead of growing the frames inside the loop.
        train_df = pd.concat(train_parts, axis=0) if train_parts else pd.DataFrame()
        val_df = pd.concat(val_parts, axis=0) if val_parts else pd.DataFrame()
        pred_df = pd.concat(pred_parts, axis=0) if pred_parts else pd.DataFrame()

        train_df.to_csv('train_df.csv', index=False)
        val_df.to_csv('val_df.csv', index=False)
        pred_df.to_csv('pred_df.csv', index=False)

        return val_df