<a href="https://colab.research.google.com/github/seeedata/Dacon/blob/main/%EC%98%A8%EB%9D%BC%EC%9D%B8%20%EC%B1%84%EB%84%90%20%EC%A0%9C%ED%92%88%20%ED%8C%90%EB%A7%A4%EB%9F%89%20%EC%98%88%EC%B8%A1%20AI%20%EC%98%A8%EB%9D%BC%EC%9D%B8%20%ED%95%B4%EC%BB%A4%ED%86%A4/0812%20%EB%B8%8C%EB%9E%9C%EB%93%9C%20%EC%96%B8%EA%B8%89%EB%9F%89%20%EC%A0%84%EC%B2%B4%ED%8F%89%EA%B7%A0%20%EC%B6%94%EA%B0%80.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Version 1.3
- 전체 데이터를 하나의 모델로 학습하며, Loss Function을 MSE로 두고 이를 낮추는 방향으로 학습.

#### Dataset Info.

- train.csv [파일]
> - ID : 실제 판매되고 있는 고유 ID
> - 제품 : 제품 코드
> - 대분류 : 제품의 대분류 코드
> - 중분류 : 제품의 중분류 코드
> - 소분류 : 제품의 소분류 코드
> - 브랜드 : 제품의 브랜드 코드
> - **2022-01-01 ~ 2023-04-04 : 실제 일별 판매량**
>		 단, 제품이 동일하여도 판매되고 있는 고유 ID 별로 기재한 분류 정보가 상이할 수 있음. 즉, 고유 ID가 다르다면, 제품이 같더라도 다른 판매 채널


- sample_submission.csv [파일] - 제출 양식
> - ID : 실제 판매되고 있는 고유 ID
		 제출 시 ID Column에 해당하는 데이터에 반드시 zfill(5)를 적용할 필요 없음

> - **2023-04-05 ~ 2023-04-25 : 예측한 일별 판매량**


- sales.csv [파일] - 메타(Meta) 정보
> - ID : 실제 판매되고 있는 고유 ID
> - 제품 : 제품 코드
> - 대분류 : 제품의 대분류 코드
> - 중분류 : 제품의 중분류 코드
> - 소분류 : 제품의 소분류 코드
> - 브랜드 : 제품의 브랜드 코드
> - **2022-01-01 ~ 2023-04-04 : 실제 일별 총 판매금액**
>		 단, 제품이 동일하여도 판매되고 있는 고유 ID 별로 기재한 분류 정보가 상이할 수 있음. 즉, 고유 ID가 다르다면, 제품이 같더라도 다른 판매 채널


- brand_keyword_cnt.csv [파일] - 메타(Meta) 정보
> - 브랜드 : 브랜드 코드
> - 2022-01-01 ~ 2023-04-04 : 브랜드의 연관키워드 언급량을 정규화한 일별 데이터

- product_info.csv [파일] - 메타(Meta) 정보
> - 제품 : 제품 코드
> - 제품특성 : 제품 특성 데이터(Text)
>		 train.csv에 존재하는 모든 제품 코드가 포함되어 있지 않음. 또는 product_info.csv에 존재하는 제품 코드가 train.csv에 존재하지 않을 수 있음.


#### Dataset Load

In [None]:
### Mount Google Drive at /content/drive so the files under MyDrive used below are reachable
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Extract the competition archive from Drive into the current working directory
# (the CSVs are read with relative paths in the next cell).
!unzip /content/drive/MyDrive/LG_AImers_Hackerthon/AImers_data.zip

Archive:  /content/drive/MyDrive/LG_AImers_Hackerthon/AImers_data.zip
  inflating: brand_keyword_cnt.csv   
  inflating: product_info.csv        
  inflating: sales.csv               
  inflating: sample_submission.csv   
  inflating: train.csv               


In [None]:
import pandas as pd
import numpy as np

# --- Raw competition tables (extracted into the working directory) ---
train = pd.read_csv('train.csv')  # daily sales volume per ID
sales = pd.read_csv('sales.csv')  # daily sales amount per ID
brand_keyword = pd.read_csv('brand_keyword_cnt.csv')  # normalised daily keyword mentions per brand
product_info = pd.read_csv('product_info.csv')
submission = pd.read_csv('sample_submission.csv')

# --- Pre-processed tables stored on Drive ---
# According to the original notes these are min-max scaled versions of the tables above;
# the scaling itself is not performed in this notebook.
train_scaled = pd.read_csv('/content/drive/MyDrive/LG_AImers_Hackerthon/train_scaled.csv')  # scaled daily sales volume
brand_keyword_scaled = pd.read_csv('/content/drive/MyDrive/LG_AImers_Hackerthon/brand_keyword_scaled.csv')  # scaled brand keyword mentions

In [None]:
# One row per brand: the brand code and the mean of its scaled daily keyword values
# (every column after the first is averaged; NaNs are skipped by DataFrame.mean).
brand_mean = pd.DataFrame({
    'brand': brand_keyword_scaled['브랜드'],
    'brand_mean': brand_keyword_scaled.iloc[:, 1:].mean(axis = 1),
})

#### Keras GRU, LSTM
- 참고: https://www.kaggle.com/code/humamfauzi/multiple-stock-prediction-using-single-nn#Gated-Recurrent-Units

- 데이터 조정 참고: https://dacon.io/competitions/official/236129/codeshare/8668?page=1&dtype=recent

- Tensorflow Loss Function 참고: https://dacon.io/en/codeshare/4444

In [None]:
#prod_type = [int(k.split('-')[1]) for k in product_info['제품']]
#len(np.unique(prod_type))

In [None]:
# Global hyper-parameters shared by the data builders and the model.
CFG = dict(
    TRAIN_WINDOW_SIZE=60,  # look-back window in days (an earlier note mentions 90; submission 12 used 60)
    PREDICT_SIZE=21,       # forecast horizon in days
    LEARNING_RATE=5e-4,    # submission 12 used 3e-4
    SEED=9083,             # same seed as submission 12
)

In [None]:
### Seed 고정 후 진행
import random
import os
import tensorflow as tf

def seed_everything(seed):
    """Seed Python's ``random``, NumPy and TensorFlow with ``seed``.

    ``seed`` is also written (as a string) to the ``PYTHONHASHSEED``
    environment variable.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)


seed_everything(CFG['SEED'])  # fix all seeds before building data or models

In [None]:
# Train_data 상 3168개 제품 정보 3168개 존재 및 전처리 진행
#prod_type = [int(k.split('-')[1]) for k in train_scaled['제품']]
#train_scaled['제품'] = prod_type
#print(len(train_scaled['제품'].unique()))

In [None]:
### Label-encode the sub-category (소분류) and attach each brand's mean keyword score
from sklearn.preprocessing import LabelEncoder

### Encode over the whole training table so any later subset shares one label mapping.
label_encoder = LabelEncoder()
categorical_columns = ['소분류']

for col in categorical_columns:
    train_scaled[col] = label_encoder.fit_transform(train_scaled[col])

### Look up brand_mean for every training row.
# A left join keeps exactly one output row per training row, in the original order,
# even when a brand is absent from brand_mean (it then gets NaN and is filled below).
# The default inner join would drop such rows, and the index-aligned insert would then
# silently shift the remaining values onto the wrong rows.
# drop_duplicates guards against a brand appearing twice, which would multiply rows.
tmp_br = pd.merge(
    train_scaled[['ID', '브랜드']],
    brand_mean.drop_duplicates(subset = 'brand'),
    left_on = '브랜드',
    right_on = 'brand',
    how = 'left',
)
# Insert positionally (plain array) so the result does not depend on index alignment.
train_scaled.insert(loc = 5, column = 'brand_mean', value = tmp_br['brand_mean'].to_numpy())
train_scaled = train_scaled.drop(['브랜드'], axis = 1)

### Rows without a brand_mean value are filled with the overall mean
train_scaled['brand_mean'] = train_scaled['brand_mean'].fillna(train_scaled['brand_mean'].mean())

# Notebook display only: the result is not assigned, so train_scaled keeps these columns.
train_scaled.drop(['제품','중분류', '대분류'], axis = 1)

#cat = sorted(train_scaled['중분류'].unique())

#for k in range(len(cat)):
#    globals()['set_'+str(k)] = train_scaled[train_scaled['중분류'] == cat[k]].drop(['중분류', '대분류'], axis = 1)

Unnamed: 0,ID,소분류,brand_mean,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,2022-01-06,2022-01-07,...,2023-03-26,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-03-31,2023-04-01,2023-04-02,2023-04-03,2023-04-04
0,0,37,0.177865,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000
1,1,43,0.198931,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.111111,0.333333,0.222222,0.00000,0.00000,0.222222,0.000000
2,2,43,0.198931,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000
3,3,43,0.198931,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000
4,4,2,0.013591,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15885,15885,41,0.306625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000
15886,15886,43,0.306625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.024390,0.000000,0.016260,0.03252,0.00813,0.008130,0.024390
15887,15887,43,0.306625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000
15888,15888,43,0.306625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.142857


In [None]:
### 7, 8, 10, 11번 합쳐서 다시 진행
#set_12 = pd.concat([set_6, set_7, set_9, set_10])

In [None]:
from tqdm.auto import tqdm

### RAM 폭파 대비
def make_train_data(data, train_size=CFG['TRAIN_WINDOW_SIZE'], predict_size=CFG['PREDICT_SIZE'], step_size=2):
    """Build sliding-window training samples from a wide, one-row-per-series table.

    The first two columns of ``data`` are treated as per-row static features and
    every remaining column as one step of that row's series.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric table laid out as described above.
    train_size : int
        Number of steps fed to the model per sample.
    predict_size : int
        Number of steps to predict per sample.
    step_size : int
        Stride between consecutive window starts; larger values produce fewer
        samples and use less memory.

    Returns
    -------
    (input_data, target_data)
        ``input_data`` has shape ``(rows * windows_per_row, train_size, 3)``:
        the two static features repeated on every step plus the series value.
        ``target_data`` has shape ``(rows * windows_per_row, predict_size)``.

    Raises
    ------
    ValueError
        If ``step_size`` is smaller than 1.
    """
    if step_size < 1:
        raise ValueError("step_size must be a positive integer")

    n_static = 2  # leading columns that are static features, not series values
    num_rows = len(data)
    window_size = train_size + predict_size

    # Derive the window count from the exact start positions that will be visited.
    # Computing it from len(data.columns) (which includes the static columns) with a
    # floor division only matched by coincidence and could leave uninitialised
    # np.empty rows in the output, or overrun it, for other sizes or strides.
    n_series_steps = max(data.shape[1] - n_static, 0)
    window_starts = range(0, n_series_steps - window_size + 1, step_size)
    windows_per_row = len(window_starts)  # 0 when the series is shorter than one window

    input_data = np.empty((num_rows * windows_per_row, train_size, n_static + 1))
    target_data = np.empty((num_rows * windows_per_row, predict_size))

    for i in tqdm(range(num_rows)):
        encode_info = np.array(data.iloc[i, :n_static])
        sales_data = np.array(data.iloc[i, n_static:])
        # The static features are identical for every window of this row.
        static_block = np.tile(encode_info, (train_size, 1))

        for k, start in enumerate(window_starts):
            window = sales_data[start: start + window_size]
            slot = i * windows_per_row + k
            input_data[slot] = np.column_stack((static_block, window[:train_size]))
            target_data[slot] = window[train_size:]

    return input_data, target_data

In [None]:
def make_predict_data(data, train_size=CFG['TRAIN_WINDOW_SIZE']):
    """Build the model input used to predict the test period.

    data : table whose first two columns are static features and whose
           trailing columns are the daily values
    train_size : number of most recent daily values taken from each row
                 (the same window length used for training)

    Returns an array of shape ``(len(data), train_size, 3)`` holding, for every
    row, the two static features repeated on each step plus the last
    ``train_size`` daily values.
    """
    n_rows = len(data)
    n_features = len(data.iloc[0, :2]) + 1
    model_input = np.empty((n_rows, train_size, n_features))

    for row in tqdm(range(n_rows)):
        static_part = np.tile(np.array(data.iloc[row, :2]), (train_size, 1))
        recent_values = np.array(data.iloc[row, -train_size:])[:train_size]
        model_input[row] = np.column_stack((static_part, recent_values))

    return model_input

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout, LSTM
from tensorflow.keras.activations import linear
from tensorflow.keras.optimizers import Adam

def GRU_Model(output_window, inputs):
    """Build and compile the GRU regressor, print its summary and return it.

    Parameters
    ----------
    output_window : int
        Number of units in the final Dense layer (one per predicted step).
    inputs : array-like with a ``shape`` attribute
        Sample input batch; only its shape is read. ``shape[1]`` is used as the
        number of time steps and ``shape[2]`` as the number of features.

    Returns
    -------
    The compiled Keras ``Sequential`` model (Adam optimiser with
    ``CFG['LEARNING_RATE']``, mean-squared-error loss).
    """
    # Read the feature count from the data instead of hard-coding 3, so the model
    # keeps working if the window builders add or remove a feature. Fall back to
    # the previous value of 3 when ``inputs`` has no third axis.
    n_features = inputs.shape[2] if len(inputs.shape) > 2 else 3

    opt = tf.keras.optimizers.legacy.Adam(learning_rate = CFG['LEARNING_RATE'])
    model = Sequential()
    model.add(GRU(64, return_sequences = False, input_shape=(inputs.shape[1], n_features)))
    model.add(Dense(32, activation = linear))
    model.add(Dropout(0.2))
    model.add(Dense(output_window, activation = linear))
    model.compile(optimizer = opt, loss='mean_squared_error')
    model.summary()
    return model

#### 위 과정을 한 번에 처리할 수 있는 함수 만들기



In [None]:
import gc

def full_inference(data, output_window = 21, output_file_name = 'set', batch_size = 1024):
    """Train a GRU on ``data``, predict the next window and save predictions and model.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with an ``ID`` column, followed by the columns expected by
        ``make_train_data`` / ``make_predict_data``.
    output_window : int
        Number of steps the model predicts.
    output_file_name : str
        Prefix of the files written to the Drive project folder:
        ``<prefix>_pred.csv`` and ``<prefix>_model.h5``.
    batch_size : int
        Mini-batch size used for training.

    Returns
    -------
    None. Results are written to disk.
    """
    # Drop the identifier once and reuse the result for both builders.
    features = data.drop(['ID'], axis = 1)
    train_input, train_target = make_train_data(features)
    test_input = make_predict_data(features)

    # Train / validation split: the last 20% of the samples are held out.
    # An explicit split index is used because negative slicing breaks when the
    # hold-out size is 0 ([-0:] is everything and [:-0] is empty).
    data_len = len(train_input)
    split_at = data_len - int(data_len * 0.2)
    train_s_input, val_s_input = train_input[:split_at], train_input[split_at:]
    train_s_target, val_s_target = train_target[:split_at], train_target[split_at:]
    validation_data = (val_s_input, val_s_target) if len(val_s_input) else None

    # Build and fit the model; honour the caller's batch_size (it used to be ignored).
    gru_model = GRU_Model(output_window = output_window, inputs = train_input)
    gru_model.fit(train_s_input, train_s_target, validation_data = validation_data, epochs = 10, batch_size = batch_size)

    # Prediction. The CSV keeps its row index on purpose: the notebook later reads it
    # back and relabels all columns with the submission header.
    pred = gru_model.predict(test_input)
    pd.DataFrame(pred).to_csv('/content/drive/MyDrive/LG_AImers_Hackerthon/' + output_file_name + '_pred.csv')
    # The stray space that used to precede the file name has been removed.
    gru_model.save('/content/drive/MyDrive/LG_AImers_Hackerthon/' + output_file_name + '_model.h5')

    # Release the large arrays before returning to keep RAM usage down.
    del features, train_input, train_target, test_input, val_s_input, val_s_target, train_s_input, train_s_target
    gc.collect()

In [None]:
# Train a single model on all rows at once. The product / main-category / mid-category
# columns are dropped first, so only the remaining columns of train_scaled are passed in.
# Outputs are written with the 'all_set' prefix.
full_inference(data = train_scaled.drop(['제품','중분류', '대분류'], axis = 1), output_window = 21, output_file_name = 'all_set')

  0%|          | 0/15890 [00:00<?, ?it/s]

  0%|          | 0/15890 [00:00<?, ?it/s]

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru (GRU)                   (None, 64)                13248     
                                                                 
 dense (Dense)               (None, 32)                2080      
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 21)                693       
                                                                 
Total params: 16,021
Trainable params: 16,021
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


#### Inverse Scaling

In [None]:
import pickle


def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Per-row maxima / minima saved when the data was scaled; used below to undo the scaling.
scale_max_dict = _read_pickle("/content/drive/MyDrive/LG_AImers_Hackerthon/scale_max_dict.pkl")
scale_min_dict = _read_pickle("/content/drive/MyDrive/LG_AImers_Hackerthon/scale_min_dict.pkl")

In [None]:
# Read the saved predictions back and relabel every column with the submission header.
# The CSV was written with its row index, so the first column read here is that index;
# it ends up under the first submission column name.
result = (
    pd.read_csv('/content/drive/MyDrive/LG_AImers_Hackerthon/all_set_pred.csv')
    .set_axis(submission.columns, axis = 1)
)

In [None]:
result

Unnamed: 0,ID,2023-04-05,2023-04-06,2023-04-07,2023-04-08,2023-04-09,2023-04-10,2023-04-11,2023-04-12,2023-04-13,...,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24,2023-04-25
0,0,-0.012654,-0.004525,0.009050,0.014553,0.017212,0.016130,0.017245,0.014998,0.016490,...,0.022452,0.023703,0.021869,0.019815,0.019372,0.020514,0.023977,0.025900,0.026357,0.026306
1,1,0.039265,0.057517,0.080489,0.084257,0.085346,0.085753,0.085901,0.088180,0.089113,...,0.085927,0.086120,0.087116,0.088148,0.088818,0.088804,0.087309,0.087094,0.086825,0.087317
2,2,-0.009164,-0.001212,0.011500,0.016783,0.019675,0.018383,0.018825,0.016204,0.017036,...,0.023338,0.024748,0.022791,0.019934,0.019462,0.020217,0.023481,0.025721,0.026897,0.026274
3,3,-0.009164,-0.001212,0.011500,0.016783,0.019675,0.018383,0.018825,0.016204,0.017036,...,0.023338,0.024748,0.022791,0.019934,0.019462,0.020217,0.023481,0.025721,0.026897,0.026274
4,4,0.002699,0.005098,0.008242,0.009621,0.010560,0.009322,0.012659,0.011657,0.013410,...,0.015840,0.017356,0.017189,0.016695,0.016566,0.015334,0.016145,0.017325,0.018835,0.020747
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15885,15885,-0.005605,0.000318,0.011638,0.016738,0.019420,0.018889,0.019641,0.016749,0.017260,...,0.022911,0.024802,0.022898,0.019810,0.018791,0.019205,0.022352,0.024610,0.025713,0.025413
15886,15886,0.019833,0.023631,0.031067,0.035223,0.037521,0.037332,0.036933,0.033681,0.033064,...,0.038467,0.040036,0.038102,0.034453,0.032826,0.033332,0.036097,0.038761,0.039674,0.038919
15887,15887,-0.008963,-0.000904,0.011881,0.017210,0.020124,0.018906,0.019338,0.016770,0.017555,...,0.023781,0.025233,0.023368,0.020513,0.019984,0.020671,0.023901,0.026178,0.027428,0.026888
15888,15888,0.090300,0.078340,0.068658,0.066020,0.066656,0.066738,0.067806,0.065183,0.062910,...,0.063831,0.064871,0.065150,0.063580,0.061881,0.060242,0.060561,0.061921,0.063191,0.063824


In [None]:
# Inverse scaling: x * (max - min) + min, using the per-row min / max saved at scaling time.
# Work on a copy so the raw predictions in `result` are left untouched.
pred = result.copy()

# The scale dictionaries are looked up by row position (0 .. len(pred) - 1).
row_positions = range(len(pred))
scale_min = np.array([scale_min_dict[pos] for pos in row_positions], dtype = float)
scale_max = np.array([scale_max_dict[pos] for pos in row_positions], dtype = float)

# Every column after the first holds predictions; rescale them all in one vectorised
# step instead of assigning row by row (same arithmetic, far fewer pandas operations).
value_columns = pred.columns[1:]
pred[value_columns] = (
    pred[value_columns].to_numpy() * (scale_max - scale_min)[:, None] + scale_min[:, None]
)

# Round to whole units and store as integers.
round_pred = np.round(pred, 0).astype(int)

In [None]:
# Replace negative values with 0 (clip is applied to every column of the frame).
round_pred = round_pred.clip(lower = 0)

In [None]:
round_pred

Unnamed: 0,ID,2023-04-05,2023-04-06,2023-04-07,2023-04-08,2023-04-09,2023-04-10,2023-04-11,2023-04-12,2023-04-13,...,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24,2023-04-25
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,2,0,0,0,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,3,0,0,1,1,2,1,1,1,1,...,2,2,2,2,2,2,2,2,2,2
4,4,0,0,0,0,0,0,1,1,1,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15885,15885,0,0,2,3,3,3,3,3,3,...,4,4,4,3,3,3,3,4,4,4
15886,15886,2,3,4,4,5,5,5,4,4,...,5,5,5,4,4,4,4,5,5,5
15887,15887,0,0,0,0,1,0,1,0,0,...,1,1,1,1,1,1,1,1,1,1
15888,15888,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [None]:
# Write the final predictions to the submission file, without the DataFrame index.
round_pred.to_csv('submission14.csv', index = False)