# import

In [1]:
import random
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [2]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Hyperparameters setting

In [3]:
CFG = {
    'TRAIN_WINDOW_SIZE':90, # 90일치로 학습
    'PREDICT_SIZE':21,      # 21일치 예측
    'EPOCHS':20,
    'LEARNING_RATE':1e-4,
    'BATCH_SIZE':2048,
    'SEED':41
}

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

# Data Preprocessing And Normalization

In [5]:
train_data = pd.read_csv('./open/train.csv').drop(columns=['ID', '제품'])

In [6]:
# train_data 정규화
numeric_cols = train_data.columns[4:]

# 칵 column의 min 및 max 계산
min_values = train_data[numeric_cols].min(axis=1)
max_values = train_data[numeric_cols].max(axis=1)


# 각 행의 범위(max-min)를 계산하고, 범위가 0인 경우 1로 대체
ranges = max_values - min_values
ranges[ranges == 0] = 1

# min-max scaling 수행
train_data[numeric_cols] = (train_data[numeric_cols].subtract(min_values, axis=0)).div(ranges, axis=0)

# 추후 추론 결과를 inverse scaling 하기위해 max와 min 값을 dictionary 형태로 저장
scale_min_dict = min_values.to_dict()
scale_max_dict = max_values.to_dict()

In [9]:
sales_data = pd.read_csv("./open/sales.csv").drop(columns=['ID', '제품', '대분류', '중분류', '소분류', '브랜드'])

In [10]:
# sales_data 정규화
numeric_cols = sales_data.columns

# 칵 column의 min 및 max 계산
min_values = sales_data[numeric_cols].min(axis=1)
max_values = sales_data[numeric_cols].max(axis=1)

# 각 행의 범위(max-min)를 계산하고, 범위가 0인 경우 1로 대체
ranges = max_values - min_values
ranges[ranges == 0] = 1

# min-max scaling 수행
sales_data[numeric_cols] = (sales_data[numeric_cols].subtract(min_values, axis=0)).div(ranges, axis=0)

In [12]:
brand_keyword = pd.read_csv("./open/brand_keyword_cnt.csv")

In [13]:
# 결측치 0으로 대체
brand_keyword.fillna(0, inplace=True)

# traindata의 n 번째 브랜드에 브랜드 언급량을 추가하기 위해 brand 선언(추후 데이터를 합칠때를 대비해 numpy로 선언)
brand = np.zeros((len(train_data), len(brand_keyword.iloc[1, 1:])))

for i in range(len(train_data)):
    idx = brand_keyword['브랜드']==train_data.loc[i, '브랜드']
    brand[i]=brand_keyword[idx].drop(columns='브랜드')
    
    # min-max scaling 수행
    max_val = max(brand[i])
    min_val = min(brand[i])
    
    if max_val == min_val:
        brand[i] = brand[i, :]*0
    else:
        brand[i] = (brand[i, :]-min_val)/(max_val - min_val)
    

In [44]:
# train_data는 대분류, 중분류, 소분류, 브랜드 데이터가 포함되어있음
train_data.shape, sales_data.shape, brand.shape 

((15890, 463), (15890, 459), (15890, 459))

In [33]:
# Label Encoding
label_encoder = LabelEncoder()
categorical_columns = ['대분류', '중분류', '소분류', '브랜드']

# train_data의 제품코드를 숫자로 변환
for col in categorical_columns:
    label_encoder.fit(train_data[col])
    train_data[col] = label_encoder.transform(train_data[col])

# train_data(일별 판매량)과
# sales_data(일별 총 판매금액)과
# brand(브랜드 언급량) 합치기

In [34]:
def make_train_data(data, sales, brand, train_size=CFG['TRAIN_WINDOW_SIZE'], predict_size=CFG['PREDICT_SIZE']):
    '''
    column에 존재하는 날짜를 row로 Transpose
    train_data, sales_data, brand 데이터 결합
    학습 데이터와 예측 결과 데이터를 생성
    
    학습 데이터는 1개의 row에 모든 데이터에
    대분류, 중분류, 소분류, 브랜드, 일별 판매량, 판매금액, 브랜드 언급량이 포함된다.
    
    예측데이터는 일별 판매량 데이터만 포함되어있다.
    
    '''
    num_rows = len(data)
    window_size = train_size + predict_size
    
    input_data = np.empty((num_rows * (len(data.columns) - window_size + 1), len(data.iloc[0, :4]) + 3, train_size))
    target_data = np.empty((num_rows * (len(data.columns) - window_size + 1), predict_size))
    
    for i in tqdm(range(num_rows)):
        encode_info = np.array(data.iloc[i, :4])
        sales_data = np.array(data.iloc[i, 4:])
        total_sales = np.array(sales.iloc[i])
        brand_data = brand[i]
        
        for j in range(len(sales_data) - window_size + 1):
            window = sales_data[j : j + window_size] 
            input_data[i * (len(data.columns) - window_size + 1) + j][:4] = np.tile(np.array([encode_info]).T, (1, train_size))
            input_data[i * (len(data.columns) - window_size + 1) + j][4]= brand_data[j :j + train_size]
            input_data[i * (len(data.columns) - window_size + 1) + j][5] = window[:train_size]
            input_data[i * (len(data.columns) - window_size + 1) + j][6] = total_sales[j : j+train_size]
            target_data[i * (len(data.columns) - window_size + 1) + j] = window[train_size:]
    
    return input_data, target_data

In [35]:
def make_predict_data(data, sales, brand, train_size=CFG['TRAIN_WINDOW_SIZE']):
    '''
    평가 데이터(Test Dataset)를 추론하기 위한 데이터를 생성
    '''
    num_rows = len(data)
    
    input_data = np.empty((num_rows, len(data.iloc[0, :4]) + 3, train_size))
    
    for i in tqdm(range(num_rows)):
        encode_info = np.array(data.iloc[i, :4])
        sales_data = np.array(data.iloc[i, -train_size:])
        brand_data = brand[i][-train_size:]
        total_sales = np.array(sales.iloc[i][-train_size:])
        window = sales_data[-train_size : ]
        
        input_data[i][:4]=np.tile(np.array([encode_info]).T, (1, train_size))
        input_data[i][4]=brand_data[:train_size]
        input_data[i][5]=window[:train_size]
        input_data[i][6] = total_sales[:train_size]
    
    return input_data

In [36]:
train_input, train_target = make_train_data(train_data, sales_data, brand)
test_input = make_predict_data(train_data, sales_data, brand)

  0%|          | 0/15890 [00:00<?, ?it/s]

  0%|          | 0/15890 [00:00<?, ?it/s]

In [37]:
# Train / Validation Split
data_len = len(train_input)
val_input = train_input[-int(data_len*0.2):]
val_target = train_target[-int(data_len*0.2):]
train_input = train_input[:-int(data_len*0.2)]
train_target = train_target[:-int(data_len*0.2)]

In [38]:
train_input.shape, train_target.shape, val_input.shape, val_target.shape, test_input.shape

((4487336, 7, 90),
 (4487336, 21),
 (1121834, 7, 90),
 (1121834, 21),
 (15890, 7, 90))

# 학습 준비

In [45]:
class CustomDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y
        
    def __getitem__(self, index):
        if self.Y is not None:
            return torch.Tensor(self.X[index]), torch.Tensor(self.Y[index])
        return torch.Tensor(self.X[index])
    
    def __len__(self):
        return len(self.X)

In [46]:
train_dataset = CustomDataset(train_input, train_target)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

val_dataset = CustomDataset(val_input, val_target)
val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

# 모델 선언

In [47]:
class BaseModel(nn.Module):
    def __init__(self, input_size=1, hidden_size=512, output_size=CFG['PREDICT_SIZE']):
        super(BaseModel, self).__init__()
        self.hidden_size = hidden_size
        
        # 7개의 CNN과 이 결과를 다시 3개로 만들어주는 CNN, 그리고 마지막 3개의 데이터를 1개로 묶어주는 CNN 
        self.c1 = nn.Conv1d(in_channels=7, out_channels=1, kernel_size=5, stride=1)
        self.c2 = nn.Conv1d(in_channels=7, out_channels=1, kernel_size=5, stride=1)
        self.c3 = nn.Conv1d(in_channels=7, out_channels=1, kernel_size=5, stride=1)
        self.c4 = nn.Conv1d(in_channels=7, out_channels=1, kernel_size=5, stride=1)
        self.c5 = nn.Conv1d(in_channels=7, out_channels=1, kernel_size=5, stride=1)
        self.c6 = nn.Conv1d(in_channels=7, out_channels=1, kernel_size=5, stride=1)
        self.c7 = nn.Conv1d(in_channels=7, out_channels=1, kernel_size=5, stride=1)
        self.combination1 = nn.Conv1d(in_channels=7, out_channels=1, kernel_size=3, stride=1)
        self.combination2 = nn.Conv1d(in_channels=7, out_channels=1, kernel_size=3, stride=1)
        self.combination3 = nn.Conv1d(in_channels=7, out_channels=1, kernel_size=3, stride=1)
        self.last = nn.Conv1d(in_channels=3, out_channels=1, kernel_size=3, stride=1)
        
        
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, hidden_size//2),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(hidden_size//2, output_size)
        )
            
        self.actv = nn.ReLU()
    
    def forward(self, x):
        # x shape: (Batch_size, 7, TRAIN_WINDOW_SIZE)
        batch_size = x.size(0)
        hidden = self.init_hidden(batch_size, x.device)
        
        #Convolution Layer
        conv1 = self.c1(x)
        conv2 = self.c2(x)
        conv3 = self.c3(x)
        conv4 = self.c4(x)
        conv5 = self.c5(x)
        conv6 = self.c6(x)
        conv7 = self.c7(x)
        
        combined_output = torch.cat([conv1, conv2, conv3, conv4, conv5, conv6, conv7], dim=1)
        
        comb1= self.combination1(combined_output)
        comb2= self.combination2(combined_output)
        comb3= self.combination3(combined_output)
        
        last_output = torch.cat([comb1, comb2, comb3], dim=1)
        
        x = self.last(last_output)
        
        
        # LSTM layer
        lstm_out, hidden = self.lstm(x.view(len(x), len(x[0][0]), -1), hidden)
        
        # Only use the last output sequence
        last_output = lstm_out[:, -1, :]
        
        # Fully connected layer
        output = self.actv(self.fc(last_output))
        
        return output.squeeze(1)
    
    def init_hidden(self, batch_size, device):
        # Initialize hidden state and cell state
        return (torch.zeros(1, batch_size, self.hidden_size, device=device),
                torch.zeros(1, batch_size, self.hidden_size, device=device))

# 모델 학습

In [48]:
def train(model, optimizer, train_loader, val_loader, device):
    model.to(device)
    criterion = nn.MSELoss().to(device)
    best_loss = 9999999
    best_model = None
    
    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        train_mae = []
        for X, Y in tqdm(iter(train_loader)):
            X = X.to(device)
            Y = Y.to(device)
            
            optimizer.zero_grad()
            
            output = model(X)
            loss = criterion(output, Y)
            
            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())
        
        val_loss = validation(model, val_loader, criterion, device)
        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}]')
        
        if best_loss > val_loss:
            best_loss = val_loss
            best_model = model
            print('Model Saved')
    return best_model

In [49]:
def validation(model, val_loader, criterion, device):
    model.eval()
    val_loss = []
    
    with torch.no_grad():
        for X, Y in tqdm(iter(val_loader)):
            X = X.to(device)
            Y = Y.to(device)
            
            output = model(X)
            loss = criterion(output, Y)
            
            val_loss.append(loss.item())
    return np.mean(val_loss)

# 학습 진행

In [None]:
model = BaseModel()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
infer_model = train(model, optimizer, train_loader, val_loader, device)

# 모델 추론

In [None]:
test_dataset = CustomDataset(test_input, None)
test_loader = DataLoader(test_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

In [None]:
def inference(model, test_loader, device):
    predictions = []
    
    with torch.no_grad():
        for X in tqdm(iter(test_loader)):
            X = X.to(device)
            
            output = model(X)
            
            # 모델 출력인 output을 CPU로 이동하고 numpy 배열로 변환
            output = output.cpu().numpy()
            
            predictions.extend(output)
    
    return np.array(predictions)

In [None]:
pred = inference(infer_model, test_loader, device)

In [None]:
# 추론 결과를 inverse scaling
for idx in range(len(pred)):
    pred[idx, :] = pred[idx, :] * (scale_max_dict[idx] - scale_min_dict[idx]) + scale_min_dict[idx]
    
# 결과 후처리
pred = np.round(pred, 0).astype(int)

In [None]:
pred.shape

# 제출 파일로 변환

In [None]:
submit = pd.read_csv('./open/sample_submission.csv')
submit.head()

In [None]:
submit.to_csv('./last10.csv', index=False)