In [2]:
!pip install xgboost



In [3]:
import xgboost as xgb
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings

In [4]:
# 경고 끄기
pd.set_option('mode.chained_assignment', None)
warnings.filterwarnings(action='ignore')

In [5]:
def preprocessing(temp_df, pum, len_lag) :
    # p_lag, q_lag 추가
    for lag in range(1,len_lag+1) :
      temp_df[f'p_lag_{lag}'] = -1
      temp_df[f'q_lag_{lag}'] = -1
      for index in range(lag, len(temp_df)) :
        temp_df.loc[index, f'p_lag_{lag}'] = temp_df[f'{pum}_가격(원/kg)'][index-lag] #1일전, 2일전, ... 가격을 feature로 추가
        temp_df.loc[index, f'q_lag_{lag}'] = temp_df[f'{pum}_거래량(kg)'][index-lag] #1일전, 2일전, ... 거래량을 feature로 추가

    # month 추가
    temp_df['date'] = pd.to_datetime(temp_df['date'])
    temp_df['month'] = temp_df['date'].dt.month

    # 예측 대상(1w,2w,4w) 추가
    for week in ['1_week','2_week','4_week'] :
      temp_df[week] = 0
      n_week = int(week[0])
      for index in range(len(temp_df)) :
        try : temp_df[week][index] = temp_df[f'{pum}_가격(원/kg)'][index+7*n_week]
        except : continue

    # 불필요한 column 제거        
    temp_df = temp_df.drop(['date',f'{pum}_거래량(kg)',f'{pum}_가격(원/kg)'], axis=1)
    
    return temp_df

In [6]:
train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Data/Dacon_data/public_data/train.csv')
train.head()

Unnamed: 0,date,요일,배추_거래량(kg),배추_가격(원/kg),무_거래량(kg),무_가격(원/kg),양파_거래량(kg),양파_가격(원/kg),건고추_거래량(kg),건고추_가격(원/kg),마늘_거래량(kg),마늘_가격(원/kg),대파_거래량(kg),대파_가격(원/kg),얼갈이배추_거래량(kg),얼갈이배추_가격(원/kg),양배추_거래량(kg),양배추_가격(원/kg),깻잎_거래량(kg),깻잎_가격(원/kg),시금치_거래량(kg),시금치_가격(원/kg),미나리_거래량(kg),미나리_가격(원/kg),당근_거래량(kg),당근_가격(원/kg),파프리카_거래량(kg),파프리카_가격(원/kg),새송이_거래량(kg),새송이_가격(원/kg),팽이버섯_거래량(kg),팽이버섯_가격(원/kg),토마토_거래량(kg),토마토_가격(원/kg),청상추_거래량(kg),청상추_가격(원/kg),백다다기_거래량(kg),백다다기_가격(원/kg),애호박_거래량(kg),애호박_가격(원/kg),캠벨얼리_거래량(kg),캠벨얼리_가격(원/kg),샤인마스캇_거래량(kg),샤인마스캇_가격(원/kg)
0,2016-01-01,금요일,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2016-01-02,토요일,80860.0,329.0,80272.0,360.0,122787.5,1281.0,3.0,11000.0,15019.0,5475.0,92334.0,1704.0,6359.0,1331.0,40028.0,348.0,4374.9,13242.0,16550.5,2339.0,10528.0,1729.0,13885.0,804.0,3853.0,3703.0,15797.0,2576.0,14634.0,1474.0,30950.0,1621.0,5125.0,9235.0,434.0,2109.0,19159.0,2414.0,880.0,2014.0,0.0,0.0
2,2016-01-03,일요일,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2016-01-04,월요일,1422742.5,478.0,1699653.7,382.0,2315079.0,1235.0,699.0,4464.0,141638.0,5210.0,994328.1,1716.0,262615.5,1212.0,1074699.1,345.0,122613.5,9923.0,427435.1,2153.0,82113.5,3960.0,558950.2,794.0,104930.3,4871.0,277326.5,2440.0,159800.0,1750.0,291057.0,1834.0,38525.5,7631.0,500702.0,2046.0,620539.0,2018.0,2703.8,3885.0,0.0,0.0
4,2016-01-05,화요일,1167241.0,442.0,1423482.3,422.0,2092960.1,1213.0,1112.6,4342.0,126207.8,5387.0,787716.0,1715.0,221850.5,1197.0,825681.9,350.0,79055.9,9529.0,334636.8,2220.0,80144.0,3333.0,444353.7,763.0,100699.5,5129.0,218465.2,2437.0,153084.0,1822.0,194626.5,1833.0,32615.0,6926.0,147638.0,2268.0,231958.0,2178.0,8810.0,2853.0,0.0,0.0


In [7]:
# preprocessing 함수 예시
pum = '배추'
temp_df = train[['date',f'{pum}_거래량(kg)', f'{pum}_가격(원/kg)']]
preprocessing(temp_df, pum, len_lag=28)

Unnamed: 0,p_lag_1,q_lag_1,p_lag_2,q_lag_2,p_lag_3,q_lag_3,p_lag_4,q_lag_4,p_lag_5,q_lag_5,p_lag_6,q_lag_6,p_lag_7,q_lag_7,p_lag_8,q_lag_8,p_lag_9,q_lag_9,p_lag_10,q_lag_10,p_lag_11,q_lag_11,p_lag_12,q_lag_12,p_lag_13,q_lag_13,p_lag_14,q_lag_14,p_lag_15,q_lag_15,p_lag_16,q_lag_16,p_lag_17,q_lag_17,p_lag_18,q_lag_18,p_lag_19,q_lag_19,p_lag_20,q_lag_20,p_lag_21,q_lag_21,p_lag_22,q_lag_22,p_lag_23,q_lag_23,p_lag_24,q_lag_24,p_lag_25,q_lag_25,p_lag_26,q_lag_26,p_lag_27,q_lag_27,p_lag_28,q_lag_28,month,1_week,2_week,4_week
0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1,420,449,625
1,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1,389,454,733
2,329.0,80860.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1,0,0,1048
3,0.0,0.0,329.0,80860.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1,398,475,638
4,478.0,1422742.5,0.0,0.0,329.0,80860.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1,431,511,597
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1728,1807.0,2007471.3,1838.0,1757465.6,1813.0,2046286.3,2925.0,1959.0,1812.0,1346091.4,1839.0,1624514.7,1983.0,1459130.1,1939.0,1532777.2,2017.0,1341387.6,2042.0,1221538.6,0.0,0.0,1748.0,968110.0,1576.0,1156811.6,1542.0,1302290.8,1585.0,1181782.2,1994.0,540843.7,1614.0,975020.2,0.0,0.0,1329.0,1104424.8,1358.0,698187.5,1445.0,895628.0,1150.0,1144746.8,1093.0,1279591.6,1133.0,1441152.8,0.0,0.0,1476.0,760499.0,1564.0,763266.0,1561.0,1020033.2,9,0,0,0
1729,1839.0,1856965.0,1807.0,2007471.3,1838.0,1757465.6,1813.0,2046286.3,2925.0,1959.0,1812.0,1346091.4,1839.0,1624514.7,1983.0,1459130.1,1939.0,1532777.2,2017.0,1341387.6,2042.0,1221538.6,0.0,0.0,1748.0,968110.0,1576.0,1156811.6,1542.0,1302290.8,1585.0,1181782.2,1994.0,540843.7,1614.0,975020.2,0.0,0.0,1329.0,1104424.8,1358.0,698187.5,1445.0,895628.0,1150.0,1144746.8,1093.0,1279591.6,1133.0,1441152.8,0.0,0.0,1476.0,760499.0,1564.0,763266.0,9,0,0,0
1730,1789.0,1880095.5,1839.0,1856965.0,1807.0,2007471.3,1838.0,1757465.6,1813.0,2046286.3,2925.0,1959.0,1812.0,1346091.4,1839.0,1624514.7,1983.0,1459130.1,1939.0,1532777.2,2017.0,1341387.6,2042.0,1221538.6,0.0,0.0,1748.0,968110.0,1576.0,1156811.6,1542.0,1302290.8,1585.0,1181782.2,1994.0,540843.7,1614.0,975020.2,0.0,0.0,1329.0,1104424.8,1358.0,698187.5,1445.0,895628.0,1150.0,1144746.8,1093.0,1279591.6,1133.0,1441152.8,0.0,0.0,1476.0,760499.0,9,0,0,0
1731,1760.0,1661090.9,1789.0,1880095.5,1839.0,1856965.0,1807.0,2007471.3,1838.0,1757465.6,1813.0,2046286.3,2925.0,1959.0,1812.0,1346091.4,1839.0,1624514.7,1983.0,1459130.1,1939.0,1532777.2,2017.0,1341387.6,2042.0,1221538.6,0.0,0.0,1748.0,968110.0,1576.0,1156811.6,1542.0,1302290.8,1585.0,1181782.2,1994.0,540843.7,1614.0,975020.2,0.0,0.0,1329.0,1104424.8,1358.0,698187.5,1445.0,895628.0,1150.0,1144746.8,1093.0,1279591.6,1133.0,1441152.8,0.0,0.0,9,0,0,0


In [8]:
def nmae(week_answer, week_submission):
    answer = week_answer
    target_idx = np.where(answer!=0)
    true = answer[target_idx]
    pred = week_submission[target_idx]
    score = np.mean(np.abs(true-pred)/true)
    
    return score


def at_nmae(pred, dataset):
    y_true = dataset.get_label()
    week_1_answer = y_true[0::3]
    week_2_answer = y_true[1::3]
    week_4_answer = y_true[2::3]
    
    week_1_submission = pred[0::3]
    week_2_submission = pred[1::3]
    week_4_submission = pred[2::3]
    
    score1 = nmae(week_1_answer, week_1_submission)
    score2 = nmae(week_2_answer, week_2_submission)
    score4 = nmae(week_4_answer, week_4_submission)
    
    score = (score1+score2+score4)/3
    
    return 'score', score, False



In [9]:
def model_train(x_train, y_train, x_valid, y_valid) :
    
    params = {
        # Parameters that we are going to tune
        'max_depth':6,
        'min_child_weight': 1,
        'eta':.3,
        'subsample': 1,
        'colsample_bytree': 1,
        # Other parameters
        'objective':'reg:linear',
            }

    model = xgb.train(params, 
                   train_set = xgb.DMatrix(data = x_train, label = y_train),
                   num_boost_round = 10000, 
                   valid_sets = xgb.DMatrix(data = x_valid, label = y_valid), 
                   init_model = None, 
                   early_stopping_rounds = 100,
                   feval = at_nmae,
                   verbose_eval = False
                    )
    
    return model

In [10]:
unique_pum = [
    '배추', '무', '양파', '건고추','마늘',
    '대파', '얼갈이배추', '양배추', '깻잎',
    '시금치', '미나리', '당근',
    '파프리카', '새송이', '팽이버섯', '토마토',
]

unique_kind = [
    '청상추', '백다다기', '애호박', '캠벨얼리', '샤인마스캇'
]

In [13]:
model_dict = {}
split = 28 #validation

for pum in tqdm(unique_pum + unique_kind):
    # 품목 품종별 전처리
    temp_df = train[['date',f'{pum}_거래량(kg)', f'{pum}_가격(원/kg)']]
    temp_df = preprocessing(temp_df, pum, len_lag=28)
    
    # 주차별(1,2,4w) 학습
    for week_num in [1,2,4] :
        x = temp_df[temp_df[f'{week_num}_week']>0].iloc[:,:-3]
        y = temp_df[temp_df[f'{week_num}_week']>0][f'{week_num}_week']
        
        #train, test split
        x_train = x[:-split]
        y_train = y[:-split]
        x_valid = x[-split:]
        y_valid = y[-split:]

       
        
        model_dict[f'{pum}_model_{week_num}'] = model_train(x_train, y_train,x_valid,y_valid)

  0%|          | 0/21 [00:30<?, ?it/s]


TypeError: ignored