In [1]:
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')
from konlpy.tag import Mecab
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rc
rc('font', family='AppleGothic')
plt.rcParams['axes.unicode_minus'] = False
%matplotlib inline

from lightgbm import LGBMRegressor
from xgboost import XGBRFRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from ngboost import NGBRegressor
from catboost import CatBoostRegressor, Pool

In [2]:
# Read the training set, the test set and the sample submission, in that order.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/train.csv',
        './data/test.csv',
        './data/sample_submission.csv',
    )
)

In [3]:
### 1. Convert weekday, date, year, month, day and week

def transform_day_to_num(x):
    """Map a weekday label to an integer code.

    '월' -> 5, '화' -> 4, '수' -> 3, '목' -> 2. Any other value falls
    through to 1 (there is no explicit check for the remaining label).
    """
    for label, code in (('월', 5), ('화', 4), ('수', 3), ('목', 2)):
        if x == label:
            return code
    return 1

# Replace the weekday labels with their integer codes in both frames.
for frame in (train, test):
    frame['요일'] = frame['요일'].apply(transform_day_to_num)

In [4]:
# Parse the date column and derive year / month / day / ISO-week features
# for both frames.
for frame in (train, test):
    frame['일자'] = pd.to_datetime(frame['일자'])
    dates = frame['일자'].dt

    frame['년'] = dates.year
    frame['월'] = dates.month
    frame['일'] = dates.day
    # `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0.
    # `isocalendar().week` gives the same ISO week number but as nullable
    # UInt32, so cast back to a plain integer column.
    # NOTE(review): the cast assumes the date column has no missing values.
    frame['주'] = dates.isocalendar().week.astype('int64')

In [5]:
# Derive a COVID stage from the number of employees working from home

def transform_corona(x):
    """Bucket a work-from-home headcount into a stage code.

    Returns 3 when x < 1, 2 when x < 134, 1 when x < 221, otherwise 0.
    """
    for upper_bound, stage in ((1, 3), (134, 2), (221, 1)):
        if x < upper_bound:
            return stage
    return 0

# Add the stage code computed from the work-from-home headcount to both frames.
for frame in (train, test):
    frame['코로나단계'] = frame['현본사소속재택근무자수'].apply(transform_corona)

In [6]:
# Vacation ratio: employees on leave as a percentage of total headcount

for frame in (train, test):
    frame['휴가비율'] = frame['본사휴가자수'] / frame['본사정원수'] * 100

def transform_rest(x):
    """Bucket a vacation percentage into a code from 0 to 3.

    Returns 0 below 2.569236, 1 below 3.734756, 2 below 6.562848 and 3
    otherwise.
    NOTE(review): the cut points look like quartiles of the training
    column, but the code that produced them is not in view; confirm.
    """
    for upper_bound, bucket in ((2.569236, 0), (3.734756, 1), (6.562848, 2)):
        if x < upper_bound:
            return bucket
    return 3

# Store the bucketed vacation ratio in both frames.
for frame in (train, test):
    frame['휴가퍼센트'] = frame['휴가비율'].apply(transform_rest)

In [7]:
# Days around public holidays are flagged by hand (row labels chosen manually)
# and then one-hot encoded.
# Row labels that were tried and left disabled in the original notebook:
# 67, 82, 130, 244, 267, 469, 501, 511, 650, 707, 733, 748, 792, 863, 970,
# 1037, 1128, 1186.

train['공휴일전후'] = 0
test['공휴일전후'] = 0

# Write through `.loc` instead of chained `df[col][row] = value`: chained
# assignment may operate on a temporary copy (SettingWithCopyWarning) and is
# a silent no-op under pandas copy-on-write. A missing row label now raises
# KeyError instead of being silently appended.
train.loc[
    [
        17, 3, 62, 131, 152, 226, 221, 224, 245, 311, 309, 330, 379, 467, 470,
        565, 623, 651, 705, 709, 815, 864, 950, 951, 953, 954, 955, 1038, 1099,
        1129, 1187,
    ],
    '공휴일전후',
] = 1
train.loc[[310, 502, 971], '공휴일전후'] = 2

test.loc[10, '공휴일전후'] = 2
# Row 20 is set to 1 here and reverted after encoding (see below); the net
# effect is that test gets a '공휴일전후_1' column that is all zeros.
# NOTE(review): presumably a trick to align test columns with train; confirm.
test.loc[20, '공휴일전후'] = 1

## One-hot encoding ##

# dtype is pinned to uint8 (the pandas < 2.0 default) so that the integer
# overrides below do not hit boolean columns on pandas >= 2.0.
train = pd.get_dummies(train, columns=['공휴일전후'], dtype='uint8')
test = pd.get_dummies(test, columns=['공휴일전후'], dtype='uint8')

# Revert row 20 of test to the "no holiday" category.
test.loc[20, '공휴일전후_0'] = 1
test.loc[20, '공휴일전후_1'] = 0

In [8]:
# Pre-processing for menu tokenisation: split each menu string on single spaces

for frame in (train, test):
    for menu_col, token_col in (
        ('조식메뉴', '조식메뉴토큰'),
        ('중식메뉴', '중식메뉴토큰'),
        ('석식메뉴', '석식메뉴토큰'),
    ):
        frame[token_col] = frame[menu_col].str.split(' ')

In [9]:
# Menu tokenisation helper
def get_menu_comp_cnt(data):
    """Count the menu components in each token list.

    A token is counted when it is longer than one character and does not
    start with "(". Returns one count per element of `data`, in order.
    """
    return [
        sum(1 for text in token if len(text) > 1 and not text.startswith("("))
        for token in data
    ]

In [10]:
# Number of menu components per meal, for both frames.
for frame in (train, test):
    for token_col, count_col in (
        ('조식메뉴토큰', '조식메뉴수'),
        ('중식메뉴토큰', '중식메뉴수'),
        ('석식메뉴토큰', '석식메뉴수'),
    ):
        frame[count_col] = get_menu_comp_cnt(frame[token_col])

In [11]:
def get_menu_nunique(data):
    """Count the '/'-separated alternatives in each token list.

    Every token containing '/' contributes its number of slashes plus one;
    tokens without '/' contribute nothing. Returns one total per element of
    `data`, in order.
    """
    return [
        sum(text.count('/') + 1 for text in token if '/' in text)
        for token in data
    ]

In [12]:
# Number of selectable ('/'-separated) options per meal, for both frames.
for frame in (train, test):
    for token_col, option_col in (
        ('석식메뉴토큰', '석식선택메뉴'),
        ('중식메뉴토큰', '중식선택메뉴'),
        ('조식메뉴토큰', '조식선택메뉴'),
    ):
        frame[option_col] = get_menu_nunique(frame[token_col])

In [13]:
def get_food_embedding(x):
    """Split a menu string into its unique dish names.

    The string is split on single spaces. Tokens containing all of '(',
    ':' and ')' are skipped, tokens containing '/' are split into their
    alternatives, and empty strings are dropped.

    Parameters
    ----------
    x : str
        Space-separated menu string.

    Returns
    -------
    list of str
        Unique names in first-seen order. The previous `list(set(...))`
        version returned them in a hash-dependent order.
    """
    names = {}
    for token in x.split(' '):
        if '(' in token and ':' in token and ')' in token:
            continue
        # str.split('/') returns [token] when there is no '/', so one loop
        # covers both the plain and the multi-option case.
        for name in token.split('/'):
            # Filtering here replaces the old `x_.remove('')`, which raised
            # ValueError whenever the menu contained no empty token.
            if name:
                names[name] = None
    return list(names)

# Unique dish names for lunch and dinner, for both frames.
for frame in (train, test):
    for menu_col, split_col in (
        ('중식메뉴', '중식메뉴_split'),
        ('석식메뉴', '석식메뉴_split'),
    ):
        frame[split_col] = frame[menu_col].apply(get_food_embedding)

In [14]:
# Preview the first rows of the training frame with the columns added so far.
train.head()

Unnamed: 0,일자,요일,본사정원수,본사휴가자수,본사출장자수,본사시간외근무명령서승인건수,현본사소속재택근무자수,조식메뉴,중식메뉴,석식메뉴,...,중식메뉴토큰,석식메뉴토큰,조식메뉴수,중식메뉴수,석식메뉴수,석식선택메뉴,중식선택메뉴,조식선택메뉴,중식메뉴_split,석식메뉴_split
0,2016-02-01,5,2601,50,150,238,0.0,모닝롤/찐빵 우유/두유/주스 계란후라이 호두죽/쌀밥 (쌀:국내산) 된장찌개 쥐...,"쌀밥/잡곡밥 (쌀,현미흑미:국내산) 오징어찌개 쇠불고기 (쇠고기:호주산) 계란찜 ...","쌀밥/잡곡밥 (쌀,현미흑미:국내산) 육개장 자반고등어구이 두부조림 건파래무침 ...",...,"[쌀밥/잡곡밥, (쌀,현미흑미:국내산), 오징어찌개, , 쇠불고기, (쇠고기:호주산...","[쌀밥/잡곡밥, (쌀,현미흑미:국내산), 육개장, , 자반고등어구이, , 두부조림,...",7,7,6,2,2,7,"[쌀밥, 계란찜, 청포묵무침, 포기김치, 요구르트, 잡곡밥, 오징어찌개, 쇠불고기]","[두부조림, 쌀밥, 육개장, 포기김치, 잡곡밥, 자반고등어구이, 건파래무침]"
1,2016-02-02,4,2601,50,173,319,0.0,모닝롤/단호박샌드 우유/두유/주스 계란후라이 팥죽/쌀밥 (쌀:국내산) 호박젓국찌...,"쌀밥/잡곡밥 (쌀,현미흑미:국내산) 김치찌개 가자미튀김 모둠소세지구이 마늘쫑무...","콩나물밥*양념장 (쌀,현미흑미:국내산) 어묵국 유산슬 (쇠고기:호주산) 아삭고추무...",...,"[쌀밥/잡곡밥, (쌀,현미흑미:국내산), 김치찌개, , 가자미튀김, , 모둠소세지구...","[콩나물밥*양념장, (쌀,현미흑미:국내산), 어묵국, , 유산슬, (쇠고기:호주산)...",7,7,6,0,2,7,"[모둠소세지구이, 쌀밥, 마늘쫑무침, 요구르트, 잡곡밥, 배추겉절이, 가자미튀김, ...","[콩나물밥*양념장, 어묵국, 포기김치, 아삭고추무침, 바나나, 유산슬]"
2,2016-02-03,3,2601,56,180,111,0.0,모닝롤/베이글 우유/두유/주스 계란후라이 표고버섯죽/쌀밥 (쌀:국내산) 콩나물국...,"카레덮밥 (쌀,현미흑미:국내산) 팽이장국 치킨핑거 (닭고기:국내산) 쫄면야채무침 ...","쌀밥/잡곡밥 (쌀,현미흑미:국내산) 청국장찌개 황태양념구이 (황태:러시아산) 고기...",...,"[카레덮밥, (쌀,현미흑미:국내산), 팽이장국, , 치킨핑거, (닭고기:국내산), ...","[쌀밥/잡곡밥, (쌀,현미흑미:국내산), 청국장찌개, , 황태양념구이, (황태:러시...",7,7,6,2,0,7,"[포기김치, 요구르트, 견과류조림, 치킨핑거, 쫄면야채무침, 팽이장국, 카레덮밥]","[쌀밥, 황태양념구이, 새송이버섯볶음, 포기김치, 잡곡밥, 청국장찌개, 고기전]"
3,2016-02-04,2,2601,104,220,355,0.0,"모닝롤/토마토샌드 우유/두유/주스 계란후라이 닭죽/쌀밥 (쌀,닭:국내산) 근대국...","쌀밥/잡곡밥 (쌀,현미흑미:국내산) 쇠고기무국 주꾸미볶음 부추전 시금치나물 ...","미니김밥*겨자장 (쌀,현미흑미:국내산) 우동 멕시칸샐러드 군고구마 무피클 포...",...,"[쌀밥/잡곡밥, (쌀,현미흑미:국내산), 쇠고기무국, , 주꾸미볶음, , 부추전, ...","[미니김밥*겨자장, (쌀,현미흑미:국내산), 우동, , 멕시칸샐러드, , 군고구마,...",7,7,6,0,2,7,"[쇠고기무국, 주꾸미볶음, 쌀밥, 시금치나물, 포기김치, 요구르트, 잡곡밥, 부추전]","[미니김밥*겨자장, 포기김치, 우동, 멕시칸샐러드, 군고구마, 무피클]"
4,2016-02-05,1,2601,278,181,34,0.0,모닝롤/와플 우유/두유/주스 계란후라이 쇠고기죽/쌀밥 (쌀:국내산) 재첩국 방...,"쌀밥/잡곡밥 (쌀,현미흑미:국내산) 떡국 돈육씨앗강정 (돼지고기:국내산) 우엉잡채...","쌀밥/잡곡밥 (쌀,현미흑미:국내산) 차돌박이찌개 (쇠고기:호주산) 닭갈비 (닭고기:...",...,"[쌀밥/잡곡밥, (쌀,현미흑미:국내산), 떡국, , 돈육씨앗강정, (돼지고기:국내산...","[쌀밥/잡곡밥, (쌀,현미흑미:국내산), 차돌박이찌개, (쇠고기:호주산), 닭갈비,...",7,7,6,2,2,7,"[쌀밥, 포기김치, 떡국, 요구르트, 잡곡밥, 청경채무침, 돈육씨앗강정, 우엉잡채]","[차돌박이찌개, 쌀밥, 포기김치, 잡곡밥, 감자소세지볶음, 닭갈비, 콩나물무침]"


In [15]:
# Attendance: headcount minus people on leave, on business trips or working from home

for frame in (train, test):
    away = frame['본사휴가자수'] + frame['본사출장자수'] + frame['현본사소속재택근무자수']
    frame['출근'] = frame['본사정원수'] - away

# Print the quartiles of the training attendance.
for q in (0.25, 0.5, 0.75):
    print(int(q * 100), '%: ', train['출근'].quantile(q=q))

def f1(x):
    """Bucket an attendance headcount into a code from 0 to 3.

    Returns 0 below 2281.0, 1 below 2357.0, 2 below 2461.0 and 3 otherwise.
    The cut points equal the 25/50/75 % quantiles shown in this cell's output.
    """
    for upper_bound, bucket in ((2281.0, 0), (2357.0, 1), (2461.0, 2)):
        if x < upper_bound:
            return bucket
    return 3

# Store the bucketed attendance in both frames.
for frame in (train, test):
    frame['출근퍼센트'] = frame['출근'].apply(f1)

25 %:  2281.0
50 %:  2357.0
75 %:  2461.0


In [16]:
# Season, week and part-of-month features

def transform_season(x):
    """Map a month number to a season label.

    3-5 -> '봄', 6-8 -> '여름', 9-11 -> '가을', anything else -> '겨울'.
    """
    for first, last, season in ((3, 5, '봄'), (6, 8, '여름'), (9, 11, '가을')):
        if first <= x <= last:
            return season
    return '겨울'

# Label every row with its month-based season ...
for frame in (train, test):
    frame['월_계절'] = frame['월'].apply(transform_season)

# ... and append one indicator column per season present in each frame.
train = pd.concat([train, pd.get_dummies(train['월_계절'])], axis='columns')
test = pd.concat([test, pd.get_dummies(test['월_계절'])], axis='columns')

In [17]:
def transform_week(x):
    """Map a week number to a week-based season label.

    9-22 -> '주_봄', 23-35 -> '주_여름', 36-48 -> '주_가을',
    anything else -> '주_겨울'.
    """
    for first, last, season in (
        (9, 22, '주_봄'),
        (23, 35, '주_여름'),
        (36, 48, '주_가을'),
    ):
        if first <= x <= last:
            return season
    return '주_겨울'

# Label every row with its week-based season and append the indicator columns.
train['주_계절'] = train['주'].apply(transform_week)
test['주_계절'] = test['주'].apply(transform_week)

train = pd.concat([train, pd.get_dummies(train['주_계절'])], axis=1)
test = pd.concat([test, pd.get_dummies(test['주_계절'])], axis=1)

# get_dummies only creates columns for categories that occur in the frame.
# Add all-zero columns for the summer / autumn indicators (week-based and
# month-based) when test lacks them. Unlike an unconditional `test[col] = 0`,
# this never wipes real indicator values if test does contain such rows.
for season_col in ('주_가을', '주_여름', '가을', '여름'):
    if season_col not in test.columns:
        test[season_col] = 0

In [18]:
def transform_day(x):
    """Map a day of the month to a part-of-month label.

    1-10 -> '초_일', 11-20 -> '중_일', anything else -> '말_일'.
    """
    for first, last, part in ((1, 10, '초_일'), (11, 20, '중_일')):
        if first <= x <= last:
            return part
    return '말_일'

# Label every row with its part of the month ...
for frame in (train, test):
    frame['초중말일자'] = frame['일'].apply(transform_day)

# ... and append one indicator column per label present in each frame.
train = pd.concat([train, pd.get_dummies(train['초중말일자'])], axis='columns')
test = pd.concat([test, pd.get_dummies(test['초중말일자'])], axis='columns')

In [20]:
# Model 1: features, target column '중식계' and the matching test matrix.
lunch_features = [
    '월', '년', '요일', '휴가퍼센트', '코로나단계', '말_일', '중_일',
    '공휴일전후_0', '초_일', '중식메뉴수', '공휴일전후_1', '주_봄', '봄',
    '주_가을', '가을', '주_겨울', '겨울', '공휴일전후_2',
]
X1 = train[lunch_features]
y1 = train['중식계']
target1 = test[lunch_features]

# Model 2: features, target column '석식계' and the matching test matrix.
dinner_features = [
    '요일', '년', '월', '코로나단계', '휴가퍼센트', '공휴일전후_0',
    '공휴일전후_1', '공휴일전후_2', '석식메뉴수', '출근퍼센트', '겨울', '봄',
    '주_겨울', '주_가을', '가을', '말_일', '초_일',
    '본사시간외근무명령서승인건수',
]
X2 = train[dinner_features]
y2 = train['석식계']
target2 = test[dinner_features]

In [21]:
# 15-fold splitter with shuffling; the fixed random_state makes the fold
# assignment repeatable between runs.
kf = KFold(n_splits = 15, random_state = 607, shuffle = True)