In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os 
import datetime
from sklearn.preprocessing import LabelEncoder
import gc

import sklearn
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
from tqdm.notebook import tqdm

#한글깨짐방지
plt.rc('font',family='Malgun Gothic')
plt.rcParams['axes.unicode_minus'] = False

  import pandas.util.testing as tm
  config.update(yaml.load(text) or {})


In [2]:
print('Pandas : %s'%(pd.__version__))
print('Numpy : %s'%(np.__version__))
print('Scikit-Learn : %s'%(sklearn.__version__))
!python --version

Pandas : 1.0.5
Numpy : 1.18.5
Scikit-Learn : 0.23.1


Python 3.6.5 :: Anaconda, Inc.


### data

In [3]:
#데이터로드
path = './data/'
data = pd.read_csv(path +'201901-202003.csv')
submission = pd.read_csv(path + 'submission.csv')

In [4]:
data.shape, submission.shape

((24697792, 12), (1394, 5))

### 전처리

#### 결측처리

In [5]:
# 시군구 결측은 모두 세종
data.loc[data.CARD_CCG_NM.isna(),'CARD_CCG_NM'] = '세종'
data.loc[data.HOM_CCG_NM.isna(),'HOM_CCG_NM'] = '세종'

### 추가변수

#### 18 사업체총조사

In [6]:
all_jobs_df = pd.read_csv('./data/외부데이터/18 사업체총조사_키워드매칭.csv',encoding = 'cp949')
all_jobs_df = all_jobs_df[['행정구역별','데이터업종명','사업체수 (개)', '종사자수 (명)']]
all_jobs_df.columns = ['CARD_SIDO_NM','STD_CLSS_NM', 'company','employee']
all_jobs_df.employee = all_jobs_df.employee.apply(lambda x : x.replace("X","0")).astype(int)
all_jobs_df = all_jobs_df.groupby(['CARD_SIDO_NM','STD_CLSS_NM'])[['company','employee']].sum().reset_index()

all_jobs_df = all_jobs_df[~(all_jobs_df.CARD_SIDO_NM == '전국')]

#### 지역내 각 업종이 차지하는 매출비중

In [7]:
def local_percent(data):
    pivoted = data.pivot_table(index = ['CARD_SIDO_NM','CARD_CCG_NM','STD_CLSS_NM'],columns ='REG_YYMM',values= "AMT",aggfunc='sum').reset_index()
    pivoted.fillna(0,inplace= True)

    #각 지역별(시군구)업종의 매출비율을 저장
    sidos = pivoted.CARD_SIDO_NM.unique()

    local_percent_df = pd.DataFrame()
    for sido in sidos:
        sample_sido = pivoted[pivoted.CARD_SIDO_NM == sido]
        sigungus = sample_sido.CARD_CCG_NM.unique()
        for sigungu in sigungus:
            sample = sample_sido[sample_sido.CARD_CCG_NM == sigungu]
            sum_arr = sample.iloc[:,3:].sum().values.reshape(1,-1)
            parcent_df = (sample.iloc[:,3:]/sum_arr)*100

            fin = pd.concat([sample.iloc[:,:3],parcent_df],axis=1)
            local_percent_df = pd.concat([local_percent_df,fin])

    #신규변수 LP(local percent) : 지역내 각 업종이 차지하는 매출비중
    local_percent_df = local_percent_df.melt(id_vars=['CARD_SIDO_NM','CARD_CCG_NM','STD_CLSS_NM'],value_name='LP')
    return local_percent_df

#### 전체업종에서 각 지역업종이 차지하는 매출비중

In [8]:
def job_percent(data):
    pivoted = data.pivot_table(index = ['CARD_SIDO_NM','CARD_CCG_NM','STD_CLSS_NM'],columns ='REG_YYMM',values= "AMT",aggfunc='sum').reset_index()
    pivoted.fillna(0,inplace= True)

    all_pivoted = data.pivot_table(index = ['STD_CLSS_NM'],columns ='REG_YYMM',values= "AMT",aggfunc= 'sum').reset_index()
    all_pivoted.fillna(0,inplace= True)
    all_pivoted = all_pivoted.set_index('STD_CLSS_NM')

    job_percent_df = pd.DataFrame()
    for job in all_pivoted.index:
        taget_df = pivoted[pivoted.STD_CLSS_NM == job]
        tail = (taget_df.iloc[:,3:]/all_pivoted.iloc[0].values)*100
        head = taget_df.iloc[:,:3]
        full = pd.concat([head,tail],axis =1)
        job_percent_df = pd.concat([job_percent_df,full])

    #신규변수 JP(Job percent): 전체업종에서 해당지역의 업종이 차이하는 비율
    job_percent_df = job_percent_df.melt(id_vars=['CARD_SIDO_NM','CARD_CCG_NM','STD_CLSS_NM'],value_name='JP')
    return job_percent_df

In [9]:
local_percent_df = local_percent(data)
job_percent_df = job_percent(data)

#### 추가변수 결합

In [10]:
data.shape

(24697792, 12)

In [11]:
grouped = pd.merge(data,all_jobs_df,on =['CARD_SIDO_NM','STD_CLSS_NM'],how = 'left')
grouped = pd.merge(grouped,local_percent_df,on =['CARD_SIDO_NM','CARD_CCG_NM','STD_CLSS_NM','REG_YYMM'],how = 'left')
grouped = pd.merge(grouped,job_percent_df,on =['CARD_SIDO_NM','CARD_CCG_NM','STD_CLSS_NM','REG_YYMM'],how = 'left')

In [12]:
grouped.shape

(24697792, 16)

In [13]:
#예측시 nan이 많이 발생할 변수를 제외 
del grouped['HOM_CCG_NM']

In [14]:
#서울중구, 대구중구 동일 지명 방지하기 위해 묶어서 인코딩 추후 split해서 병합하면됨
grouped['address'] = grouped['CARD_SIDO_NM'] +"_"+ grouped['CARD_CCG_NM']
grouped.drop(['CARD_SIDO_NM','CARD_CCG_NM'],axis=1, inplace=True)

In [15]:
grouped.head(1)

Unnamed: 0,REG_YYMM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,CSTMR_CNT,AMT,CNT,company,employee,LP,JP,address
0,201901,건강보조식품 소매업,강원,20s,1,1,4,311200,4,263.0,399.0,0.138432,0.279221,강원_강릉시


### 인코딩

#### 날짜변환

In [17]:
grouped['year'] = grouped['REG_YYMM'].apply(lambda x : int(str(x)[:4]))
grouped['month'] = grouped['REG_YYMM'].apply(lambda x : int(str(x)[4:]))
grouped.drop(['REG_YYMM'],axis= 1,inplace = True)

In [18]:
#인코딩
def encoding(data):
    dtypes = grouped.dtypes
    encoders = {}
    for column in grouped.columns:
        if str(dtypes[column]) == 'object':
            encoder = LabelEncoder()
            encoder.fit(grouped[column])
            encoders[column] = encoder
    for column in encoders.keys():
        encoder = encoders[column]
        grouped[column] = encoder.transform(grouped[column])
    return encoders, data

In [19]:
encoders, grouped = encoding(grouped)

In [20]:
encoders

{'STD_CLSS_NM': LabelEncoder(),
 'HOM_SIDO_NM': LabelEncoder(),
 'AGE': LabelEncoder(),
 'address': LabelEncoder()}

#### 용량변환

In [16]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: 
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [21]:
#메모리관리
grouped = reduce_mem_usage(grouped)

del data
gc.collect()

Mem. usage decreased to 1036.36 Mb (60.7% reduction)


52

### Model Tuning & Evaluation

In [22]:
params = {'random_state': 42,
          'learning_rate': 0.01,
          'max_depth': 8,
          'num_leaves': 64,
          'boosting': 'gbdt',
          'objective': 'regression',
          'metric': 'rmse',
          'feature_fraction': 0.9,
          'bagging_fraction': 0.7}

use_col = ['STD_CLSS_NM', 'HOM_SIDO_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'address', 'year', 'month', 
           'LP', 'JP','company','employee']
categorical_features = ['address','STD_CLSS_NM','HOM_SIDO_NM','SEX_CTGO_CD','AGE','FLC']

### predict

In [23]:
#train to private (4월예측모델)
#20년 1월까지 데이터만 사용
p_train = pd.concat([grouped[grouped.year == 2019],grouped[(grouped.year == 2020)&(grouped.month == 1)]])

In [24]:
# Feature, taget 설정
# train_features = p_train.drop(['CSTMR_CNT','AMT','CNT'],axis= 1)
# train_target = np.log1p(p_train['AMT'])

In [62]:
train_ds = lgb.Dataset(p_train[use_col],label=np.log1p(p_train['AMT']))
model = lgb.train(params,
                  train_ds,
                  valid_sets = [train_ds],
                  valid_names=['train'], 
                  num_boost_round=10000,
                  verbose_eval=500,
                  early_stopping_rounds=100)

Training until validation scores don't improve for 100 rounds
[500]	train's rmse: 1.42089
[1000]	train's rmse: 1.36153
[1500]	train's rmse: 1.32683
[2000]	train's rmse: 1.30792
[2500]	train's rmse: 1.29406
[3000]	train's rmse: 1.28429
[3500]	train's rmse: 1.27661
[4000]	train's rmse: 1.27029
[4500]	train's rmse: 1.26471
[5000]	train's rmse: 1.26019
[5500]	train's rmse: 1.2562
[6000]	train's rmse: 1.25279
[6500]	train's rmse: 1.24936
[7000]	train's rmse: 1.2464
[7500]	train's rmse: 1.24393
[8000]	train's rmse: 1.24153
[8500]	train's rmse: 1.23937
[9000]	train's rmse: 1.23737
[9500]	train's rmse: 1.23543
[10000]	train's rmse: 1.2337
Did not meet early stopping. Best iteration is:
[10000]	train's rmse: 1.2337


In [63]:
del p_train

### 예측탬플릿으로 predict

In [72]:
#new(예측탬플릿), 4월 7월 나눠서 진행
def make_temp(grouped):
    addresses = grouped['address'].unique()
    STD_CLSS_NMs = grouped['STD_CLSS_NM'].unique()
    HOM_SIDO_NMs = grouped['HOM_SIDO_NM'].unique()
    AGEs = grouped['AGE'].unique()
    SEX_CTGO_CDs = grouped['SEX_CTGO_CD'].unique()
    FLCs = grouped['FLC'].unique()
    years = [2020]
    months = [4,7]

    temp = []
    for address in tqdm(addresses):
        for STD_CLSS_NM in STD_CLSS_NMs:
            for HOM_SIDO_NM in HOM_SIDO_NMs:
                for AGE in AGEs:
                    for SEX_CTGO_CD in SEX_CTGO_CDs:
                        for FLC in FLCs:
                            for year in years:
                                for month in months:
                                    temp.append([address, STD_CLSS_NM, HOM_SIDO_NM, AGE, SEX_CTGO_CD, FLC, year, month])
    temp = np.array(temp)
    temp = pd.DataFrame(data=temp, columns = ['address','STD_CLSS_NM','HOM_SIDO_NM','AGE','SEX_CTGO_CD','FLC','year','month'])
    
    #추가변수 삽입
    #변수삽입을 위한 디코더
    temp['address'] = encoders['address'].inverse_transform(temp['address'])
    temp['STD_CLSS_NM'] = encoders['STD_CLSS_NM'].inverse_transform(temp['STD_CLSS_NM'])

    temp['CARD_SIDO_NM'] = temp['address'].apply(lambda x: x.split('_')[0])
    temp['CARD_CCG_NM'] = temp['address'].apply(lambda x: x.split('_')[1])
    temp['REG_YYMM'] = temp['year']*100 +temp['month']

    temp = pd.merge(temp,all_jobs_df,on =['CARD_SIDO_NM','STD_CLSS_NM'],how = 'left')
    temp = pd.merge(temp,local_percent_df,on =['CARD_SIDO_NM','CARD_CCG_NM','STD_CLSS_NM','REG_YYMM'],how = 'left')
    temp = pd.merge(temp,job_percent_df,on =['CARD_SIDO_NM','CARD_CCG_NM','STD_CLSS_NM','REG_YYMM'],how = 'left')

    temp['address'] = encoders['address'].transform(temp['address'])
    temp['STD_CLSS_NM'] = encoders['STD_CLSS_NM'].transform(temp['STD_CLSS_NM'])
    temp.drop(['CARD_SIDO_NM','CARD_CCG_NM','REG_YYMM'],axis = 1,inplace = True)
    
    return temp

In [73]:
def temp_predict(temp):
    # 예측
    pred = model.predict(temp)
    pred = np.expm1(pred)
    
    #예측변수삽입
    temp['address'] = encoders['address'].inverse_transform(temp['address'])
    temp['AMT'] = np.round(pred,0)
    temp['REG_YYMM'] = temp['year']*100 +temp['month']
    temp['CARD_SIDO_NM'] = temp['address'].apply(lambda x: x.split('_')[0])
    temp['CARD_CCG_NM'] = temp['address'].apply(lambda x: x.split('_')[1])

    temp = temp[['REG_YYMM','CARD_SIDO_NM','STD_CLSS_NM','AMT']]
    temp = temp.groupby(['REG_YYMM','CARD_SIDO_NM','STD_CLSS_NM']).sum().reset_index(drop=False)
    
    return temp

In [74]:
temp = make_temp(grouped)
temp= temp_predict(temp)

HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))




#### 후처리

In [69]:
temp['STD_CLSS_NM'] = encoders['STD_CLSS_NM'].inverse_transform(temp['STD_CLSS_NM'])

In [70]:
#18년 사업체총조사 기준, 그 지역에 없는 업종은 0처리 
temp = pd.merge(temp,all_jobs_df,left_on=['CARD_SIDO_NM','STD_CLSS_NM'],right_on=['CARD_SIDO_NM','STD_CLSS_NM'],how='left')
temp.loc[temp.company.isna(),'AMT'] = 0

temp.drop(['company','employee'],axis=1,inplace=True)

### 저장

In [71]:
#제출파일
submission = pd.read_csv('data/submission.csv',index_col=0)
submission = submission.drop(['AMT'],axis= 1)
submission = submission.merge(temp, left_on=['REG_YYMM','CARD_SIDO_NM','STD_CLSS_NM'],right_on =['REG_YYMM','CARD_SIDO_NM','STD_CLSS_NM'],how = 'left')
submission.index.name = 'id'
today = datetime.datetime.now().date()
submission.to_csv(f'./submission/{today}_submission_lgbm_test_left.csv', encoding='utf-8-sig')

submission.head()

Unnamed: 0_level_0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,202004,강원,건강보조식품 소매업,7442320000.0
1,202004,강원,골프장 운영업,8258926000.0
2,202004,강원,과실 및 채소 소매업,2950913000.0
3,202004,강원,관광 민예품 및 선물용품 소매업,2804461000.0
4,202004,강원,그외 기타 분류안된 오락관련 서비스업,2385348000.0
