# Feature Engineering

## 수행해야 하는 사항

1. USER_ID(고유값)와 MONTH를 기준으로 그룹화
2. JOIN_DATE 원핫인코딩
3. AD1 원핫인코딩
4. 교통량?

In [2]:
import os 
import pandas as pd
import numpy as np
import re

import seaborn as sns
import matplotlib.pyplot as plt
# Font used to render Korean text in plots (this family name is the Windows one).
plt.rcParams['font.family'] = 'Malgun Gothic'

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

import warnings
# Suppress all warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Fix NumPy's global random seed so runs are repeatable.
np.random.seed(42)

In [2]:
def get_font_family(system_name=None):
    """
    Return the name of a Korean-capable default font for the given platform.

    Parameters
    ----------
    system_name : str, optional
        Platform identifier in the form returned by ``platform.system()``.
        When omitted, the current platform is detected automatically.

    Returns
    -------
    str
        "AppleGothic" for "Darwin", "Malgun Gothic" for "Windows", and
        "NanumBarunGothic" for anything else (treated as Linux). In the
        Linux case the function first tries to install the ``fonts-nanum``
        package with apt-get and to register the font file with matplotlib.
    """
    import platform

    if system_name is None:
        # Colab reports "Linux" here.
        system_name = platform.system()

    if system_name == "Darwin":
        return "AppleGothic"
    if system_name == "Windows":
        return "Malgun Gothic"

    # Linux (including Colab).
    import os
    import shutil
    import subprocess

    import matplotlib.font_manager as fm

    # Best-effort font installation; plain subprocess calls replace the
    # IPython-only "!apt-get" syntax so the function is valid Python.
    # Failures are tolerated (check=False) exactly like the shell magic did.
    if shutil.which("apt-get"):
        subprocess.run(["apt-get", "update", "-qq"], check=False)
        subprocess.run(
            ["apt-get", "install", "fonts-nanum", "-qq"],
            stdout=subprocess.DEVNULL,
            check=False,
        )

    fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
    # Register the font through the public API instead of the private
    # fm._rebuild(), which is not available in newer matplotlib releases.
    # NOTE(review): the original note said a Colab runtime restart is
    # required after installing the font; confirm whether it is still
    # needed once the font is registered this way.
    if os.path.exists(fontpath):
        fm.fontManager.addfont(fontpath)

    font_family = "NanumBarunGothic"
    return font_family

In [3]:
# Font configuration for plotting.
# get_font_family() picks the font name for this platform; the result is stored in `a`.
a = get_font_family()
# Apply the font family to matplotlib
import matplotlib.pyplot as plt 
plt.rc("font", family = a)
# Minus-sign setting (turn axes.unicode_minus off)
plt.rc("axes", unicode_minus=False)
# Use the "ggplot" plot style (personal preference)
plt.style.use("ggplot")

In [4]:
# Check which files are available in the data directory
os.listdir('data')

['2020교통량통합.xlsx',
 'holiday.csv',
 'metro.csv',
 '국가공휴일.xlsx',
 '디지털 스킬셋 기술과제.docx',
 '서울시_기상데이터.csv',
 '실전db.csv',
 '실전db_holiday.csv',
 '지하철노선위경도정보3.xlsx']

In [5]:
# Load the raw transaction table into a DataFrame
df = pd.read_csv("./data/실전db.csv")

In [6]:
# Preview the first rows and print the (rows, columns) shape
display(df.head())
print(df.shape)

Unnamed: 0,USER_ID,JOIN_DATE,D_TYPE,STORE_ID,GOODS_TYPE,DATE,COUNT,AD1
0,2858,2014-01-07,AA,1892,A,2020-01-01,1,GN
1,5647,2014-02-14,BB,182009,A,2020-01-01,1,J
2,33314,2014-11-20,BB,82431,A,2020-01-01,1,SC
3,37001,2014-12-04,BB,725,C,2020-01-01,1,MP
4,37819,2014-12-07,AA,220691,C,2020-01-01,1,JRR


(879271, 8)


In [7]:
# Outlier removal comes first: discard every row belonging to USER_ID 999665.
df = df.loc[df["USER_ID"] != 999665]
print(df.shape)  # row count after filtering (878905 in the recorded run)
# Renumber the rows from 0 without keeping the old index as a column.
df = df.reset_index(drop=True)
display(df)

(878905, 8)


Unnamed: 0,USER_ID,JOIN_DATE,D_TYPE,STORE_ID,GOODS_TYPE,DATE,COUNT,AD1
0,2858,2014-01-07,AA,1892,A,2020-01-01,1,GN
1,5647,2014-02-14,BB,182009,A,2020-01-01,1,J
2,33314,2014-11-20,BB,82431,A,2020-01-01,1,SC
3,37001,2014-12-04,BB,725,C,2020-01-01,1,MP
4,37819,2014-12-07,AA,220691,C,2020-01-01,1,JRR
...,...,...,...,...,...,...,...,...
878900,1830551,2020-12-31,BB,219886,B,2020-12-31,1,GN
878901,1830570,2020-12-31,BB,82433,B,2020-12-31,1,CY
878902,1830580,2020-12-31,AA,92020,B,2020-12-31,1,JRR
878903,1830589,2020-12-31,BB,92437,B,2020-12-31,1,J


In [8]:
# Remove the STORE_ID column
df = df.drop("STORE_ID", axis=1)

In [9]:
# Create MONTH: the month of DATE formatted with '%m' (a two-digit string, not an int)
df['MONTH'] = pd.to_datetime(df.DATE).dt.strftime('%m')
display(df.head(10))

Unnamed: 0,USER_ID,JOIN_DATE,D_TYPE,GOODS_TYPE,DATE,COUNT,AD1,MONTH
0,2858,2014-01-07,AA,A,2020-01-01,1,GN,1
1,5647,2014-02-14,BB,A,2020-01-01,1,J,1
2,33314,2014-11-20,BB,A,2020-01-01,1,SC,1
3,37001,2014-12-04,BB,C,2020-01-01,1,MP,1
4,37819,2014-12-07,AA,C,2020-01-01,1,JRR,1
5,45633,2015-01-04,BB,A,2020-01-01,1,YD,1
6,61985,2015-02-26,AA,A,2020-01-01,1,J,1
7,66488,2015-03-13,BB,A,2020-01-01,1,GJ,1
8,71927,2015-03-31,AA,A,2020-01-01,1,JRR,1
9,73825,2015-04-07,BB,C,2020-01-01,1,GN,1


In [10]:
# Presumed AD1 codes of districts inside Seoul ("number. district name, code" per line).
seoul = """1.종로구, JR

2. 중구, J

3.용산구, YO

4 성동구,SOD

 5광진구,GJ

 6동대문구, DM

7.중랑구, JRR

8.성북구, SB

9.강북구, GB

10.도봉구, DB

11. 노원구, NW

12.은평구, EP

13.서대문구, SD

14.마포구, MP

15.양천구, YC

16.강서구, GS

17.구로구, GR

18. 금천구, GHN

 19.영등포구,YD

 20.동작구, DJ

21. 관악구, GW

 22.서초구, SC

 23.강남구,GN

24. 송파구, SP

25.강동구 GD"""

# Keep only the codes: every run of uppercase ASCII letters, in order of appearance.
seoul = re.findall(r"[A-Z]+", seoul)
print(seoul)

['JR', 'J', 'YO', 'SOD', 'GJ', 'DM', 'JRR', 'SB', 'GB', 'DB', 'NW', 'EP', 'SD', 'MP', 'YC', 'GS', 'GR', 'GHN', 'YD', 'DJ', 'GW', 'SC', 'GN', 'SP', 'GD']


In [12]:
# Parse DATE from 'YYYY-MM-DD' text into datetime values
df['DATE']=pd.to_datetime(df['DATE'],format='%Y-%m-%d')

In [15]:
# Dates (as 'YYYY-MM-DD' strings) that are labelled "공휴일" when DAY_TYPE is derived.
# NOTE(review): presumably the 2020 public holidays; the same list is defined again in the DAY_TYPE cell.
holiday_list = ['2020-01-01', '2020-01-24', '2020-01-25', '2020-01-26', '2020-01-27', '2020-03-01', '2020-05-05', '2020-04-15', '2020-04-30', '2020-06-06', '2020-08-15', '2020-09-30', '2020-10-01', '2020-10-02', '2020-10-03', '2020-10-09', '2020-12-25']

In [16]:
# Check column dtypes and non-null counts (DATE should now be datetime64)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878905 entries, 0 to 878904
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   USER_ID     878905 non-null  int64         
 1   JOIN_DATE   878905 non-null  object        
 2   D_TYPE      878905 non-null  object        
 3   GOODS_TYPE  878905 non-null  object        
 4   DATE        878905 non-null  datetime64[ns]
 5   COUNT       878905 non-null  int64         
 6   AD1         878905 non-null  object        
 7   MONTH       878905 non-null  object        
dtypes: datetime64[ns](1), int64(2), object(5)
memory usage: 53.6+ MB


In [20]:
# Sanity check: format a single DATE value back to a 'YYYY-MM-DD' string
df['DATE'].loc[1].strftime('%Y-%m-%d')

'2020-01-01'

In [22]:
# Derive AD1_TYPE and DAY_TYPE for every row of df.
#
# Both labels are computed column-wise. A row-by-row loop with `.loc[k]`
# is very slow on this table (~880k rows) and is only correct while the
# index is exactly 0..len(df)-1.

# Dates labelled "공휴일"; this label takes precedence over the weekend check.
holiday_list = [
    '2020-01-01', '2020-01-24', '2020-01-25', '2020-01-26', '2020-01-27',
    '2020-03-01', '2020-04-15', '2020-04-30', '2020-05-05', '2020-06-06',
    '2020-08-15', '2020-09-30', '2020-10-01', '2020-10-02', '2020-10-03',
    '2020-10-09', '2020-12-25',
]

# AD1_TYPE: "SEOUL" when the AD1 code is one of the codes in `seoul`, else "NOT_SEOUL".
is_seoul = df["AD1"].isin(seoul).to_numpy()
ad1_type_list = np.where(is_seoul, "SEOUL", "NOT_SEOUL").tolist()

# DAY_TYPE: holiday first, then weekend (weekday() > 4 means Saturday/Sunday), else weekday.
dates = pd.to_datetime(df["DATE"])
# normalize() drops any time-of-day so the comparison is by calendar date,
# matching the original '%Y-%m-%d' string comparison.
is_holiday = dates.dt.normalize().isin(pd.to_datetime(holiday_list)).to_numpy()
is_weekend = (dates.dt.weekday > 4).to_numpy()
day_type_list = np.select(
    [is_holiday, is_weekend],
    ["공휴일", "주말"],
    default="주중",
).tolist()

In [24]:
# Attach the two label lists to df as new columns (one label per row, in row order)
df['AD1_TYPE'] = ad1_type_list
df['DAY_TYPE'] = day_type_list

In [25]:
# Inspect df with the new AD1_TYPE / DAY_TYPE columns
display(df)

Unnamed: 0,USER_ID,JOIN_DATE,D_TYPE,GOODS_TYPE,DATE,COUNT,AD1,MONTH,AD1_TYPE,DAY_TYPE
0,2858,2014-01-07,AA,A,2020-01-01,1,GN,01,SEOUL,공휴일
1,5647,2014-02-14,BB,A,2020-01-01,1,J,01,SEOUL,공휴일
2,33314,2014-11-20,BB,A,2020-01-01,1,SC,01,SEOUL,공휴일
3,37001,2014-12-04,BB,C,2020-01-01,1,MP,01,SEOUL,공휴일
4,37819,2014-12-07,AA,C,2020-01-01,1,JRR,01,SEOUL,공휴일
...,...,...,...,...,...,...,...,...,...,...
878900,1830551,2020-12-31,BB,B,2020-12-31,1,GN,12,SEOUL,주중
878901,1830570,2020-12-31,BB,B,2020-12-31,1,CY,12,NOT_SEOUL,주중
878902,1830580,2020-12-31,AA,B,2020-12-31,1,JRR,12,SEOUL,주중
878903,1830589,2020-12-31,BB,B,2020-12-31,1,J,12,SEOUL,주중


In [26]:
# JOIN_DATE: parse to datetime, then reduce it to an annual Period (the join year)
df.JOIN_DATE = pd.to_datetime(df.JOIN_DATE)
df.JOIN_DATE = df.JOIN_DATE.dt.to_period(freq="A")
display(df)

# Save to final.csv
# NOTE(review): no save call appears in this cell; presumably ./data/final.csv (read back later) was written elsewhere — confirm.

Unnamed: 0,USER_ID,JOIN_DATE,D_TYPE,GOODS_TYPE,DATE,COUNT,AD1,MONTH,AD1_TYPE,DAY_TYPE
0,2858,2014,AA,A,2020-01-01,1,GN,01,SEOUL,공휴일
1,5647,2014,BB,A,2020-01-01,1,J,01,SEOUL,공휴일
2,33314,2014,BB,A,2020-01-01,1,SC,01,SEOUL,공휴일
3,37001,2014,BB,C,2020-01-01,1,MP,01,SEOUL,공휴일
4,37819,2014,AA,C,2020-01-01,1,JRR,01,SEOUL,공휴일
...,...,...,...,...,...,...,...,...,...,...
878900,1830551,2020,BB,B,2020-12-31,1,GN,12,SEOUL,주중
878901,1830570,2020,BB,B,2020-12-31,1,CY,12,NOT_SEOUL,주중
878902,1830580,2020,AA,B,2020-12-31,1,JRR,12,SEOUL,주중
878903,1830589,2020,BB,B,2020-12-31,1,J,12,SEOUL,주중


In [None]:
# Show join_type_df.
# NOTE(review): this cell has no execution count, and join_type_df is first assigned in a cell further down the file.
join_type_df

In [28]:
# Final table shape (Park Seong-jun's version): drop DATE, AD1, D_TYPE and GOODS_TYPE

final_df = df.drop(["DATE", "AD1","D_TYPE", 'GOODS_TYPE'], axis=1)
display(final_df)

Unnamed: 0,USER_ID,JOIN_DATE,COUNT,MONTH,AD1_TYPE,DAY_TYPE
0,2858,2014,1,01,SEOUL,공휴일
1,5647,2014,1,01,SEOUL,공휴일
2,33314,2014,1,01,SEOUL,공휴일
3,37001,2014,1,01,SEOUL,공휴일
4,37819,2014,1,01,SEOUL,공휴일
...,...,...,...,...,...,...
878900,1830551,2020,1,12,SEOUL,주중
878901,1830570,2020,1,12,NOT_SEOUL,주중
878902,1830580,2020,1,12,SEOUL,주중
878903,1830589,2020,1,12,SEOUL,주중


In [37]:
# Total COUNT per (USER_ID, MONTH, DAY_TYPE).
# COUNT is selected explicitly: summing the whole frame relies on pandas
# silently dropping non-numeric columns (Period JOIN_DATE, string labels),
# which newer pandas versions no longer do.
day_type_df = final_df.groupby(["USER_ID","MONTH","DAY_TYPE"])[["COUNT"]].sum()
display(day_type_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,COUNT
USER_ID,MONTH,DAY_TYPE,Unnamed: 3_level_1
224,01,주말,1
224,12,주중,1
232,01,공휴일,2
232,01,주말,1
232,01,주중,1
...,...,...,...
1830551,12,주중,1
1830570,12,주중,1
1830580,12,주중,1
1830589,12,주중,1


In [38]:
# Total COUNT per (USER_ID, MONTH, JOIN_DATE).
# COUNT is selected explicitly so the string label columns are never
# summed, whatever the pandas version does with non-numeric columns.
join_type_df = final_df.groupby(["USER_ID","MONTH","JOIN_DATE"])[["COUNT"]].sum()
display(join_type_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,COUNT
USER_ID,MONTH,JOIN_DATE,Unnamed: 3_level_1
224,01,2013,1
224,12,2013,1
232,01,2013,4
232,02,2013,2
232,03,2013,2
...,...,...,...
1830551,12,2020,1
1830570,12,2020,1
1830580,12,2020,1
1830589,12,2020,1


In [35]:
# Total COUNT per (USER_ID, MONTH, AD1_TYPE).
# COUNT is selected explicitly so the Period/string columns are never
# summed, whatever the pandas version does with non-numeric columns.
final_df.groupby(["USER_ID","MONTH","AD1_TYPE"])[["COUNT"]].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,COUNT
USER_ID,MONTH,AD1_TYPE,Unnamed: 3_level_1
224,01,SEOUL,1
224,12,SEOUL,1
232,01,SEOUL,4
232,02,SEOUL,2
232,03,SEOUL,2
...,...,...,...
1830551,12,SEOUL,1
1830570,12,NOT_SEOUL,1
1830580,12,SEOUL,1
1830589,12,SEOUL,1


In [None]:
# Total COUNT per (USER_ID, MONTH, JOIN_DATE); COUNT is selected explicitly
# so the string label columns are never summed.
join_type_df = final_df.groupby(["USER_ID","MONTH","JOIN_DATE"])[["COUNT"]].sum()

## 우선 시도만 해보자

In [3]:
# Reload the saved table and discard the "Unnamed: 0" column in one chained expression.
final_df = pd.read_csv("./data/final.csv").drop(columns="Unnamed: 0")

In [4]:
# Preview the reloaded table
display(final_df.head())

Unnamed: 0,USER_ID,JOIN_DATE,D_TYPE,GOODS_TYPE,DATE,COUNT,AD1,MONTH,AD1_TYPE,DAY_TYPE
0,2858,2014,AA,A,2020-01-01,1,GN,1,SEOUL,공휴일
1,5647,2014,BB,A,2020-01-01,1,J,1,SEOUL,공휴일
2,33314,2014,BB,A,2020-01-01,1,SC,1,SEOUL,공휴일
3,37001,2014,BB,C,2020-01-01,1,MP,1,SEOUL,공휴일
4,37819,2014,AA,C,2020-01-01,1,JRR,1,SEOUL,공휴일


In [5]:
# Attach Aram's data: load the daily weather flags (columns 일자, 강수여부, 이상기온)
weather_df = pd.read_csv("./data/기상청.csv")
weather_df.drop("Unnamed: 0",axis=1,inplace=True)
# Add a row for 2020-12-31 with the same flag values as 12-30 (author's note: the temperature was the same).
# NOTE(review): writing to label 365 assumes the file holds rows 0..364 only — confirm.
weather_df.loc[365] = ['2020-12-31', 0, 1]
display(weather_df)
weather_df.info()

Unnamed: 0,일자,강수여부,이상기온
0,2020-01-01,1,0
1,2020-01-02,0,0
2,2020-01-03,0,0
3,2020-01-04,0,0
4,2020-01-05,0,0
...,...,...,...
361,2020-12-27,0,0
362,2020-12-28,0,0
363,2020-12-29,0,0
364,2020-12-30,0,1


<class 'pandas.core.frame.DataFrame'>
Int64Index: 366 entries, 0 to 365
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   일자      366 non-null    object
 1   강수여부    366 non-null    int64 
 2   이상기온    366 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 11.4+ KB


In [6]:
# Attach the weather flags to every transaction row, matching DATE against 일자.
# A left join keeps exactly the rows (and row order) of final_df. An outer
# join would also append rows with NaN in every transaction column for any
# weather date that has no transaction.
final_df = pd.merge(final_df, weather_df, how="left", left_on="DATE", right_on="일자")

In [7]:
# Check that the merge attached the weather columns as expected
display(final_df)

Unnamed: 0,USER_ID,JOIN_DATE,D_TYPE,GOODS_TYPE,DATE,COUNT,AD1,MONTH,AD1_TYPE,DAY_TYPE,일자,강수여부,이상기온
0,2858,2014,AA,A,2020-01-01,1,GN,1,SEOUL,공휴일,2020-01-01,1,0
1,5647,2014,BB,A,2020-01-01,1,J,1,SEOUL,공휴일,2020-01-01,1,0
2,33314,2014,BB,A,2020-01-01,1,SC,1,SEOUL,공휴일,2020-01-01,1,0
3,37001,2014,BB,C,2020-01-01,1,MP,1,SEOUL,공휴일,2020-01-01,1,0
4,37819,2014,AA,C,2020-01-01,1,JRR,1,SEOUL,공휴일,2020-01-01,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
878900,1830551,2020,BB,B,2020-12-31,1,GN,12,SEOUL,주중,2020-12-31,0,1
878901,1830570,2020,BB,B,2020-12-31,1,CY,12,NOT_SEOUL,주중,2020-12-31,0,1
878902,1830580,2020,AA,B,2020-12-31,1,JRR,12,SEOUL,주중,2020-12-31,0,1
878903,1830589,2020,BB,B,2020-12-31,1,J,12,SEOUL,주중,2020-12-31,0,1


In [8]:
# Drop the columns that are no longer needed: AD1 and the duplicate date key 일자
# (DATE is kept for now)
final_df = final_df.drop(["AD1","일자"], axis=1)
display(final_df)

Unnamed: 0,USER_ID,JOIN_DATE,D_TYPE,GOODS_TYPE,DATE,COUNT,MONTH,AD1_TYPE,DAY_TYPE,강수여부,이상기온
0,2858,2014,AA,A,2020-01-01,1,1,SEOUL,공휴일,1,0
1,5647,2014,BB,A,2020-01-01,1,1,SEOUL,공휴일,1,0
2,33314,2014,BB,A,2020-01-01,1,1,SEOUL,공휴일,1,0
3,37001,2014,BB,C,2020-01-01,1,1,SEOUL,공휴일,1,0
4,37819,2014,AA,C,2020-01-01,1,1,SEOUL,공휴일,1,0
...,...,...,...,...,...,...,...,...,...,...,...
878900,1830551,2020,BB,B,2020-12-31,1,12,SEOUL,주중,0,1
878901,1830570,2020,BB,B,2020-12-31,1,12,NOT_SEOUL,주중,0,1
878902,1830580,2020,AA,B,2020-12-31,1,12,SEOUL,주중,0,1
878903,1830589,2020,BB,B,2020-12-31,1,12,SEOUL,주중,0,1


In [9]:
# One-hot encode each categorical column into its own indicator frame.
# NOTE: dtype_dummies and goods_dummies are built here but are not included
# in the concat that assembles check_df later in the file.

# JOIN_DATE (one column per join year)
join_dummies = pd.get_dummies(final_df['JOIN_DATE'])

# D_TYPE
dtype_dummies = pd.get_dummies(final_df['D_TYPE'])

# GOODS_TYPE
goods_dummies = pd.get_dummies(final_df['GOODS_TYPE'])

# AD1_TYPE
ad1_dummies = pd.get_dummies(final_df['AD1_TYPE'])

# DAY_TYPE dummy variables
day_dummies = pd.get_dummies(final_df['DAY_TYPE'])

In [10]:
# Base frame without the columns that are replaced by dummies; D_TYPE and GOODS_TYPE stay as raw columns
check_df = final_df.drop(['JOIN_DATE', 'AD1_TYPE', 'DAY_TYPE'], axis=1)

In [11]:
# Inspect the base frame before the dummy columns are appended
display(check_df)

Unnamed: 0,USER_ID,D_TYPE,GOODS_TYPE,DATE,COUNT,MONTH,강수여부,이상기온
0,2858,AA,A,2020-01-01,1,1,1,0
1,5647,BB,A,2020-01-01,1,1,1,0
2,33314,BB,A,2020-01-01,1,1,1,0
3,37001,BB,C,2020-01-01,1,1,1,0
4,37819,AA,C,2020-01-01,1,1,1,0
...,...,...,...,...,...,...,...,...
878900,1830551,BB,B,2020-12-31,1,12,0,1
878901,1830570,BB,B,2020-12-31,1,12,0,1
878902,1830580,AA,B,2020-12-31,1,12,0,1
878903,1830589,BB,B,2020-12-31,1,12,0,1


In [12]:
# Append the JOIN_DATE, AD1_TYPE and DAY_TYPE dummy columns side by side
# (dtype_dummies and goods_dummies are deliberately left out here)
check_df = pd.concat([check_df, join_dummies, ad1_dummies, day_dummies], axis=1)
display(check_df)

Unnamed: 0,USER_ID,D_TYPE,GOODS_TYPE,DATE,COUNT,MONTH,강수여부,이상기온,1970,2013,...,2016,2017,2018,2019,2020,NOT_SEOUL,SEOUL,공휴일,주말,주중
0,2858,AA,A,2020-01-01,1,1,1,0,0,0,...,0,0,0,0,0,0,1,1,0,0
1,5647,BB,A,2020-01-01,1,1,1,0,0,0,...,0,0,0,0,0,0,1,1,0,0
2,33314,BB,A,2020-01-01,1,1,1,0,0,0,...,0,0,0,0,0,0,1,1,0,0
3,37001,BB,C,2020-01-01,1,1,1,0,0,0,...,0,0,0,0,0,0,1,1,0,0
4,37819,AA,C,2020-01-01,1,1,1,0,0,0,...,0,0,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
878900,1830551,BB,B,2020-12-31,1,12,0,1,0,0,...,0,0,0,0,1,0,1,0,0,1
878901,1830570,BB,B,2020-12-31,1,12,0,1,0,0,...,0,0,0,0,1,1,0,0,0,1
878902,1830580,AA,B,2020-12-31,1,12,0,1,0,0,...,0,0,0,0,1,0,1,0,0,1
878903,1830589,BB,B,2020-12-31,1,12,0,1,0,0,...,0,0,0,0,1,0,1,0,0,1


In [13]:
# Write the engineered table to disk (the index is written too, since index=False is not passed)
check_df.to_csv("./data/check.csv")