In [1]:
import datetime as dt
from datetime import date
import calendar
from calendar import weekday, day_name
import pandas as pd
import numpy as np

## 트레이닝 데이터를 불러옴, datae를 날짜로 불러오기 위하여 parse_dates 활용

In [2]:
df = pd.read_csv('weekday_training.csv', parse_dates=['date'])

### year, month, day 컬럼을 별도 생성. dt.year / month / day 활용

In [12]:
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month.astype('str')
df['day'] = df['date'].dt.day

# apply lambda를 통해 dt.datetime.strftime 적용 -> weekday를 문자열로 적용
df['weekday'] = df[['date']].apply(lambda x: dt.datetime.strftime(x['date'], '%a'), axis=1)

In [13]:
df

Unnamed: 0,station_nbr,date,year,month,day,weekday,Fri,Mon,Sat,Sun,...,Wed,1,2,3,4,5,6,7,8,9
0,1,2012-01-01,2012,01,1,Sun,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,2012-01-01,2012,01,1,Sun,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,2012-01-01,2012,01,1,Sun,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,2012-01-01,2012,01,1,Sun,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6,2012-01-01,2012,01,1,Sun,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,7,2012-01-01,2012,01,1,Sun,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,8,2012-01-01,2012,01,1,Sun,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,9,2012-01-01,2012,01,1,Sun,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,10,2012-01-01,2012,01,1,Sun,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,11,2012-01-01,2012,01,1,Sun,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
df.head()

Unnamed: 0,station_nbr,date,year,month,day,weekday
0,1,2012-01-01,2012,1,1,Sun
1,2,2012-01-01,2012,1,1,Sun
2,3,2012-01-01,2012,1,1,Sun
3,4,2012-01-01,2012,1,1,Sun
4,6,2012-01-01,2012,1,1,Sun


## 요일별로 dummies 변수 및 컬럼 생성

In [5]:
ls_weekday = list(df.weekday.unique())
token_weekday = 'SunMonTueWedThuFriSat'

def validation_weekday(token_weekday):
    result = []
    for i in ls_weekday:
        if i in token_weekday:
            token_weekday = token_weekday.replace(i, '')
            result.append(i)
    return result
        

cs_iter_weekday = (set(validation_weekday(token_weekday)) for x in df.weekday)
cs_weekday = sorted(set.union(*cs_iter_weekday))

dummies_weekday = pd.DataFrame(np.zeros((len(df), len(cs_weekday))), columns=cs_weekday)
dummies_weekday.head()

Unnamed: 0,Fri,Mon,Sat,Sun,Thu,Tue,Wed
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
for i, gen in enumerate(df.weekday):
    dummies_weekday.ix[i, validation_weekday(gen)] = 1
    
df = df.join(dummies_weekday)

df

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


Unnamed: 0,station_nbr,date,year,month,day,weekday,Fri,Mon,Sat,Sun,Thu,Tue,Wed
0,1,2012-01-01,2012,1,1,Sun,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,2,2012-01-01,2012,1,1,Sun,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,3,2012-01-01,2012,1,1,Sun,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,4,2012-01-01,2012,1,1,Sun,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,6,2012-01-01,2012,1,1,Sun,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,7,2012-01-01,2012,1,1,Sun,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6,8,2012-01-01,2012,1,1,Sun,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7,9,2012-01-01,2012,1,1,Sun,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8,10,2012-01-01,2012,1,1,Sun,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9,11,2012-01-01,2012,1,1,Sun,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## 월별 더미변수 및 컬럼 생성 

In [9]:
ls_month = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
token_month = '123456789101112'

def validation_month(token_month):
    result = []
    for i in ls_month:
        if i in token_month:
            token_month = token_month.replace(i, '')
            result.append(i)
    return result
        

cs_iter = (set(validation_month(token_month)) for x in df.month)
cs = sorted(set.union(*cs_iter))

dummies_month = pd.DataFrame(np.zeros((len(df), len(cs))), columns=cs)
dummies_month.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
for i, gen in enumerate(df.month):
    dummies_month.ix[i, validation_month(gen)] = 1
    
df = df.join(dummies_month)

df

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


Unnamed: 0,station_nbr,date,year,month,day,weekday,Fri,Mon,Sat,Sun,...,Wed,1,2,3,4,5,6,7,8,9
0,1,2012-01-01,2012,1,1,Sun,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,2012-01-01,2012,1,1,Sun,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,2012-01-01,2012,1,1,Sun,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,2012-01-01,2012,1,1,Sun,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6,2012-01-01,2012,1,1,Sun,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,7,2012-01-01,2012,1,1,Sun,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,8,2012-01-01,2012,1,1,Sun,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,9,2012-01-01,2012,1,1,Sun,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,10,2012-01-01,2012,1,1,Sun,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,11,2012-01-01,2012,1,1,Sun,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
