In [1]:
import pandas as pd
import numpy as np

In [2]:
# parse_dates : 'datetime' 컬럼을 python date type 으로 처리하기 위해
train = pd.read_csv("data/train.csv", parse_dates=["datetime"])

print(train.shape)
train.head()

(10886, 12)


Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [3]:
# train 데이터에 년/월/일/시간/요일 컬럼 추가
train["datetime-year"] = train["datetime"].dt.year
train["datetime-month"] = train["datetime"].dt.month
train["datetime-day"] = train["datetime"].dt.day
train["datetime-hour"] = train["datetime"].dt.hour
train["datetime-dayofweek"] = train["datetime"].dt.dayofweek

print(train.shape)
train[["datetime", "datetime-year", "datetime-month", "datetime-day", "datetime-hour", "datetime-dayofweek"]].head()

(10886, 17)


Unnamed: 0,datetime,datetime-year,datetime-month,datetime-day,datetime-hour,datetime-dayofweek
0,2011-01-01 00:00:00,2011,1,1,0,5
1,2011-01-01 01:00:00,2011,1,1,1,5
2,2011-01-01 02:00:00,2011,1,1,2,5
3,2011-01-01 03:00:00,2011,1,1,3,5
4,2011-01-01 04:00:00,2011,1,1,4,5


In [4]:
# train 시킬 feature 선언
feature_names = ["season", "holiday", "workingday", "weather",
                 "temp", "atemp", "humidity", "windspeed",
                 "datetime-year", "datetime-hour", "datetime-dayofweek"]

feature_names

['season',
 'holiday',
 'workingday',
 'weather',
 'temp',
 'atemp',
 'humidity',
 'windspeed',
 'datetime-year',
 'datetime-hour',
 'datetime-dayofweek']

In [36]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_feature = feature_names + ["count"]
sc_train = pd.DataFrame(sc_X.fit_transform(train[sc_feature]))
sc_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-1.349864,-0.17149,-1.460672,-0.660178,-1.333661,-1.092737,0.993213,-1.567754,-1.003866,-1.668944,0.990793,-0.969294
1,-1.349864,-0.17149,-1.460672,-0.660178,-1.438907,-1.182421,0.941249,-1.567754,-1.003866,-1.524341,0.990793,-0.836797
2,-1.349864,-0.17149,-1.460672,-0.660178,-1.438907,-1.182421,0.941249,-1.567754,-1.003866,-1.379739,0.990793,-0.880962
3,-1.349864,-0.17149,-1.460672,-0.660178,-1.333661,-1.092737,0.68143,-1.567754,-1.003866,-1.235137,0.990793,-0.985856
4,-1.349864,-0.17149,-1.460672,-0.660178,-1.333661,-1.092737,0.68143,-1.567754,-1.003866,-1.090534,0.990793,-1.052104


In [39]:
X_train = np.array(sc_train.iloc[:, :-1])
X_train[:,5]

array([-1.09273697, -1.18242083, -1.18242083, ..., -0.91395927,
       -0.73518157, -0.82486544])

In [40]:
train_y = train.iloc[:, -1]
train_y.head()

0    5
1    5
2    5
3    5
4    5
Name: datetime-dayofweek, dtype: int64

In [32]:
y = np.array(train_y)
y[:5]

array([16, 40, 32, 13,  1], dtype=int64)

### Gradient Descent

In [33]:
num_epoch = 200000
learning_rate = 0.1

def getUniRandom():
    return np.random.uniform(low=0.0, high=1.0)

w_train = np.array([getUniRandom() for i in range(X_train.shape[1])])
b = getUniRandom()

for epoch in range (num_epoch):
    y_predict = X_train.dot(w_train) + b
    
    error = np.abs(y_predict - y).mean()
    if error < 4:
        break
                
    # ((y_predict - y) * x1).mean() 는 loss function 의 편미분 (공식 확인)
    for idx, w in enumerate(w_train):
        w_train[idx] = w_train[idx] - learning_rate * ((y_predict - y) * X_train[:,idx]).mean() 
        b =  b - learning_rate * (y_predict - y).mean()
 
    if epoch % 5000 == 0:
        print("{0:2} error={1:6f}".format(epoch, error))

print("{0:2} error={1:6f}".format(epoch, error))
print(w_train)
print(b)

 0 error=191.417495
5000 error=106.206404
10000 error=106.206409
15000 error=106.206409
20000 error=106.206409
25000 error=106.206409
30000 error=106.206409
35000 error=106.206409
40000 error=106.206409
45000 error=106.206409
50000 error=106.206409
55000 error=106.206409
60000 error=106.206409
65000 error=106.206409
70000 error=106.206409
75000 error=106.206409
80000 error=106.206409


KeyboardInterrupt: 