In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

bicycle = pd.read_csv("./data/train.csv")
bicycle.head()

Unnamed: 0,date_time,wind_direction,sky_condition,precipitation_form,wind_speed,humidity,low_temp,high_temp,Precipitation_Probability,number_of_rentals
0,2018-04-01,207.5,4.0,0.0,3.05,75.0,12.6,21.0,30.0,22994
1,2018-04-02,208.317,2.95,0.0,3.278,69.833,12.812,19.0,19.5,28139
2,2018-04-03,213.516,2.911,0.0,2.69,74.879,10.312,15.316,19.113,26817
3,2018-04-04,143.836,3.692,0.425,3.138,71.849,8.312,12.368,43.493,26034
4,2018-04-05,95.905,4.0,0.723,3.186,73.784,5.875,10.421,63.378,2833


In [3]:
def seperate_datetime(dataframe):
    year = []
    month = []
    day = []

    for date in dataframe.date_time:
        year_point, month_point, day_point = date.split('-') # - 기준으로 string을 나누고 list로 만듦 ex) '2016-04-01' -> ['2016', '04', '01']
        year.append(int(year_point)-2017)
        month.append(int(month_point)-3)
        day.append(int(day_point))
    return year, month, day

year, month, day = seperate_datetime(bicycle)

bicycle['year'] = year
bicycle['month'] = month
bicycle['day'] = day

bicycle.head()

Unnamed: 0,date_time,wind_direction,sky_condition,precipitation_form,wind_speed,humidity,low_temp,high_temp,Precipitation_Probability,number_of_rentals,year,month,day
0,2018-04-01,207.5,4.0,0.0,3.05,75.0,12.6,21.0,30.0,22994,1,1,1
1,2018-04-02,208.317,2.95,0.0,3.278,69.833,12.812,19.0,19.5,28139,1,1,2
2,2018-04-03,213.516,2.911,0.0,2.69,74.879,10.312,15.316,19.113,26817,1,1,3
3,2018-04-04,143.836,3.692,0.425,3.138,71.849,8.312,12.368,43.493,26034,1,1,4
4,2018-04-05,95.905,4.0,0.723,3.186,73.784,5.875,10.421,63.378,2833,1,1,5


In [4]:
X  = bicycle.drop(['date_time', 'number_of_rentals'], axis=1)
y = bicycle.number_of_rentals
X

Unnamed: 0,wind_direction,sky_condition,precipitation_form,wind_speed,humidity,low_temp,high_temp,Precipitation_Probability,year,month,day
0,207.500,4.000,0.000,3.050,75.000,12.600,21.000,30.000,1,1,1
1,208.317,2.950,0.000,3.278,69.833,12.812,19.000,19.500,1,1,2
2,213.516,2.911,0.000,2.690,74.879,10.312,15.316,19.113,1,1,3
3,143.836,3.692,0.425,3.138,71.849,8.312,12.368,43.493,1,1,4
4,95.905,4.000,0.723,3.186,73.784,5.875,10.421,63.378,1,1,5
...,...,...,...,...,...,...,...,...,...,...,...
268,228.662,3.980,0.223,2.271,78.378,20.500,27.526,36.486,3,3,26
269,207.770,2.865,0.081,1.794,78.412,20.812,28.842,21.081,3,3,27
270,282.568,1.730,0.000,1.820,72.736,21.000,29.053,7.297,3,3,28
271,137.027,2.257,0.088,2.043,70.473,19.625,26.000,15.541,3,3,29


In [5]:
# metric 정의

import numpy as np

def NMAE(true, pred):
    score = np.mean(np.abs(true-pred) / true)
    return score

In [6]:
from sklearn.linear_model import LinearRegression

model = LinearRegression() # 모델 정의
model.fit(X, y) # 학습

y_hat = model.predict(X) # y 예측

score = NMAE(y, y_hat)

print(f'모델 NMAE: {score}')

모델 NMAE: 0.323479752120145


In [7]:
# 1번은 사람 손으로 만든 feature들을 사용합니다.
X_human = X.copy()

In [8]:
from sklearn.preprocessing import LabelEncoder

# 요일 정보 추가
week_day = pd.to_datetime(bicycle['date_time']).dt.day_name()
le = LabelEncoder()
le.fit(week_day)
X_human['week_day'] = le.transform(week_day)
X_human.head()

Unnamed: 0,wind_direction,sky_condition,precipitation_form,wind_speed,humidity,low_temp,high_temp,Precipitation_Probability,year,month,day,week_day
0,207.5,4.0,0.0,3.05,75.0,12.6,21.0,30.0,1,1,1,3
1,208.317,2.95,0.0,3.278,69.833,12.812,19.0,19.5,1,1,2,1
2,213.516,2.911,0.0,2.69,74.879,10.312,15.316,19.113,1,1,3,5
3,143.836,3.692,0.425,3.138,71.849,8.312,12.368,43.493,1,1,4,6
4,95.905,4.0,0.723,3.186,73.784,5.875,10.421,63.378,1,1,5,4


In [9]:
# 1. "일교차가 너무 큰 날씨"를 알기 위한 정보
X_human['temp_diff_info'] = X_human['high_temp'] - X_human['low_temp']

# 2. "덥고 습한 날씨"를 알기 위한 정보
X_human['sweat_info'] = X_human['high_temp'] * X_human['humidity'] 

# 3. "춥고 바람부는 날씨"를 알기 위한 정보
X_human['cold_info'] = X_human['low_temp'] * X_human['wind_speed'] 

X_human.head()

Unnamed: 0,wind_direction,sky_condition,precipitation_form,wind_speed,humidity,low_temp,high_temp,Precipitation_Probability,year,month,day,week_day,temp_diff_info,sweat_info,cold_info
0,207.5,4.0,0.0,3.05,75.0,12.6,21.0,30.0,1,1,1,3,8.4,1575.0,38.43
1,208.317,2.95,0.0,3.278,69.833,12.812,19.0,19.5,1,1,2,1,6.188,1326.827,41.997736
2,213.516,2.911,0.0,2.69,74.879,10.312,15.316,19.113,1,1,3,5,5.004,1146.846764,27.73928
3,143.836,3.692,0.425,3.138,71.849,8.312,12.368,43.493,1,1,4,6,4.056,888.628432,26.083056
4,95.905,4.0,0.723,3.186,73.784,5.875,10.421,63.378,1,1,5,4,4.546,768.903064,18.71775


In [11]:
## 1번 모델 성능 확인

model = LinearRegression() # 모델 정의
model.fit(X_human, y) # 학습

y_train = pd.read_csv('./data/test.csv')
y_test = pd.read_csv('./data/y_train.csv', encoding='cp949')
y_tes

score = NMAE(y_test, model.predict(y_test))

print(f'모델 NMAE: {score}')

ValueError: could not convert string to float: '2021-04-01'

In [12]:
y_test

Unnamed: 0,대여일시,대여건수,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,2021-04-01,116640,,,,
1,2021-04-02,110607,,,,
2,2021-04-03,14026,,,,
3,2021-04-04,94160,,,,
4,2021-04-05,110533,,,,
...,...,...,...,...,...,...
86,2021-06-26,105813,,,,
87,2021-06-27,92869,,,,
88,2021-06-28,106378,,,,
89,2021-06-29,111907,,,,
