In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
# Ignore every warning for the rest of the session.
# NOTE(review): this also hides pandas/seaborn deprecation and chained-assignment
# warnings raised by later cells - consider narrowing the filter.
warnings.filterwarnings('ignore')

import seaborn as sns #시각화를 위한 라이브러리
import matplotlib.pyplot as plt
import calendar 
from datetime import datetime

from sklearn.ensemble import RandomForestRegressor

In [None]:
# Read the training and test tables from the Kaggle input directory.
train = pd.read_csv(filepath_or_buffer="/kaggle/input/bike-sharing-demand/train.csv")
test = pd.read_csv(filepath_or_buffer="/kaggle/input/bike-sharing-demand/test.csv")

In [None]:
# Show the first rows of the training table.
train.head()

~~~
datetime - hourly date + timestamp  
season -  1 = spring, 2 = summer, 3 = fall, 4 = winter 
holiday - whether the day is considered a holiday
workingday - whether the day is neither a weekend nor holiday
weather - 1: Clear, Few clouds, Partly cloudy 
2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist 
3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds 
4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog 
temp - temperature in Celsius
atemp - "feels like" temperature in Celsius
humidity - relative humidity
windspeed - wind speed
casual - number of non-registered user rentals initiated
registered - number of registered user rentals initiated
count - number of total rentals
~~~

In [None]:
# Print column dtypes and non-null counts of the raw training table.
train.info()

In [None]:
# Split each datetime string on whitespace into a [date, time] list so the two
# parts can be analysed separately.
train["tempDate"] = train["datetime"].map(str.split)
train["tempDate"]

~~~
calendar.day_name[]
해당 함수는 주어진 데이터의 요일을 나타내는 배열
datetime.strptime()
날짜, 시간형식의 문자열을 datetime으로 변경하는 함수.
datetime.weekday()
요일을 반환. 0:월 ~ 6:일
~~~

In [None]:
# Pull the date text and the time text out of the [date, time] lists.
date_text = train['tempDate'].map(lambda parts: parts[0])
time_text = train['tempDate'].map(lambda parts: parts[1])

# Year, month and day are the three '-'-separated fields of the date (still strings here).
date_fields = date_text.str.split('-', expand=True)
train['year'] = date_fields[0]
train['month'] = date_fields[1]
train['day'] = date_fields[2]

# The hour is the first ':'-separated field of the time (still a string here).
train['hour'] = time_text.str.split(':').str[0]

# Weekday name (e.g. 'Monday') looked up from the parsed date.
train['weekday'] = date_text.map(
    lambda day: calendar.day_name[datetime.strptime(day, "%Y-%m-%d").weekday()]
)

In [None]:
# The extracted date parts are strings, so convert them to numbers.
# errors='coerce' turns any value that cannot be parsed into NaN.
for date_part in ('year', 'month', 'day', 'hour'):
    train[date_part] = pd.to_numeric(train[date_part], errors='coerce')

In [None]:
# Re-check dtypes after adding the numeric year/month/day/hour columns.
train.info()

In [None]:
# Mean rental count per year, month, day and hour: one bar chart per panel of a 2x2 grid.
fig = plt.figure(figsize=(12,10))

calendar_panels = []
for slot, unit in enumerate(('year', 'month', 'day', 'hour'), start=1):
    unit_means = train.groupby(unit)['count'].mean().reset_index()
    fig.add_subplot(2, 2, slot)  # becomes the current axes that seaborn draws on
    calendar_panels.append(sns.barplot(x=unit, y='count', data=unit_means))
ax1, ax2, ax3, ax4 = calendar_panels

In [None]:
# Mean rental count per season, holiday flag, working-day flag and weather code:
# one bar chart per panel of a 2x2 grid.
fig = plt.figure(figsize=[12,10])

category_panels = []
for slot, feature in enumerate(('season', 'holiday', 'workingday', 'weather'), start=1):
    feature_means = train.groupby(feature)['count'].mean().reset_index()
    fig.add_subplot(2, 2, slot)  # becomes the current axes that seaborn draws on
    category_panels.append(sns.barplot(x=feature, y='count', data=feature_means))
ax1, ax2, ax3, ax4 = category_panels

In [None]:
# Distribution of each continuous weather feature, using bins of width 1.
def _unit_bin_edges(values):
    """Return integer bin edges of width 1 that cover every value in `values`.

    The previous `range(int(min), int(max + 1))` stopped one edge short whenever
    the maximum had a fractional part, so the largest observations were left out
    of the histogram. floor/ceil also handle a negative minimum, which `int()`
    would truncate toward zero.
    """
    lowest = int(np.floor(values.min()))
    highest = max(int(np.ceil(values.max())), lowest + 1)  # always at least one bin
    return np.arange(lowest, highest + 1)  # +1: arange excludes its stop value


fig = plt.figure(figsize = (12,10))

distribution_panels = []
for slot, feature in enumerate(('temp', 'atemp', 'humidity', 'windspeed'), start=1):
    fig.add_subplot(2, 2, slot)  # becomes the current axes that seaborn draws on
    distribution_panels.append(
        sns.distplot(train[feature], bins=_unit_bin_edges(train[feature]))
    )
ax1, ax2, ax3, ax4 = distribution_panels

In [None]:
# Correlation heatmap of the numeric columns.
# `train` still holds string/list columns here (datetime, tempDate, weekday). Recent
# pandas versions no longer drop those silently inside corr(), so select the numeric
# columns explicitly before computing the correlation matrix.
fig = plt.figure(figsize=[20,20])
ax = sns.heatmap(train.select_dtypes(include='number').corr(),annot=True,square=True)

In [None]:
# Mean hourly rental count broken down by season, holiday flag, weekday and weather.
fig = plt.figure(figsize=[12,10])

# Legend/line order for the weekday panel. The names must match calendar.day_name
# exactly; a misspelt entry silently removes that day's curve from the plot.
weekday_order = ['Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday']

# hour x season
ax1 = fig.add_subplot(2,2,1)
ax1 = sns.pointplot(x='hour',y='count',hue='season',data=train.groupby(['season','hour'])['count'].mean().reset_index())

# hour x holiday flag
ax2 = fig.add_subplot(2,2,2)
ax2 = sns.pointplot(x='hour',y='count',hue='holiday',data=train.groupby(['holiday','hour'])['count'].mean().reset_index())

# hour x weekday
ax3 = fig.add_subplot(2,2,3)
ax3 = sns.pointplot(x='hour',y='count',hue='weekday',hue_order=weekday_order,data=train.groupby(['weekday','hour'])['count'].mean().reset_index())

# hour x weather
ax4 = fig.add_subplot(2,2,4)
ax4 = sns.pointplot(x='hour',y='count',hue='weather',data=train.groupby(['weather','hour'])['count'].mean().reset_index())

데이터 확인 결과 windspeed에서 0값이 많이 발생함. <br>
바람이 0인 경우는 거의 없는데 이러한 결과가 발생 -> 데이터를 활용해 windspeed 값을 변경해줘야 함.

In [None]:
# A model cannot be trained on raw strings, so turn the weekday names into a
# categorical column; a following cell recodes it as 0 (Sunday) .. 6 (Saturday).
train['weekday'] = pd.Categorical(train['weekday'])
print(train['weekday'])

In [None]:
# Recode weekday names as '0' (Sunday) .. '6' (Saturday).
# An explicit name -> code mapping replaces the positional assignment to
# `.cat.categories`: that setter was removed in pandas 2.0 and relied on the
# alphabetical category order. With a mapping, re-running the cell is a no-op
# because categories absent from the mapping are passed through unchanged.
weekday_codes = {
    'Sunday': '0', 'Monday': '1', 'Tuesday': '2', 'Wednesday': '3',
    'Thursday': '4', 'Friday': '5', 'Saturday': '6',
}
train['weekday'] = train['weekday'].cat.rename_categories(weekday_codes)
print(train['weekday'])

In [None]:
# Replace zero wind-speed readings with values predicted by a random forest:
#   1. split train into rows where windspeed == 0 and rows where it is not,
#   2. fit the forest on the non-zero rows (features -> windspeed),
#   3. predict windspeed for the zero rows and overwrite them.

# .copy() gives independent frames, so the assignment at the end writes to
# `windspeed_0` itself rather than to a slice of `train` (chained assignment).
windspeed_0 = train[train.windspeed == 0].copy()
windspeed_not0 = train[train.windspeed != 0].copy()

features = ['season','holiday','workingday','weather', 'temp', 'atemp', 'humidity']
x = pd.get_dummies(windspeed_not0[features])
x_test = pd.get_dummies(windspeed_0[features])
y = windspeed_not0['windspeed']

# Fixed seed so Restart & Run All reproduces the same imputed values.
rf = RandomForestRegressor(random_state=0)
rf.fit(x, y)
predicted_windspeed_0 = rf.predict(x_test)

windspeed_0['windspeed'] = predicted_windspeed_0

In [None]:
# Stack the imputed rows on top of the untouched rows (row-wise concatenation).
train = pd.concat(objs=[windspeed_0, windspeed_not0])

In [None]:
# Restore chronological order: parse the datetime strings (unparseable values
# become NaT) and sort the rows by them.
train['datetime'] = pd.to_datetime(train['datetime'], errors='coerce')
train = train.sort_values('datetime')
train['datetime']

In [None]:
# Correlation heatmap again, now with the imputed windspeed values.
# Only numeric columns are correlated: `train` also holds datetime, list and
# categorical columns, which recent pandas versions no longer drop silently in corr().
fig = plt.figure(figsize=[20,20])
ax = sns.heatmap(train.select_dtypes(include='number').corr(),annot=True,square=True)

train과 test 동시에 전처리 진행

In [None]:
# Reload the raw files and preprocess train and test together so both receive
# exactly the same engineered features.
train = pd.read_csv("/kaggle/input/bike-sharing-demand/train.csv")
test = pd.read_csv("/kaggle/input/bike-sharing-demand/test.csv")

combine = pd.concat([train,test],axis=0)

# Split each datetime string into [date, time]; the column is kept because a later
# cell drops 'tempDate' by name.
combine['tempDate'] = combine.datetime.apply(lambda x:x.split())
combine['weekday'] = combine.tempDate.apply(lambda x: calendar.day_name[datetime.strptime(x[0],"%Y-%m-%d").weekday()])
combine['year'] = combine.tempDate.apply(lambda x: x[0].split('-')[0])
combine['month'] = combine.tempDate.apply(lambda x: x[0].split('-')[1])
combine['day'] = combine.tempDate.apply(lambda x: x[0].split('-')[2])
combine['hour'] = combine.tempDate.apply(lambda x: x[1].split(':')[0])

# The extracted parts are strings; convert them to numbers (unparseable -> NaN).
for date_part in ('year', 'month', 'day', 'hour'):
    combine[date_part] = pd.to_numeric(combine[date_part], errors='coerce')

# Recode weekday names as '0' (Sunday) .. '6' (Saturday) with an explicit mapping.
# Assigning to `.cat.categories` was removed in pandas 2.0 and depended on the
# alphabetical order of the category names.
weekday_codes = {
    'Sunday': '0', 'Monday': '1', 'Tuesday': '2', 'Wednesday': '3',
    'Thursday': '4', 'Friday': '5', 'Saturday': '6',
}
combine['weekday'] = combine['weekday'].astype('category').cat.rename_categories(weekday_codes)

# Impute zero wind-speed readings with a random forest fitted on the non-zero rows.
# A plain boolean array is used as the row mask because `combine` has duplicate
# index labels (train and test were concatenated without re-indexing).
is_calm = (combine['windspeed'] == 0).values

features = ['season','holiday','workingday','weather', 'temp', 'atemp', 'humidity']
x2 = pd.get_dummies(combine.loc[~is_calm, features])
x2_test = pd.get_dummies(combine.loc[is_calm, features])
y2 = combine.loc[~is_calm, 'windspeed']

# Fixed seed so Restart & Run All reproduces the same imputed values.
rf2 = RandomForestRegressor(random_state=0)
rf2.fit(x2, y2)
# Predict with the model fitted in this cell (rf2), not a model left over from an
# earlier cell.
predicted = rf2.predict(x2_test)

# Write the predictions back into `combine` itself: the following cells rebuild
# train/test from `combine`, so values stored anywhere else would be discarded.
combine.loc[is_calm, 'windspeed'] = predicted


In [None]:
# Columns that are cast to the pandas 'category' dtype before modelling.
categorizational_columns = [
    'holiday',
    'humidity',
    'season',
    'weather',
    'workingday',
    'year',
    'month',
    'day',
    'hour',
]
# Columns removed from the feature matrices (raw timestamp, targets, helper column).
drop_columns = [
    'datetime',
    'casual',
    'registered',
    'count',
    'tempDate',
]

In [None]:
# Cast every listed column to the 'category' dtype in a single astype call.
combine = combine.astype({name: 'category' for name in categorizational_columns})

In [None]:
# Rows with a known 'count' are the training set; rows without one are the test set.
has_target = combine['count'].notna()
train = combine[has_target].sort_values(by='datetime')
test = combine[~has_target].sort_values(by='datetime')

# Values kept aside before the columns are dropped from the feature frames.
datetimecol = test['datetime']              # timestamps for the submission file
yLabels = train['count']                    # total rentals
yLabelsRegistered = train['registered']     # rentals by registered users
yLabelsCasual = train['casual']             # rentals by casual (non-registered) users

In [None]:
# Remove the timestamp, target and helper columns so only model features remain.
train, test = (frame.drop(columns=drop_columns) for frame in (train, test))

In [None]:
def rmsle(y, y_, convertExp=True):
    """Return the root mean squared logarithmic error between `y` and `y_`.

    Parameters
    ----------
    y : array-like
        Actual values.
    y_ : array-like
        Predicted values, same length as `y`.
    convertExp : bool, default True
        When True, np.exp is applied to both inputs before the error is computed.

    Returns
    -------
    numpy.float64
        sqrt(mean((log(y + 1) - log(y_ + 1)) ** 2)); non-finite logarithms are
        replaced by np.nan_to_num before the difference is taken.
    """
    actual = np.asarray(y, dtype=float)
    predicted = np.asarray(y_, dtype=float)
    if convertExp:
        # A stray trailing comma used to turn `y` into a one-element tuple here.
        actual = np.exp(actual)
        predicted = np.exp(predicted)
    log_actual = np.nan_to_num(np.log1p(actual))
    log_predicted = np.nan_to_num(np.log1p(predicted))
    return np.sqrt(np.mean((log_actual - log_predicted) ** 2))

In [None]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso


# Baseline: ordinary least squares on the log1p-transformed rental count.
lr = LinearRegression()

yLabelslog = np.log1p(yLabels)
lr.fit(train,yLabelslog)
preds = lr.predict(train)
# Score on the original count scale. np.expm1 is the inverse of np.log1p; using
# np.exp here would shift both series by +1 and distort the reported error.
# Note that this is the error on the training data itself.
print('RMSLE Value For Linear Regression: {}'.format(rmsle(yLabels,np.expm1(preds),False)))

In [None]:
# Predict on the test features and write the submission file.
predict_test = lr.predict(test)
submission = pd.DataFrame({
        "datetime": datetimecol,
        # The model predicts log1p(count), so invert with expm1 (np.exp would add 1
        # to every prediction) and clip at 0 because a count cannot be negative.
        "count": np.clip(np.expm1(predict_test), 0, None)
    })
submission.to_csv('bike_predictions_gbm_separate_without_fe.csv', index=False)