In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found,
# one path per line, in the order os.walk yields them.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. 데이터 전처리

In [None]:
# Read the training split, the test split and the sample submission template
# from the competition's input directory.
train, test, submit = map(
    pd.read_csv,
    (
        '/kaggle/input/bike-sharing-demand/train.csv',
        '/kaggle/input/bike-sharing-demand/test.csv',
        '/kaggle/input/bike-sharing-demand/sampleSubmission.csv',
    ),
)

In [None]:
# Preview the first rows of the freshly loaded training frame.
train.head()

In [None]:
# Print the column summary (dtypes and non-null counts) of the training frame.
train.info()

### 결측치 없음

In [None]:
import datetime
# Convert the `datetime` column to pandas datetimes so the `.dt` accessor can be used on it.
train['datetime']= pd.to_datetime(train['datetime'])

In [None]:
# Derive calendar features from the parsed timestamp. The tuple order is the
# order in which the new columns are appended to the frame.
for part in ('year', 'month', 'day', 'weekday', 'hour'):
    train[part] = getattr(train['datetime'].dt, part)

In [None]:
# Inspect the last rows of the training frame, now including the derived calendar columns.
train.tail()

In [None]:
# Inspect the first rows of the training frame, now including the derived calendar columns.
train.head()

In [None]:
# Show one randomly drawn row as a spot check (no seed is set, so the row differs between runs).
train.sample()

In [None]:
# Count fully duplicated rows in the training frame. A single number answers
# "are there duplicates?" directly, whereas the raw per-row boolean Series is
# truncated on display and cannot be read at a glance.
train.duplicated().sum()

In [None]:
# Summary statistics for the training frame's columns.
train.describe()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Print the column summary (dtypes and non-null counts) of the test frame.
test.info()

### test data에는 casual, registered, count(target) column이 없음

In [None]:
# Summary statistics for the test frame's columns.
test.describe()

In [None]:
# Count fully duplicated rows in the test frame. A single number is readable
# at a glance, unlike the truncated per-row boolean Series.
test.duplicated().sum()

In [None]:
# Parse the timestamp column, then derive the same calendar features, in the
# same order, as were added to the training frame.
test['datetime'] = pd.to_datetime(test['datetime'])
for part in ('year', 'month', 'day', 'weekday', 'hour'):
    test[part] = getattr(test['datetime'].dt, part)

In [None]:
# Re-check the test frame's column summary after adding the calendar columns.
test.info()

In [None]:
# Preview the test frame with the derived calendar columns.
test.head()

In [None]:
# Remove the columns that are not used from the training data: `casual` and
# `registered` are absent from the test set, so they cannot serve as features.
# `errors='ignore'` keeps this cell idempotent: re-running it after the columns
# are already gone is a no-op instead of raising KeyError.
train = train.drop(columns=['casual', 'registered'], errors='ignore')

In [None]:
# Confirm the column drop by previewing the first rows.
train.head()

In [None]:
# Display the training frame as this cell's output.
train

# 2. 데이터 시각화(EDA)

#### profiling tool로 시각화 1차 확인

In [None]:
# Build an automated profiling report of the training frame for a first-pass look at the data.
# NOTE(review): the `pandas_profiling` package appears to have been renamed to
# `ydata_profiling` upstream — confirm this import still resolves in the target environment.
import pandas_profiling as pp
pp.ProfileReport(train)

**상관관계란?**
- 일반적으로 두 변수가 함께 변화하는 직선적 관계, 
- 상관계수(correlation coefficient) 'r'을 구하여 상관정도를 나타냄, 상관계수가 클수록 두 변수 사이에 직선적 관계가 강하다는 것을 의미함

**상관관계 분석 결과**
- workingday, weekday 전혀 상관없음
- temp, atemp 상관관계 당연히 높음
- season, month 상관관계 높음
- weather, humidity 관련성 있어보임
- count(종속변수)에 상관관계 높은 독립변수는 temp, atemp, hour로 3가지  


* temp 10~30 일 때 count(total rentals) 높음
* hour 5-10시, 15-20시에 count 높음 --> 출퇴근 시간에 대여 빈도 높은듯


In [None]:
# Pairwise correlation matrix of the numeric columns only.
# `train` still contains the `datetime` column at this point. Older pandas
# dropped non-numeric columns from `corr()` silently, but from pandas 2.0 the
# `numeric_only` default is False, so the selection is made explicit here to
# get the same matrix on every pandas version.
train.select_dtypes(include='number').corr()

#### count 이상치(outlier) 제거

In [None]:
import seaborn as sns
# Box plot of the target to eyeball outliers. The Series is passed as `x=`
# explicitly: a bare positional argument is read as `x` by older seaborn
# releases (with a FutureWarning) and as `data` by newer ones, so the
# orientation of the plot would otherwise depend on the installed version.
sns.boxplot(x=train['count'])

In [None]:
# Keep only the rows whose `count` lies within three standard deviations of
# the mean, i.e. |value - mean| <= 3 * std.
count_center = train["count"].mean()
count_limit = 3 * train["count"].std()
within_limit = np.abs(train["count"] - count_center) <= count_limit
train = train[within_limit]

## 범주형 변수

In [None]:
# Display the derived `weekday` column (taken from `.dt.weekday` of the timestamp).
train['weekday']

In [None]:
import matplotlib.pyplot as plt
# Box plots of `count` against each categorical feature on a 2x2 grid.
# Exactly four panels are drawn, so the grid has four axes and no empty ones.
fig, axes = plt.subplots(ncols=2, nrows=2)
fig.set_size_inches(15, 10)
sns.boxplot(data=train, x='season', y='count', ax=axes[0][0])
sns.boxplot(data=train, x='holiday', y='count', ax=axes[0][1])
sns.boxplot(data=train, x='workingday', y='count', ax=axes[1][0])
sns.boxplot(data=train, x='weather', y='count', ax=axes[1][1])

# Distribution of `count` per hour of day, on its own figure. The plot is bound
# to that figure's axes explicitly rather than relying on the "current axes".
fig1, axes1 = plt.subplots()
fig1.set_size_inches(15, 10)
sns.boxplot(data=train, x='hour', y='count', ax=axes1)

# Distribution of `count` per weekday. The size is applied to fig2 itself so
# this figure matches the others, and the plot is drawn once.
fig2, axes2 = plt.subplots()
fig2.set_size_inches(15, 10)
sns.boxplot(data=train, x='weekday', y='count', ax=axes2)

#### 봄에 비교적 count 적음
#### workingday = 1 에 outlier 비교적 많음
#### weather = 1(맑음)에 count 높음
#### 아침 8시, 오후 5시에 count 높음

## 연속형 변수

In [None]:
# Annotated correlation heatmap over the continuous / calendar columns and the target.
numeric = [
    "temp", "atemp", "humidity", "windspeed",
    "count", "year", "month", "day", "hour",
]
corr_fig, corr_ax = plt.subplots(figsize=(15, 8))
corr_matrix = train[numeric].corr()
sns.heatmap(corr_matrix, annot=True, ax=corr_ax)

In [None]:
# EDA section adapted from reference material.
# Mean `count` per month, drawn as a bar chart.
plt.figure(figsize=(9, 6))
monthagg = train.groupby('month')['count'].mean().reset_index()
month_ax = sns.barplot(data=monthagg, x='month', y='count')
month_ax.set(title = 'Count by Month')

In [None]:
# Mean `count` per hour of day, one line per season.
plt.figure(figsize=(9, 6))
houragg = train.groupby(['hour', 'season'])['count'].mean().reset_index()
hour_season_ax = sns.pointplot(data=houragg, x='hour', y='count', hue='season')
hour_season_ax.set(title='Count by Hour, Season')

In [None]:
# Mean `count` per hour of day, one line per weekday.
plt.figure(figsize=(9, 6))
hourweekagg = train.groupby(['hour', 'weekday'])['count'].mean().reset_index()
hour_weekday_ax = sns.pointplot(data=hourweekagg, x='hour', y='count', hue='weekday')
hour_weekday_ax.set(title='Count by Hour,Weekday')

# 4. 학습 및 평가 (모델링)

In [None]:
# The raw timestamp has already been expanded into year/month/day/weekday/hour,
# so drop it from both frames before modelling.
# `errors='ignore'` keeps the cell idempotent: re-running it after the column
# is already gone is a no-op instead of raising KeyError.
train = train.drop(columns=['datetime'], errors='ignore')
test = test.drop(columns=['datetime'], errors='ignore')

In [None]:
#feature, target 분리
from sklearn.model_selection import train_test_split
# Separate the target column from the features.
target_column = 'count'
y = train[target_column]
X = train.drop(columns=[target_column])


In [None]:
import xgboost as xg
# Hyper-parameters for the gradient-boosted regressor, gathered in one place
# so they are easy to scan and tweak.
xgb_params = dict(
    max_depth=8,
    min_child_weight=6,
    gamma=0.4,
    colsample_bytree=0.6,
    subsample=0.6,
)
xgr = xg.XGBRegressor(**xgb_params)
# Fit on the full training set (no hold-out split is made in this notebook).
xgr.fit(X, y)

# 5. 예측

In [None]:
# Predict `count` for every row of the test frame with the fitted regressor,
# then display the raw prediction array as the cell output.
y_output=xgr.predict(test)
y_output

In [None]:
# Write the predictions into the submission template.
# A regressor with an unconstrained output can predict slightly negative
# values, but a rental count cannot be negative, and a log-based score such as
# RMSLE (presumably the competition metric — confirm) is undefined below -1.
# Clip at zero so every submitted value is valid.
submit['count'] = np.clip(y_output, 0, None)

In [None]:
# Save the submission file to the current working directory, without the index column.
submit.to_csv('xgb-kiy.csv',index=False)

In [None]:
# from xgboost import XGBRegressor
# from lightgbm import LGBMRegressor
# from catboost import CatBoostRegressor

In [None]:
# #앙상블 모델 학습
# from sklearn.ensemble import RandomForestRegressor,VotingRegressor,StackingRegressor
# xgb = XGBRegressor(n_estimators=700,
#                   max_depth=7,
#                   learning_rate=0.05)
# xgb.fit(x_train,y_train)

# lgb = LGBMRegressor(n_estimators=600,
#                    max_depth=6,
#                    learning_rate=0.1)
# lgb.fit(x_train,y_train)

# ctb = CatBoostRegressor(n_estimators=500,
#                    max_depth=5,
#                    learning_rate=0.1,verbose=0)
# ctb.fit(x_train,y_train)

# ensemble = [('xgb',xgb),('lgb',lgb),('ctb',ctb)]
# voting_gb = VotingRegressor(ensemble).fit(x_train,y_train)

In [None]:
# names_gb = ['xgb','lgb','ctb','gb ensemble']
# scores_gb = []
# for counter,i in enumerate([xgb,lgb,ctb,voting_gb]):
#     scores_gb.append(np.log( mean_squared_error(y_test,i.predict(x_test)) ))
    
# tmp = pd.DataFrame(scores_gb,names_gb).T

# tmp