In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # statistical visualisation

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition files: the labelled training data, the unlabelled test
# data, and the sample submission that is later used as the output template.
train = pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv')
test = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv')
submission = pd.read_csv('/kaggle/input/bike-sharing-demand/sampleSubmission.csv')

In [None]:
# Preview the first five rows of the training data.
train.head(n=5)

In [None]:
# Preview the first five rows of the test data.
test.head(n=5)

## 데이터의 기본 구조 확인, 결측치 확인, 컬럼별 정보 확인

In [None]:
# Summarise the training frame: column dtypes, non-null counts and memory usage.
train.info()

In [None]:
# Count missing values per column in the training set.
train.isna().sum()

In [None]:
# Count missing values per column in the test set.
test.isna().sum()

In [None]:
# Pairwise correlation between the numeric columns. `datetime` has not been
# parsed yet and is still a string column; pandas >= 2.0 raises on non-numeric
# data unless numeric_only=True is passed.
train.corr(numeric_only=True)

In [None]:
# Visualise the correlation matrix as an annotated heatmap.
# numeric_only=True skips the still-unparsed `datetime` string column, which
# would otherwise make corr() raise on pandas >= 2.0.
plt.figure(figsize=(12, 12))
sns.heatmap(train.corr(numeric_only=True), annot=True)

## datetime 컬럼 활용

In [None]:
# Convert the `datetime` column of both splits into pandas datetimes so the
# `.dt` accessor can be used on it.
for frame in (train, test):
    frame['datetime'] = pd.to_datetime(frame['datetime'])

In [None]:
# Derive one training column per timestamp component, in this order.
for part in ('year', 'month', 'day', 'hour', 'minute', 'second', 'dayofweek'):
    train[part] = getattr(train['datetime'].dt, part)

In [None]:
# Derive the same timestamp components for the test split, in the same order
# as for the training split.
for part in ('year', 'month', 'day', 'hour', 'minute', 'second', 'dayofweek'):
    test[part] = getattr(test['datetime'].dt, part)

In [None]:
# Preview the training data again, now including the derived datetime columns.
train.head(n=5)

In [None]:
# Average rental count per season, drawn as a bar chart.
season_means = train.groupby('season')['count'].mean()
season_means.plot.bar()

In [None]:
import seaborn as sns

In [None]:
# Number of training rows per calendar month. The column is passed by keyword
# because seaborn >= 0.12 interprets the first positional argument as `data`,
# not as the x variable.
sns.countplot(x='month', data=train)

In [None]:
# Mean rental count per month, split by year.
sns.barplot(data=train, x='month', y='count', hue='year')

In [None]:
# Distribution of rental counts per month, split by year.
sns.boxplot(data=train, x='month', y='count', hue='year')

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Hourly rental profile with one line per day of the week.
plt.figure(figsize=(12, 8))
sns.pointplot(data=train, x='hour', y='count', hue='dayofweek')

In [None]:
# Hourly rental profile of registered users.
plt.figure(figsize=(12, 8))
sns.pointplot(data=train, x='hour', y='registered')

In [None]:
# Hourly rental profile of casual (non-registered) users.
plt.figure(figsize=(12, 8))
sns.pointplot(data=train, x='hour', y='casual')

In [None]:
# List the training columns, which now include the derived datetime features.
train.columns

In [None]:
# Distinct day-of-month values present in the training split, in ascending order.
sorted(train['day'].value_counts().index)

In [None]:
# Distinct day-of-month values present in the test split, in ascending order.
sorted(test['day'].value_counts().index)

## interpolation
[interpolation 예제](https://teddylee777.github.io/pandas/pandas-interpolation)

In [None]:
# Mark a windspeed of exactly 0 as missing in both splits so that it can be
# imputed afterwards.
for frame in (train, test):
    frame.loc[frame['windspeed'] == 0, 'windspeed'] = np.nan

In [None]:
# Number of training rows whose windspeed is now missing.
train['windspeed'].isna().sum()

In [None]:
# Number of test rows whose windspeed is now missing.
test['windspeed'].isna().sum()

In [None]:
# Impute missing training windspeed by interpolating between neighbouring rows.
# Any gaps the interpolation leaves behind are filled with the column median,
# which is computed on the column before interpolation.
train_windspeed_median = train['windspeed'].median()
train['windspeed'] = train['windspeed'].interpolate().fillna(train_windspeed_median)

In [None]:
# Impute missing test windspeed by interpolating between neighbouring rows.
# Any gaps the interpolation leaves behind are filled with the column median,
# which is computed on the column before interpolation.
test_windspeed_median = test['windspeed'].median()
test['windspeed'] = test['windspeed'].interpolate().fillna(test_windspeed_median)

In [None]:
# Line plot of the imputed windspeed for test rows with index labels up to 1000.
windspeed_head = test.loc[:1000, 'windspeed']
plt.plot(windspeed_head)

In [None]:
# Histogram of the raw rental counts.
plt.figure(figsize=(10, 8))
rental_counts = train['count']
sns.histplot(rental_counts)

In [None]:
# Histogram of the log-transformed rental counts. log1p is used instead of log
# so the plot shows the same transform the models are trained on, and so the
# value stays finite for a count of zero.
plt.figure(figsize=(10, 8))
sns.histplot(np.log1p(train['count']))

In [None]:
# Sample value used to demonstrate the log1p / expm1 round trip.
a = 10

In [None]:
# log1p(a) computes log(1 + a); display the transformed value.
b = np.log1p(a)
b

In [None]:
# expm1 is the inverse of log1p, so this recovers `a` (up to floating-point error).
np.expm1(b)

In [None]:
# Feature columns fed to the models.
# Left out on purpose: 'datetime' (replaced by its derived parts) and 'day'.
# 'atemp' is still included for now; the original note marks it for removal
# because of its very high correlation (about 0.98, presumably with 'temp'),
# i.e. a multicollinearity concern.
cols = [
    'season', 'holiday', 'workingday', 'weather',
    'temp', 'atemp', 'humidity', 'windspeed',
    'year', 'month', 'hour', 'dayofweek',
]

In [None]:
# labels = [
#     'count'
# ]

In [None]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [None]:
# Display the feature matrix that will be passed to the models.
train.loc[:, cols]

In [None]:
# Display the target columns. `labels` is never defined (the cell that would
# define it is commented out), so the columns are selected explicitly:
# the two model targets plus their total.
train[['casual', 'registered', 'count']]

In [None]:
# Train a random forest that predicts rentals by casual (non-member) users.
# The target is log1p-transformed; predictions are mapped back with expm1 later.
# random_state is fixed so that re-running the notebook rebuilds the same forest.
model1 = RandomForestRegressor(n_estimators=300, random_state=42)
model1.fit(train[cols], np.log1p(train['casual']))

In [None]:
# Train a random forest that predicts rentals by registered (member) users.
# The target is log1p-transformed; predictions are mapped back with expm1 later.
# random_state is fixed so that re-running the notebook rebuilds the same forest.
model2 = RandomForestRegressor(n_estimators=300, random_state=42)
model2.fit(train[cols], np.log1p(train['registered']))

In [None]:
# Predict casual and registered rentals separately on the test features;
# the two predictions are summed in a later cell.
test_features = test[cols]
pred1 = model1.predict(test_features)
pred2 = model2.predict(test_features)

In [None]:
# Undo the log1p transform on each prediction, then add casual and registered
# rentals to get the total count.
casual_pred = np.expm1(pred1)
registered_pred = np.expm1(pred2)
pred = casual_pred + registered_pred

In [None]:
# Write the combined predictions into the 'count' column of the submission frame.
submission['count'] = pred

In [None]:
# Inspect the predicted counts that will be submitted.
submission['count']

In [None]:
# Save the submission to the working directory without the index column.
submission.to_csv(path_or_buf='08-submission.csv', index=False)