In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of
# every file it contains.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Library

In [None]:
import os
import sys
import numpy as np
import pandas as pd

import matplotlib as mpl
mpl.rcParams['agg.path.chunksize'] = 10000
import matplotlib.pyplot as plt

import datetime

from IPython.display import display
def _two_decimals(value):
    """Format a float with exactly two digits after the decimal point."""
    return "%.2f" % value


# Raise pandas' display limits (rows, columns, cell width) and show floats
# with two decimals.
for _option, _setting in (
    ('display.max_rows', 500),
    ('display.max_columns', 200),
    ("display.max_colwidth", 10000),
    ("display.float_format", _two_decimals),
):
    pd.set_option(_option, _setting)

# Default font size for matplotlib figures.
plt.rcParams['font.size'] = 15

# Load Data

In [None]:
# Read the competition's train and test CSV files into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/bike-sharing-demand/train.csv",
        "/kaggle/input/bike-sharing-demand/test.csv",
    )
)

In [None]:
# (rows, columns) of the train frame, then of the test frame.
print(df_train.shape, df_test.shape, sep="\n")

# Data cleaning

## 1. 関係ない項目を除く
* df_train : casual, registered, atemp
* df_test : atemp

In [None]:
# Remove 'atemp', 'casual' and 'registered' from the training frame and
# 'atemp' from the test frame, then report the resulting shapes.
df_train = df_train.drop(columns=['atemp', 'casual', 'registered'])
df_test = df_test.drop(columns=['atemp'])

print(df_train.shape, df_test.shape, sep="\n")

## 2. datetimeについて

In [None]:
# Convert the 'datetime' column of both frames to pandas datetimes so that
# calendar parts can be extracted from it.
df_train = df_train.assign(datetime=pd.to_datetime(df_train['datetime']))
df_test = df_test.assign(datetime=pd.to_datetime(df_test['datetime']))

In [None]:
# Derive calendar features for the training frame from the parsed timestamp.
# The vectorised ``.dt.strftime`` accessor replaces the per-row
# ``apply(lambda ...)`` calls: same string labels, no Python-level loop.
# Format codes: %Y = four-digit year, %b = abbreviated month name
# (locale-dependent), %d = zero-padded day of month, %H = zero-padded 24h hour.
df_train['year'] = df_train['datetime'].dt.strftime('%Y').astype('category')
df_train['month'] = df_train['datetime'].dt.strftime('%b').astype('category')
df_train['day'] = df_train['datetime'].dt.strftime('%d').astype('category')
df_train['hour'] = df_train['datetime'].dt.strftime('%H').astype('category')

In [None]:
# Derive calendar features for the test frame from the parsed timestamp.
# The vectorised ``.dt.strftime`` accessor replaces the per-row
# ``apply(lambda ...)`` calls: same string labels, no Python-level loop.
# Format codes: %Y = four-digit year, %b = abbreviated month name
# (locale-dependent), %d = zero-padded day of month, %H = zero-padded 24h hour.
df_test['year'] = df_test['datetime'].dt.strftime('%Y').astype('category')
df_test['month'] = df_test['datetime'].dt.strftime('%b').astype('category')
df_test['day'] = df_test['datetime'].dt.strftime('%d').astype('category')
df_test['hour'] = df_test['datetime'].dt.strftime('%H').astype('category')

In [None]:
# Shapes after the calendar columns were added (train first, then test).
print(df_train.shape, df_test.shape, sep="\n")

## 3. 外れ値

In [None]:
# Collapse every weather code above 3 into 3, independently in each frame.
# The test frame must be derived from its OWN 'weather' column: taking the
# values from df_train would overwrite the test feature with index-aligned
# training values that have nothing to do with the test rows.
df_train['weather'] = df_train['weather'].apply(lambda x: 3 if x > 3 else x)
df_test['weather'] = df_test['weather'].apply(lambda x: 3 if x > 3 else x)

## 4. データタイプ変換

In [None]:
# Column roles: numeric predictors, categorical predictors, and the target.
num_feats = ['temp', 'humidity', 'windspeed']
cat_feats = ['season', 'holiday', 'workingday', 'weather', 'year', 'month', 'hour']
target = ['count']

# Cast the categorical predictors to pandas' 'category' dtype in both frames.
for _frame in (df_train, df_test):
    _frame[cat_feats] = _frame[cat_feats].astype('category')

## 5. 特徴量の選別

In [None]:
# Predictors fed to the model: categorical columns first, then numeric ones.
# The target column ('count') is not part of this list.
num_feats = ['temp', 'humidity', 'windspeed']
cat_feats = ['season', 'holiday', 'workingday', 'weather', 'year', 'month', 'hour']

used_feats = [*cat_feats, *num_feats]

# Train,validationセット作成

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# partition identical between runs.
train_set, valid_set = train_test_split(
    df_train,
    test_size=0.2,
    random_state=42,
)

print(train_set.shape, valid_set.shape, sep="\n")

# 学習準備

In [None]:
# Separate each partition into its feature matrix (the used_feats columns)
# and its 'count' target; copies keep later edits off the source frames.
X_train, y_train = train_set[used_feats].copy(), train_set['count'].copy()
X_valid, y_valid = valid_set[used_feats].copy(), valid_set['count'].copy()

print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, sep="\n")

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numeric branch: standardise each numeric feature with StandardScaler.
# Steps the author left disabled:
#   ('imputer', SimpleImputer(strategy='constant', fill_value=0))
#       -- not needed: there are no missing values this time
#   ('attribs_adder', CombinedAttributesAdder())
#       -- adding this would show the importance of the features
num_pipeline = Pipeline(steps=[('std_scaler', StandardScaler())])

# Send numeric columns through the scaler and one-hot encode the categorical
# columns; the two outputs are concatenated column-wise.
full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_feats),
    ('cat', OneHotEncoder(), cat_feats),
])

In [None]:
# Train on log1p(count) so that errors are measured on the log scale.
y_trainl = np.log1p(y_train)
# Fit the preprocessing on the training features only and transform them.
X_train_prepared = full_pipeline.fit_transform(X_train)

# モデリング

## DecisionTreeRegressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision tree on the prepared features and the log1p target.
# random_state is fixed because scikit-learn randomly permutes the candidate
# features at every split, so an unseeded tree can differ from run to run;
# 42 matches the seed already used for the train/validation split.
tr = DecisionTreeRegressor(random_state=42)
tr.fit(X_train_prepared, y_trainl)

In [None]:
# mean_squared_log_error stays importable for any other cell that relies on it.
from sklearn.metrics import mean_squared_error, mean_squared_log_error

# Training-set RMSLE.
# y_trainl and the tree's predictions are already on the log1p scale, so the
# RMSLE of the counts is simply the RMSE between them. Passing these values to
# mean_squared_log_error would apply log1p a second time and report a
# misleadingly small error.
train_preds = tr.predict(X_train_prepared)
tr_msle = mean_squared_error(y_trainl, train_preds)
tr_rmsle = np.sqrt(tr_msle)
tr_rmsle

In [None]:
# Put the validation target on the same log1p scale as the training target.
y_validl = np.log1p(y_valid)
# Reuse the already-fitted preprocessing (transform only, no refit).
X_valid_prepared = full_pipeline.transform(X_valid)

In [None]:
from sklearn.metrics import mean_squared_error

# Validation-set RMSLE.
# y_validl and the tree's predictions are already on the log1p scale, so the
# RMSLE of the counts is the RMSE between them. mean_squared_log_error would
# take the logarithm a second time and understate the error.
valid_preds = tr.predict(X_valid_prepared)
tr_msle_valid = mean_squared_error(y_validl, valid_preds)
tr_rmsle_valid = np.sqrt(tr_msle_valid)
tr_rmsle_valid

# Submission

In [None]:
# Apply the fitted preprocessing to the model's feature columns of the test frame.
X_test_prepared = full_pipeline.transform(df_test.loc[:, used_feats])

In [None]:
# Predict on the log1p scale, then map back to counts with expm1.
# Round to the nearest integer before casting: astype(int) alone truncates
# toward zero, so a value that round-trips to e.g. 16.999999 would become 16.
preds = tr.predict(X_test_prepared)
preds = np.rint(np.expm1(preds)).astype(int)

In [None]:
# Pair each test timestamp with its predicted count.
submission_dt = df_test['datetime']
submission = pd.DataFrame(
    {
        "datetime": submission_dt,
        "count": preds,
    }
)

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv(
    "decisionTreereg_v1.csv",
    index=False,
)