In [None]:
import pandas as pd

## Load Dataset

In [None]:
# Load the training split. "datetime" is parsed as a date column so that the
# .dt accessor (year / month / day / hour / minute / second) can be used on it.
train = pd.read_csv(
    "data/train.csv",
    parse_dates=["datetime"],
)

# Report the frame's shape, then preview its first rows.
print(train.shape)
train.head()

In [None]:
# Load the test split with the same date parsing as the training split.
test = pd.read_csv(
    "data/test.csv",
    parse_dates=["datetime"],
)

# Report the frame's shape, then preview its first rows.
print(test.shape)
test.head()

## Preprocessing

### Parse datetime

In [None]:
# Expand "datetime" into one column per component, each named
# "datetime-<component>". The tuple order fixes the order of the new columns.
train_dt = train["datetime"].dt
for component in ("year", "month", "day", "hour", "minute", "second", "dayofweek"):
    train["datetime-" + component] = getattr(train_dt, component)

# Shape check plus a preview that includes the new columns.
print(train.shape)
train.head()

In [None]:
# Expand "datetime" into one column per component, each named
# "datetime-<component>", so the test frame gets the same derived columns
# as the training frame.
test_dt = test["datetime"].dt
for component in ("year", "month", "day", "hour", "minute", "second", "dayofweek"):
    test["datetime-" + component] = getattr(test_dt, component)

# Shape check plus a preview that includes the new columns.
print(test.shape)
test.head()

## Explore

In [None]:
%matplotlib inline
import seaborn as sns

### datetime

In [None]:
import matplotlib.pyplot as plt

# 2x3 grid: one bar plot of "count" against each datetime component.
figure, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(nrows=2, ncols=3)
figure.set_size_inches(18, 8)

# Pair every axis with the column it shows, so the panels are drawn by a
# single call site instead of six copy-pasted ones.
panels = [
    (ax1, "datetime-year"),
    (ax2, "datetime-month"),
    (ax3, "datetime-day"),
    (ax4, "datetime-hour"),
    (ax5, "datetime-minute"),
    (ax6, "datetime-second"),
]

# Ending the cell on a loop (not a bare expression) also keeps the Axes repr
# out of the cell output; only the figure is rendered.
for axis, column in panels:
    sns.barplot(data=train, x=column, y="count", ax=axis)

In [None]:
# Build a "<year>-<month>" label by casting both parts to str (string) and
# joining them with a dash.
year_text = train["datetime-year"].astype("str")
month_text = train["datetime-month"].astype("str")
train["datetime-year_month"] = year_text + "-" + month_text

# Show the source timestamp next to the derived label.
print(train.shape)
train[["datetime", "datetime-year_month"]].head()

In [None]:
import matplotlib.pyplot as plt

# First figure: "count" by year and by month, side by side.
figure, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(18, 4))

sns.barplot(data=train, x="datetime-year", y="count", ax=ax1)
sns.barplot(data=train, x="datetime-month", y="count", ax=ax2)

# Second figure: "count" by the combined year-month label on a single wide axis.
figure, ax3 = plt.subplots(nrows=1, ncols=1, figsize=(18, 4))

sns.barplot(data=train, x="datetime-year_month", y="count", ax=ax3)

In [None]:
# "count" by hour, one line per value of "workingday".
figure, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(18, 4))

sns.pointplot(data=train, x="datetime-hour", y="count", hue="workingday", ax=ax1)

# "count" by hour, one line per day of the week.
figure, ax2 = plt.subplots(nrows=1, ncols=1, figsize=(18, 4))

sns.pointplot(data=train, x="datetime-hour", y="count", hue="datetime-dayofweek", ax=ax2)

## Train

In [None]:
# Columns used as model inputs, in the order they are passed to the estimator.
feature_names = [
    # Columns that are not derived in this notebook
    "season",
    "holiday",
    "workingday",
    "weather",
    "temp",
    "atemp",
    "humidity",
    "windspeed",
    # Columns derived from "datetime" in the preprocessing section
    "datetime-year",
    "datetime-hour",
    "datetime-dayofweek",
]
feature_names

In [None]:
# Training inputs: every row, restricted to the selected feature columns.
X_train = train.loc[:, feature_names]

print(X_train.shape)
X_train.head()

In [None]:
# Test inputs: every row, restricted to the same feature columns as X_train.
X_test = test.loc[:, feature_names]

print(X_test.shape)
X_test.head()

In [None]:
# Name of the target column, and the target values taken from the training frame.
label_name = "count"
y_train = train.loc[:, label_name]

print(y_train.shape)
y_train.head()

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor with default settings apart from a fixed seed,
# so that repeated runs build the same forest.
model = RandomForestRegressor(
    random_state=37,
)
model

### Score

Evaluation Metric = **Root Mean Squared Logarithmic Error** (RMSLE), where $p_i$ is the predicted count, $a_i$ is the actual count, and $n$ is the number of samples:
$$ \sqrt{\frac{1}{n} \sum_{i=1}^n (\log(p_i + 1) - \log(a_i+1))^2 } $$

In [None]:
import numpy as np
from sklearn.metrics import make_scorer

def rmsle(predict, actual):
    """Return the Root Mean Squared Logarithmic Error of two value sequences.

    Computes ``sqrt(mean((log(predict + 1) - log(actual + 1)) ** 2))``.
    The result is symmetric in its two arguments.

    Parameters
    ----------
    predict : array-like
        Predicted values.
    actual : array-like
        Observed values; must be broadcastable against ``predict``.

    Returns
    -------
    numpy floating scalar
        The RMSLE. Values at or below -1 make the logarithm undefined, in
        which case NumPy produces ``nan``/``-inf`` rather than raising.
    """
    predict = np.array(predict)
    actual = np.array(actual)

    # log1p(x) evaluates log(1 + x) without first rounding 1 + x, so very
    # small values keep their precision instead of collapsing to log(1) == 0.
    log_difference = np.log1p(predict) - np.log1p(actual)

    mean_squared_log_error = np.square(log_difference).mean()

    return np.sqrt(mean_squared_log_error)

# Wrap rmsle as a scikit-learn scorer. scikit-learn calls the wrapped function
# as (y_true, y_pred); rmsle squares the difference of its two arguments, so
# the argument order does not affect the result.
# NOTE(review): make_scorer defaults to greater_is_better=True. That is fine
# for reporting the score, but pass greater_is_better=False before using this
# scorer for model selection, because lower RMSLE is better.
rmsle_scorer = make_scorer(score_func=rmsle)
rmsle_scorer

In [None]:
from sklearn.model_selection import cross_val_score

# 20-fold cross-validation scored with the RMSLE scorer: one score per fold,
# then the mean across folds.
fold_scores = cross_val_score(
    model,
    X_train,
    y_train,
    cv=20,
    scoring=rmsle_scorer,
)
score = fold_scores.mean()

print("Score = {0:.5f}".format(score))

In [None]:
# Fit on the training features and labels; prediction on X_test is a separate step.
model.fit(X=X_train, y=y_train)

In [None]:
# Predict the target for every row of the test features.
predictions = model.predict(X=X_test)

# Show the shape of the predictions, then the values themselves.
print(predictions.shape)
predictions

## Submit

In [None]:
# Load the sample submission file, which is used as the template for the
# submission written out below.
submit = pd.read_csv(
    "data/sampleSubmission.csv",
)

print(submit.shape)
submit.head()

In [None]:
# Overwrite the template's "count" column with the model's predictions.
# NOTE(review): assumes the template rows are in the same order as test.csv,
# since the values are assigned by position — confirm.
submit["count"] = predictions

# Shape check plus a preview of the filled-in submission.
print(submit.shape)
submit.head()

In [None]:
# Write the submission file; index=False leaves the row index out of the CSV.
submit.to_csv(
    "data/baseline-script.csv",
    index=False,
)