In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the training and test CSV files into DataFrames; both are read from the
# same input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/bike-sharing-demand/train.csv",
        "../input/bike-sharing-demand/test.csv",
    )
)

# Show the first rows of the training frame as the cell output.
train_df.head()

In [None]:
# Print the column summary of the freshly loaded training frame.
train_df.info()

In [None]:
# Print the column summary of the freshly loaded test frame.
test_df.info()

In [None]:
def add_datetime_parts(df):
    """Return a copy of ``df`` with calendar features derived from ``datetime``.

    The ``datetime`` column is parsed with ``pd.to_datetime`` and four integer
    columns are appended in this order: ``year``, ``month``, ``day``, ``hour``.
    The input frame itself is not modified.
    """
    stamps = pd.to_datetime(df['datetime'])
    # Vectorised ``.dt`` accessors replace a per-row ``apply(lambda ...)``.
    return df.assign(
        datetime = stamps,
        year = stamps.dt.year,
        month = stamps.dt.month,
        day = stamps.dt.day,
        hour = stamps.dt.hour,
    )


# Same feature extraction for both frames, so they cannot drift apart.
train_df = add_datetime_parts(train_df)
test_df = add_datetime_parts(test_df)

# Keep the parsed test timestamps: they become the 'datetime' column of the
# submission frame built later, after the column is dropped from the features.
test_df_datetime = test_df['datetime']

# 'casual' and 'registered' are removed from the training frame only; the test
# frame just loses 'datetime'.
# NOTE(review): presumably these two columns are absent from test.csv -- confirm.
train_df = train_df.drop(columns = ['datetime', 'casual', 'registered'])
test_df = test_df.drop(columns = 'datetime')

In [None]:
# Readable labels for the integer codes. The season names belong to the
# 'season' column and the condition names to the 'weather' column (the two
# dictionaries were previously applied to the wrong columns, which mislabelled
# every plot axis and dummy column derived from them).
SEASON_LABELS = {1:'Spring', 2:'Summer', 3:'Fall', 4:'Winter'}
WEATHER_LABELS = {1:'Clear', 2:'Mist', 3:'Light rain', 4:'Heavy rain'}

# Apply the same labelling to both frames so their categories stay consistent.
# Caveat: re-running this cell maps the already-labelled strings to NaN,
# because they are no longer keys of the dictionaries.
train_df['season'] = train_df['season'].map(SEASON_LABELS)
train_df['weather'] = train_df['weather'].map(WEATHER_LABELS)

test_df['season'] = test_df['season'].map(SEASON_LABELS)
test_df['weather'] = test_df['weather'].map(WEATHER_LABELS)

In [None]:
# Print the training frame's column summary again after the datetime and
# label-mapping steps.
train_df.info()

In [None]:
# Side-by-side bar charts of 'count' grouped by season (left) and by
# weather (right).
fig, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (18, 9))
sns.barplot(data = train_df, x = 'season', y = 'count', ax = ax1)
sns.barplot(data = train_df, x = 'weather', y = 'count', ax = ax2)

In [None]:
# Columns to hold as pandas 'category' dtype in both frames.
cat_features = ['season', 'weather']
category_dtypes = {column: 'category' for column in cat_features}

train_df = train_df.astype(category_dtypes)
test_df = test_df.astype(category_dtypes)

In [None]:
# One-hot encode the categorical columns of the training frame.
train_df = pd.get_dummies(train_df)

# Encode the test frame, then force it onto exactly the training feature
# columns (everything except the 'count' target), in the same order.
# Encoding the two frames independently yields different columns whenever a
# category value occurs in only one of them, which would break predict();
# dummy columns missing from the test frame are filled with 0.
feature_columns = train_df.columns.drop('count')
test_df = pd.get_dummies(test_df).reindex(columns = feature_columns, fill_value = 0)

In [None]:
# Print the training frame's column summary after one-hot encoding.
train_df.info()

In [None]:
# Show the last rows of the encoded training frame.
train_df.tail()

In [None]:
# Annotated heatmap of the pairwise correlations between all training columns.
corr_matrix = train_df.corr()
fig, ax = plt.subplots(figsize = (20, 15))
sns.heatmap(corr_matrix, annot = True, ax = ax)

In [None]:
# Bar chart of 'count' for each hour of the day.
sns.catplot(
    data = train_df,
    x = 'hour',
    y = 'count',
    kind = 'bar',
    aspect = 3,
)

In [None]:
# Bar chart of 'count' for each calendar month.
sns.catplot(
    data = train_df,
    x = 'month',
    y = 'count',
    kind = 'bar',
    aspect = 2,
)

In [None]:
# Line plots of 'count' against temperature (left) and humidity (right).
fig, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (20, 10))
sns.lineplot(data = train_df, x = 'temp', y = 'count', ax = ax1)
sns.lineplot(data = train_df, x = 'humidity', y = 'count', ax = ax2)

In [None]:
# Target is the 'count' column; the features are every other column.
# train_df itself is left unchanged.
y = train_df['count']
X = train_df.drop(columns = 'count')

In [None]:
# Histogram of the target values.
y.hist()

In [None]:
# Train on log1p(count); predictions are mapped back with np.expm1 in the
# later cells. The transform is computed from train_df['count'] rather than
# from y itself, so re-running this cell cannot apply log1p a second time.
y = np.log1p(train_df['count'])

# Histogram of the transformed target.
y.hist()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the training rows for validation. Note that X_test / y_test
# are this hold-out split, not the competition test frame (test_df).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.25,
    random_state = 0,
)


In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split (fit() returns the estimator)
# and predict the hold-out rows; the predictions are on the log1p scale.
lr_reg = LinearRegression().fit(X_train, y_train)
pred = lr_reg.predict(X_test)

# Sorted predictions mapped back to the original count scale.
np.sort(np.expm1(pred))

In [None]:
def rmsle(y, pred):
    """Root mean squared logarithmic error between ``y`` and ``pred``.

    Both arguments are passed through ``np.log1p``; the result is the square
    root of the mean squared difference of those logs.
    """
    log_gap = np.log1p(y) - np.log1p(pred)
    return np.sqrt(np.mean(log_gap ** 2))

# y_test and pred are on the log1p scale, so both are mapped back with expm1
# before scoring (rmsle applies log1p itself). With the cells run in order,
# pred holds the linear regression hold-out predictions here.
rmsle(np.expm1(y_test), np.expm1(pred))

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest on the training split. random_state is fixed so the forest,
# and the validation score computed from it, are reproducible across runs;
# 0 matches the seed used for the train/validation split.
rf_reg = RandomForestRegressor(n_estimators = 1000, n_jobs = -1, verbose = 1, random_state = 0)
rf_reg.fit(X_train, y_train)

# Hold-out predictions (log1p scale), overwriting the previous model's pred.
pred = rf_reg.predict(X_test)

# Sorted predictions mapped back to the original count scale.
np.sort(np.expm1(pred))

In [None]:
# Score the current pred on the hold-out split; both inputs are mapped back
# from the log1p scale first. With the cells run in order, pred holds the
# random forest predictions here.
rmsle(np.expm1(y_test), np.expm1(pred))

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees (XGBoost) on the training split.
xgb_params = {'n_estimators': 1000}
xgb_reg = XGBRegressor(**xgb_params)
xgb_reg.fit(X_train, y_train)

# Hold-out predictions (log1p scale), overwriting the previous model's pred.
pred = xgb_reg.predict(X_test)

# Sorted predictions mapped back to the original count scale.
np.sort(np.expm1(pred))

In [None]:
# Score the current pred on the hold-out split; both inputs are mapped back
# from the log1p scale first. With the cells run in order, pred holds the
# XGBoost predictions here.
rmsle(np.expm1(y_test), np.expm1(pred))

In [None]:
from lightgbm import LGBMRegressor

# Gradient-boosted trees (LightGBM) on the training split.
lgbm_params = {'n_estimators': 1000, 'n_jobs': -1, 'verbose': 1}
lgbm_reg = LGBMRegressor(**lgbm_params)
lgbm_reg.fit(X_train, y_train)

# Hold-out predictions (log1p scale), overwriting the previous model's pred.
pred = lgbm_reg.predict(X_test)

# Sorted predictions mapped back to the original count scale.
np.sort(np.expm1(pred))

In [None]:
# Score the current pred on the hold-out split; both inputs are mapped back
# from the log1p scale first. With the cells run in order, pred holds the
# LightGBM predictions here.
rmsle(np.expm1(y_test), np.expm1(pred))

In [None]:
# Refit the LightGBM estimator on every training row (this replaces the model
# fitted on the training split), then predict the competition test frame.
lgbm_reg.fit(X, y)
log_prediction = lgbm_reg.predict(test_df)

# The model was trained on log1p(count); map back to the count scale.
prediction = np.expm1(log_prediction)

In [None]:
# Show the test-set predictions in ascending order.
np.sort(prediction)

In [None]:
# Build the submission frame: one row per test timestamp with its predicted
# count. Negative predictions are floored at 0 with a vectorised np.maximum
# rather than a Python-level loop over the array.
submission = pd.DataFrame({'datetime':test_df_datetime, 'count':np.maximum(prediction, 0)})

In [None]:
# Show the first rows of the submission frame before rounding.
submission.head()

In [None]:
# Round the predicted counts to the nearest whole number and store them as
# integers.
rounded_count = submission['count'].round()
submission['count'] = rounded_count.astype('int')

In [None]:
# Show the first rows of the submission frame after rounding to integers.
submission.head()

In [None]:
# Write the submission to a CSV file in the working directory, without the
# row index.
submission.to_csv(
    path_or_buf = 'bike_shareing_demand_submission.csv',
    index = False,
)