In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error

In [None]:
# Load the training set, using the parsed `datetime` column as the index so
# that time-based operations (rolling windows, .index.hour, ...) work later.
data = pd.read_csv(
    '/kaggle/input/bike-sharing-demand/train.csv',
    index_col='datetime',
    parse_dates=['datetime'],
)
data.head()

In [None]:
# Print column dtypes and non-null counts to spot missing values.
data.info()

In [None]:
# Summary statistics (count, mean, std, quartiles) for every numeric column.
data.describe()

In [None]:
# Relative change of the 1-day rolling mean of rentals over time.
rolling_daily_mean = data['count'].rolling('1D').mean()
rolling_daily_change = rolling_daily_mean.pct_change()

plt.figure(figsize=(20, 5))
rolling_daily_change.plot()
plt.show()

In [None]:
# One scatter panel per column, each plotted against the rental count.
sns.pairplot(
    data=data,
    x_vars=data.columns,
    y_vars=['count'],
)
plt.show()

In [None]:
# Mean rental count per level of each categorical feature, one panel per
# feature. `ax.flat` walks the 2x2 grid row by row, so the panel order is
# season, weather (top row) then workingday, holiday (bottom row).
fig, ax = plt.subplots(2, 2, figsize=(18, 9))
for panel, feature in zip(ax.flat, ['season', 'weather', 'workingday', 'holiday']):
    sns.barplot(x=feature, y='count', data=data, ax=panel)
# Render explicitly, like the other plotting cells, instead of echoing the
# repr of the last Axes object.
plt.show()

In [None]:
# Target is the total rental count.
y = data['count']

# Features: everything except the target and its two components
# ('casual' and 'registered'), which are dropped alongside it.
X = data.drop(columns=['count', 'casual', 'registered'])

In [None]:
# Pairwise correlation between the candidate features.
feature_corr = X.corr()

plt.figure(figsize=(10, 10))
sns.heatmap(feature_corr, cmap="YlGnBu", annot=True)
plt.show()

In [None]:
def drop_columns(X):
    """Return a copy of ``X`` without the ``atemp`` column.

    Raises ``KeyError`` if ``atemp`` is not present in ``X``.
    """
    unwanted = ['atemp']
    return X.drop(columns=unwanted)

def add_date_columns(X):
    """Return a copy of ``X`` with ``month``, ``year`` and ``hour`` columns.

    The values are taken from the frame's index, which must be a
    ``DatetimeIndex``. The input frame is left untouched: ``assign`` builds a
    new frame, so calling this on a slice of another frame neither mutates the
    caller's data nor triggers ``SettingWithCopyWarning``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame indexed by datetime.

    Returns
    -------
    pandas.DataFrame
        New frame with the three calendar columns appended in the order
        ``month``, ``year``, ``hour``.
    """
    return X.assign(
        month=X.index.month,
        year=X.index.year,
        hour=X.index.hour,
    )

def pipline(X):
    """Apply the feature-preparation steps to ``X`` and return the result.

    Steps, in order: ``drop_columns`` then ``add_date_columns``.
    """
    return add_date_columns(drop_columns(X))

In [None]:
# Hold out 20% of the rows for evaluation. The seed is fixed so the split
# (and therefore every score below) is reproducible across kernel restarts.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

# Keep the raw splits under their own names so this cell can be re-run:
# feeding an already-transformed frame back into `pipline` would fail because
# 'atemp' has already been dropped.
X_train = pipline(X_train_raw)
X_test = pipline(X_test_raw)

# Train on log1p(count); predictions are mapped back with expm1 when scoring.
y_train_normalized = np.log1p(y_train)

In [None]:
# Fixed seed: the forest's bootstrap sampling and feature sub-sampling are
# random, so without it the scores and feature importances change every run.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train_normalized)

# The model predicts log1p(count); invert with expm1 before scoring against
# the untransformed targets.
train_pred = np.expm1(model.predict(X_train))
test_pred = np.expm1(model.predict(X_test))
print("Train Score : ", mean_squared_log_error(y_train, train_pred))
print("Test Score : ", mean_squared_log_error(y_test, test_pred))

In [None]:
# Feature importances reported by the fitted forest, labelled by column name
# and sorted ascending so the largest bar is drawn at the top of the chart.
importances_rf = pd.Series(
    data=model.feature_importances_,
    index=X_train.columns,
)
sorted_importances_rf = importances_rf.sort_values()
sorted_importances_rf.plot.barh(color='lightgreen')
plt.show()

In [None]:
# Load the competition test set the same way as the training set, run it
# through the same feature pipeline, and predict (still on the log1p scale).
test_data = pd.read_csv(
    '/kaggle/input/bike-sharing-demand/test.csv',
    index_col='datetime',
    parse_dates=['datetime'],
)
X_test_data = pipline(test_data)
pred = model.predict(X_test_data)

In [None]:
# Start from the sample submission and overwrite its 'count' column with the
# predictions mapped back from the log1p scale.
submission = pd.read_csv(
    '/kaggle/input/bike-sharing-demand/sampleSubmission.csv',
    index_col='datetime',
    parse_dates=['datetime'],
)
predicted_counts = np.expm1(pred)
submission['count'] = predicted_counts

In [None]:
# Write the submission file (the datetime index is written as the first
# column) and preview the first rows.
submission.to_csv('my_submission.csv')
submission.head()