In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found anywhere under the Kaggle
# input directory, then print the paths one per line.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
# Ignore every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/seaborn.
warnings.simplefilter('ignore')

# Order matters: the matplotlib style sheet is applied first and the seaborn
# axes style is then layered on top of it.
plt.style.use('fivethirtyeight')
sns.set_style('darkgrid')

In [None]:
from sklearn.linear_model import LinearRegression
import xgboost
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Read the train and test CSV files, in that order, into two DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/bike-sharing-system-washington-dc/train_bikes.csv",
        "/kaggle/input/bike-sharing-system-washington-dc/test_bikes.csv",
    )
)

In [None]:
# First rows of each split, train first and then test.
display(train.head(), test.head())

In [None]:
# (rows, columns) of each split, train first and then test.
display(train.shape, test.shape)

In [None]:
# DataFrame.info() writes its report directly to stdout and returns None, so
# it is called on its own: wrapping it in display() only added a stray "None"
# after each report.
train.info()
test.info()

In [None]:
# Summary statistics of the training data, using pandas' default `describe` settings.
train.describe()

In [None]:
# Pairwise correlation of the numeric training columns, drawn as an annotated heatmap.
plt.figure(figsize=(12,8))
# At this point `datetime` has not been parsed yet (that happens in a later
# cell), so the frame still holds a non-numeric column. Recent pandas versions
# raise when corr() meets such a column instead of silently dropping it, so
# the numeric columns are selected explicitly first.
numeric_train = train.select_dtypes(include="number")
sns.heatmap(numeric_train.corr(), annot=True, cbar=False, linewidths=1)
plt.title("Correlation Matrix")
plt.show()

## season

In [None]:
# Number of distinct `season` values in each split (train, then test).
display(train["season"].nunique(), test["season"].nunique())

In [None]:
# Row count per `season` value in each split (train, then test).
display(train["season"].value_counts(), test["season"].value_counts())

In [None]:
# Bar chart of how many training rows fall into each `season` value, with the
# exact count written above every bar.
plt.figure(figsize=(8,6))
# countplot returns the Axes it drew on, so no separate plt.gca() lookup is needed
ax = sns.countplot(x='season', data=train)

# Size the y-axis from the tallest bar (plus headroom for the text labels).
# Previously y_max was computed but never used and the limit was a fixed 3500,
# which would clip the bars or labels on data with larger counts.
y_max = train['season'].value_counts().max()
ax.set_ylim(0, y_max * 1.15)

# Each patch is one bar: centre the label horizontally and sit it on top of the bar
for p in ax.patches:
    ax.text(p.get_x() + p.get_width()/2., p.get_height(), '%d' % int(p.get_height()),
            fontsize=12, color='black', ha='center', va='bottom')

plt.show()

## holiday

In [None]:
# Number of distinct `holiday` values in each split (train, then test).
display(train["holiday"].nunique(), test["holiday"].nunique())

In [None]:
# Row count per `holiday` value in each split (train, then test).
display(train["holiday"].value_counts(), test["holiday"].value_counts())

## workingday

In [None]:
# Number of distinct `workingday` values in each split (train, then test).
display(train["workingday"].nunique(), test["workingday"].nunique())

In [None]:
# Row count per `workingday` value in each split (train, then test).
display(train["workingday"].value_counts(), test["workingday"].value_counts())

## weather

In [None]:
# Number of distinct `weather` values in each split (train, then test).
display(train["weather"].nunique(), test["weather"].nunique())

In [None]:
# Row count per `weather` value in each split (train, then test).
display(train["weather"].value_counts(), test["weather"].value_counts())

In [None]:
# Parse the `datetime` column into timestamps and promote it to the index of
# the training frame. The column is consumed by set_index, so this cell can
# only be run once per freshly loaded `train`.
parsed_train_times = pd.to_datetime(train["datetime"])
train = train.assign(datetime=parsed_train_times).set_index("datetime")

train.head()

In [None]:
# Parse the `datetime` column into timestamps and promote it to the index of
# the test frame. The column is consumed by set_index, so this cell can only
# be run once per freshly loaded `test`.
parsed_test_times = pd.to_datetime(test["datetime"])
test = test.assign(datetime=parsed_test_times).set_index("datetime")

test.head()

In [None]:
# Derive calendar features from the datetime index of the training frame.
# Maps each new column name to the index attribute it is read from; the
# columns are added in this order.
train_calendar_fields = {
    "hour": "hour",
    "day_of_month": "day",
    "day_of_week": "dayofweek",
    "month": "month",
}
for new_column, index_attr in train_calendar_fields.items():
    train[new_column] = getattr(train.index, index_attr)
train.head()

In [None]:
# Derive calendar features from the datetime index of the test frame.
# Maps each new column name to the index attribute it is read from; the
# columns are added in this order.
test_calendar_fields = {
    "hour": "hour",
    "day_of_month": "day",
    "day_of_week": "dayofweek",
    "month": "month",
}
for new_column, index_attr in test_calendar_fields.items():
    test[new_column] = getattr(test.index, index_attr)
test.head()

### 1.1 Amount of bike shares per month

In [None]:
# Line plot of the `count` column against the datetime index of the training frame.
plt.figure(figsize=(15, 7))
# `train.count` resolves to the DataFrame.count *method*, not the "count"
# column, because attribute access prefers methods over same-named columns.
# The column is therefore selected by its name.
ax = sns.lineplot(x=train.index, y="count", data=train)
ax.set_title("Amount of bike shares vs date", fontsize=25)
ax.set_xlabel("Date", fontsize=20)
ax.set_ylabel('Amount of bike shares', fontsize=20)
plt.show()

In [None]:
# Group the datetime-indexed rows into calendar-month bins ("M" frequency)
# and add up every column within each bin, for the monthly plot.
monthly_bins = train.resample("M")
train_by_month = monthly_bins.sum()
train_by_month.head()

In [None]:
# Line plot of the monthly `count` totals against the month-bin index.
plt.figure(figsize=(16,6))
# `train_by_month.count` resolves to the DataFrame.count *method*, not the
# "count" column, so the column is selected by its name instead.
ax = sns.lineplot(data=train_by_month, x=train_by_month.index, y="count")
ax.set_title("Amount of bike shares per month", fontsize=25)
ax.set_xlabel("Month", fontsize=20)
ax.set_ylabel('Amount of bike shares', fontsize=20)
plt.show()

## Amount of bike shares related to the windspeed

In [None]:
# Point plot of `count` for each distinct `windspeed` value in the training frame.
fig, ax = plt.subplots(figsize=(20, 10))
sns.pointplot(x='windspeed', y='count', data=train, ax=ax)

ax.set_title("Amount of bike shares vs windspeed", fontsize=25)
ax.set_xlabel("Windspeed (km/h)", fontsize=20)
ax.set_ylabel('Amount of bike shares', fontsize=20)

# Ask the x-axis locator for roughly ten ticks
ax.locator_params(axis='x', nbins=10)
plt.show()