In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Recursively list every file available under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, filename) for filename in filenames):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Forecast use of a city bikeshare system
Using these systems, people are able to rent a bike from one location and return it to a different place on an as-needed basis. Currently, there are over 500 bike-sharing programs around the world.

## Feature engineering case
### A SHORT DESCRIPTION OF THE FEATURES.
datetime - hourly date + timestamp

season - 1 = spring, 2 = summer, 3 = autumn(fall), 4 = winter
- Autumn(It is often called fall in the United States because leaves fall from the trees at that time.)

holiday - whether the day is considered a holiday

workingday - whether the day is neither a weekend nor holiday

weather -
- 1 : Clear, Few clouds, Partly cloudy, Partly cloudy
- 2 : Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
- 3 : Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
- 4 : Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog
    
temp - temperature in Celsius

atemp - "feels like" temperature in Celsius

humidity - relative humidity

windspeed - wind speed

casual - number of non-registered user rentals initiated

registered - number of registered user rentals initiated

count - number of total rentals

**HERE ALL THE VARIABLES OR FEATURES ARE NUMERIC, AND THE TARGET VARIABLE THAT WE HAVE TO PREDICT IS THE count VARIABLE. HENCE THIS IS A TYPICAL EXAMPLE OF A REGRESSION PROBLEM, AS THE count VARIABLE VARIES CONTINUOUSLY.**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the competition's training and test CSV files from the Kaggle input directory.
train = pd.read_csv('../input/bike-sharing-demand/train.csv')
test = pd.read_csv('../input/bike-sharing-demand/test.csv')

In [None]:
# Column dtypes and non-null counts for the training set.
train.info()

In [None]:
# Column dtypes and non-null counts for the test set.
test.info()

### Combine training set and test set

In [None]:
# Stack train and test into one frame so the same feature engineering is applied to both.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent, and
# ignore_index=True yields the fresh 0..n-1 index that reset_index().drop('index') produced.
# Columns that exist in only one of the two frames are filled with NaN for the other's rows.
tt = pd.concat([train, test], ignore_index=True)
tt.head()

### Training set analysis

In [None]:
# Distribution of the target variable.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is the replacement recommended by seaborn for the same figure.
sns.histplot(train['count'], kde=True, stat='density')

In [None]:
# Time processing: parse the `datetime` column once and derive four calendar
# features from it (year, date, hour, day of week).
# The parsed index is not called `temp`, to avoid confusion with the temperature column.
train_stamps = pd.DatetimeIndex(train['datetime'])
train['year'] = train_stamps.year
train['date'] = train_stamps.date
train['hour'] = train_stamps.hour
# Categorical variable for the day of the week, read straight from the parsed
# timestamps instead of re-parsing the derived `date` objects.
train['dayofweek'] = train_stamps.dayofweek

In [None]:
# The impact of each hour of the day on count.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.boxplot(x='hour', y='count', data=train)

In [None]:
# The influence of the day of the week on count.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.boxplot(x='dayofweek', y='count', data=train)

**THE EFFECT IS NOT OBVIOUS, CHANGE THE WAY OF PRESENTATION.**

In [None]:
# Hourly count profile, drawn as one line per day of the week
sns.pointplot(x='hour', y='count', hue='dayofweek', data=train)

In [None]:
# The impact of different months on count.
train['month'] = pd.to_datetime(train['datetime']).dt.month
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.boxplot(x='month', y='count', data=train)

In [None]:
# The impact of working vs. non-working days on the hourly count
# (the hue is the `workingday` column, not `holiday`)
sns.pointplot(x='hour', y='count',hue='workingday', data=train)

**IT CAN BE SEEN THAT THE RELATIONSHIP BETWEEN WORKINGDAY AND COUNT IS VERY SIMILAR TO THE RELATIONSHIP BETWEEN DAYOFWEEK AND COUNT, AND ONE OF THE TWO CAN BE DELETED.**

In [None]:
# The impact of weather on count: hourly profile, one line per weather category
sns.pointplot(x='hour', y='count', hue='weather', data=train)

In [None]:
# The influence of season on count: hourly profile, one line per season
sns.pointplot(x='hour', y='count', hue='season', data=train)

## For continuous variables

In [None]:
# Pairwise Pearson correlation of the numeric columns, shown as an annotated heatmap
numeric_cols = ['temp', 'atemp', 'casual', 'registered', 'humidity', 'windspeed', 'count']
cor = train.loc[:, numeric_cols].corr()
sns.heatmap(cor, annot=True, square=True)

### From the above:
- The temp and atemp variables are highly linearly related, choose one of them when modeling;
- casual and registered sum to count, so keeping them would give away the target; delete both
- The humidity and windspeed variables are not highly correlated with count, so delete them directly
- Therefore, the variables to be modeled are hour, year, workingday, holiday, season, weather, atemp, count

### Feature processing on merged data

In [None]:
# Derive year and hour for the combined frame, then keep only the modelling columns.
tt_stamps = pd.DatetimeIndex(tt['datetime'])
model_cols = ['hour', 'year', 'workingday', 'holiday', 'season', 'weather', 'atemp', 'count']
tt = tt.assign(year=tt_stamps.year, hour=tt_stamps.hour)[model_cols]
# One-hot coding for discrete variables, such as color red, yellow, and blue coding as [[1,0,0], [0,1,0], [0,0,1]].
# A single call encodes all four columns; the default prefix is the column name, and
# drop_first=True drops the first level of each variable.
tt = pd.get_dummies(tt, columns=['hour', 'year', 'season', 'weather'], drop_first=True)
tt.head()

### Modeling Forecast

In [None]:
# Split the processed frame back into its training rows (first) and test rows (after).
# The boundary comes from len(train) rather than a hard-coded 10886, so the cell
# stays correct if the input data changes.
n_train = len(train)
train_part = tt.iloc[:n_train]
# Target: log(count + 1)
y = np.log1p(train_part['count'])
# drop() returns new frames here; the previous in-place drop on an iloc slice of `tt`
# triggered pandas' SettingWithCopyWarning.
new_train = train_part.drop(columns='count')
new_test = tt.iloc[n_train:].drop(columns='count')
x = new_train
x.head()

In [None]:
# Inspect the log1p-transformed target
y

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=3,
)

### Try multiple linear regression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit on all training rows, then report the mean 5-fold cross-validation score.
# NOTE(review): no `scoring` is passed, so this uses the estimator's default scorer,
# while the hold-out cell reports MSE — confirm the two numbers are meant to be compared.
lmodel = LinearRegression().fit(x, y)
lin_cv_scores = cross_val_score(lmodel, x, y, cv=5)
lin_cv_scores.mean()

In [None]:
# Refit on the training split only and measure the mean squared error on the hold-out rows.
pre = lmodel.fit(x_train, y_train).predict(x_test)
mean_squared_error(y_test, pre)

### Try random forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Base random forest used as the estimator for the grid search; seeded for reproducibility.
rfr = RandomForestRegressor(
    max_features='sqrt',
    oob_score=True,
    random_state=50,
)

In [None]:
# Parameter tuning - This step requires a lot of calculation:
# 41 candidate values of n_estimators (200..240) x 5 folds = 205 forest fits.
para = {'n_estimators': np.arange(200, 241)}
rf = GridSearchCV(rfr, para, cv=5)
rf.fit(x, y)

In [None]:
# Best n_estimators found by the grid search
rf.best_params_

In [None]:
# NOTE(review): n_estimators=227 is hard-coded — presumably the value reported by the
# grid search; verify against rf.best_params_ if the data or the grid changes.
rfr = RandomForestRegressor(
    n_estimators=227,
    max_features='sqrt',
    oob_score=True,
    random_state=50,
)
rfr_cv_scores = cross_val_score(rfr, x, y, cv=5)
rfr_cv_scores.mean()

In [None]:
# Fit on the training split only and measure the mean squared error on the hold-out rows.
pre = rfr.fit(x_train, y_train).predict(x_test)
mean_squared_error(y_test, pre)

**FROM THE RESULTS, RANDOM FOREST IS BETTER THAN MULTIPLE LINEAR REGRESSION.**

In [None]:
# Refit the random forest on all training rows before predicting the test set
rfr.fit(x,y)

In [None]:
# Predictions are on the log1p scale used for the target.
co = rfr.predict(new_test)
# Invert the transform with expm1 (the inverse of log1p, more accurate than exp(x) - 1
# for small values) and round to the nearest whole number of rentals. The whole array
# is processed at once instead of element by element in a Python loop.
count_pred = np.rint(np.expm1(co)).astype(int)
predict = pd.DataFrame({'datetime': test['datetime'], 'count': count_pred})
predict.to_csv('rfr.csv', index=False)
