In [None]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', None)

You are provided hourly rental data spanning two years. For this competition, the training set comprises the first 19 days of each month, while the test set covers the 20th to the end of the month.
You must predict the total count of bikes rented during each hour covered by the test set, using only information available prior to the rental period.

### Read data

In [None]:
# Load the competition files; the target column 'count' is renamed to
# 'total number' as part of loading the training frame.
trdf = (
    pd.read_csv('../input/bike-sharing-demand/train.csv')
    .rename(columns={'count': 'total number'})
)
tedf = pd.read_csv('../input/bike-sharing-demand/test.csv')

In [None]:
# Convert the 'datetime' column of both frames to real timestamps so that the
# .dt accessor can be used on it later on.
for frame in (trdf, tedf):
    frame['datetime'] = pd.to_datetime(frame['datetime'])

# Column overview of the training frame, a separator line, then the test frame.
trdf.info()
print('-----------------------------------------------------------')
tedf.info()

### import some important modules.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
trdf['total number'].describe()

In [None]:
trdf['registered'].describe()

In [None]:
trdf['casual'].describe()

In [None]:
trdf.head()

### Numeric codes should not be used for categorical variables, especially nominal ones, because the codes have no interval or ratio meaning.

In [None]:
# Work on a copy so the raw training frame (trdf) stays untouched.
df = trdf.copy()

# Human-readable labels for the integer-coded categorical columns.
weather = ['Clear', 'Little', 'Light_S_R', 'Heavy_S_R']
category_labels = {
    'season': {1: 'Spring', 2: 'Summer', 3: 'Fall', 4: 'Winter'},
    'holiday': {0: 'NotHoliday', 1: 'Holiday'},
    'workingday': {0: 'NotWorkingD', 1: 'WorkingD'},
    'weather': dict(enumerate(weather, start=1)),
}

# Assign the relabelled column back to the frame. The previous form,
# df[col].replace(..., inplace=True), mutates a temporary Series: it raises a
# FutureWarning on recent pandas and leaves df unchanged under Copy-on-Write.
# Codes that are not listed in a mapping are kept as they are.
for column, labels in category_labels.items():
    df[column] = df[column].replace(labels)

print('Converting numberic value of categorical column to string, this action need for dummying process.')
df.head()

### Separate year, month, day and hour so we can use them in the model.

In [None]:
# Append year / month / day / hour columns taken from the timestamp, then
# discard the timestamp column itself.
df = (
    df.assign(**{part: getattr(df['datetime'].dt, part)
                 for part in ('year', 'month', 'day', 'hour')})
      .drop(columns='datetime')
)
df.head()

### Dummy-encode the categorical variables; this is required before feeding them to the model.

drop_first=True -> Drops the first level of each categorical variable, because it is redundant.\
For example, one of the holiday_Holiday / holiday_NotHoliday columns fully describes the holiday status, so the other one is not needed.

In [None]:
#Get k-1 dummies out of k categorical levels by removing the first level.
df_d = pd.get_dummies(df,drop_first=True)
df_d.head()

In the previous section (EDA) we saw that temp and atemp are very highly correlated.
I will not delete atemp, but I will test its effect in the linear regression.

In [None]:
# Remove the casual and registered columns from the feature frame.
# NOTE(review): presumably dropped because they are components of the target
# and are not known at prediction time — confirm against the data description.
df_d.drop(['casual','registered'],axis=1,inplace=True)
df_d.info()

We have a count variable (total number) as the response variable.\
For a count variable it is suitable to use Poisson regression, or another common regression model such as RandomForestRegressor, which also offers a Poisson splitting criterion.\
First I use linear regression, then a random forest regressor, and after that I will use Poisson regression too.

### Linear regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

### spliting and shuffling the data

In [None]:
# Pull the target column out of the feature frame (pop also removes it from
# df_d in place, so df_d holds only features afterwards).
totalNumber = df_d.pop('total number')

# Shuffled 75/25 hold-out split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    df_d,
    totalNumber,
    test_size=0.25,
    random_state=42,
)

# Report the resulting shapes.
print('df_d.shape:', df_d.shape)
print('x_train.shape:', x_train.shape)
print('x_test.shape:', x_test.shape)
print('y_train.shape:', y_train.shape)
print('y_test.shape:', y_test.shape)
df_d.head()

In [None]:
x_test.head()

In [None]:
x_train.head()

In [None]:
totalNumber[:10]

### Scaling data
I use two scaling methods: normalization and standardization.

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [None]:
# Names of the columns whose dtype is int64 or float64; only these are rescaled.
# NOTE(review): whether the dummy columns are excluded here depends on the
# dtype pd.get_dummies produced for them — check the displayed index.
numerical_ix = x_train.select_dtypes(include=['int64', 'float64']).columns
# Min-max scaling / standardization of the selected columns; every other column
# is passed through unchanged (remainder='passthrough').
ct_norm   = ColumnTransformer(transformers=[('minmax', MinMaxScaler(), numerical_ix)], remainder='passthrough')
ct_standr = ColumnTransformer(transformers=[('standr', StandardScaler(), numerical_ix)], remainder='passthrough')
numerical_ix

### Linear regression with and without scaling

In [None]:
#create model
from sklearn.pipeline import Pipeline

# Every pipeline gets its own LinearRegression instance. Pipeline keeps a
# reference to the estimator objects it is given (it does not copy them), so a
# single shared instance would be refitted by whichever pipeline is trained
# last and the earlier fitted pipeline would then predict with the wrong
# coefficients.
MLregModel = LinearRegression()
pipeline_norm = Pipeline(steps=[('ct_norm', ct_norm), ('model', MLregModel)])
pipeline_standr = Pipeline(steps=[('ct_standr', ct_standr), ('model', LinearRegression())])

### creating a function for fitting and evaluating the models.

In [None]:
def fit_eval_model(model, x_train, y_train, x_test, y_test, metricList):
    """Fit ``model`` on the training split and score it on both splits.

    Parameters
    ----------
    model : object with ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    x_train, y_train : training features and targets.
    x_test, y_test : hold-out features and targets.
    metricList : iterable of ``(name, metric)`` pairs; every metric is called
        as ``metric(y_true, y_pred)``.

    Returns
    -------
    tuple
        ``(scores, train_predictions, test_predictions, model)``. ``scores`` is
        a DataFrame holding, per metric, a ``<name>_train`` and a
        ``<name>_test`` column with the value rounded to 3 decimals.
    """
    model.fit(x_train, y_train)
    train_pred = model.predict(x_train)
    test_pred = model.predict(x_test)

    # Collect one single-element column per (metric, split) combination.
    scores = {}
    for label, score_fn in metricList:
        scores[label + '_train'] = [round(score_fn(y_train, train_pred), 3)]
        scores[label + '_test'] = [round(score_fn(y_test, test_pred), 3)]

    return pd.DataFrame(scores), train_pred, test_pred, model

### Fitting model(Linear regression) and test it without scaling.

In [None]:
# Metrics as (name, callable) pairs; fit_eval_model reports each of them for
# the train and for the test split.
metricList = [('r2_score',r2_score),('MSE',mean_squared_error)]
# Baseline: plain linear regression on the unscaled features.
# The result is a tuple: [0] metric table, [1] train predictions,
# [2] test predictions, [3] the fitted model.
LRNoScaling_result = fit_eval_model(LinearRegression(),x_train, y_train, x_test, y_test, metricList)
LRNoScaling_result[0]

In [None]:
# One figure per split: observed counts (x axis) against the predictions of
# the unscaled linear regression (y axis).
for split, observed, predicted in (
    ('train', y_train, LRNoScaling_result[1]),
    ('test', y_test, LRNoScaling_result[2]),
):
    fig, ax = plt.subplots(figsize=(11, 6))
    ax.scatter(observed, predicted)
    ax.set(
        title=f'Scatter plot between y_{split} and y_{split}_pred',
        xlabel=f'total number in {split} section',
        ylabel='total number prediction',
    )



The LR model without scaling made awful predictions.

### Fitting model(Linear regression) and test it with scaling (MinMaxScaler).

In [None]:

LRScaled_norm_result= fit_eval_model(pipeline_norm,x_train, y_train, x_test, y_test, metricList)
LRScaled_norm_result[0]

In [None]:

LRScaled_standr_result = fit_eval_model(pipeline_standr,x_train, y_train, x_test, y_test, metricList)
LRScaled_standr_result[0]

Unfortunately, scaling makes no difference: the models above perform the same.

### Use a 'Polynomial transformation' to create a more complex LR model.

In [None]:
# Degree-3 polynomial expansion of the (unscaled) features followed by an
# ordinary linear regression.
pipeline_poly = Pipeline(steps=[('poly',PolynomialFeatures(degree=3  )),('model',LinearRegression() )])

# Result tuple: [0] metric table, [1] train predictions, [2] test predictions, [3] fitted pipeline.
PolyLR_result = fit_eval_model(pipeline_poly,x_train, y_train, x_test, y_test, metricList)
PolyLR_result[0]

In [None]:
# One figure per split: observed counts (x axis) against the predictions of
# the polynomial linear regression (y axis).
for split, observed, predicted in (
    ('train', y_train, PolyLR_result[1]),
    ('test', y_test, PolyLR_result[2]),
):
    fig, ax = plt.subplots(figsize=(11, 6))
    ax.scatter(observed, predicted)
    ax.set(
        title=f'Scatter plot between y_{split} and y_{split}_pred',
        xlabel=f'total number in {split} section',
        ylabel='total number prediction',
    )

The more complex model built with the polynomial transformation (degree=3) helps to fit the data better,
but the scatter plot shows that some predictions are negative, which is not valid for a count.

### I will combine Ridge or Lasso regression with polynomial features to bring the test and train MSE closer together.

In [None]:
from sklearn.linear_model import Ridge,Lasso

In [None]:
# Standardize the numeric columns -> degree-3 polynomial expansion -> Ridge (L2 penalty, alpha=100).
pipeline_polyRidge = Pipeline(steps=[('ct_standr',ct_standr),('poly',PolynomialFeatures(degree=3  )),('model',Ridge(alpha=100) )])
# Result tuple: [0] metric table, [1] train predictions, [2] test predictions, [3] fitted pipeline.
polyRidgeLR_result = fit_eval_model(pipeline_polyRidge,x_train, y_train, x_test, y_test, metricList)
polyRidgeLR_result[0]

In [None]:
# Standardize the numeric columns -> degree-3 polynomial expansion -> Lasso (L1 penalty, alpha=0.001).
pipeline_polyLasso = Pipeline(steps=[('ct_standr',ct_standr),('poly',PolynomialFeatures(degree=3  )),('model',Lasso(alpha=0.001) )])
# Fit and score the Lasso pipeline itself. The Ridge pipeline used to be passed
# here by mistake, so the Lasso model was never trained and the reported
# numbers were simply the Ridge results again.
polyLassoLR_result = fit_eval_model(pipeline_polyLasso,x_train, y_train, x_test, y_test, metricList)
polyLassoLR_result[0]

### RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest with default size and the squared-error split criterion; seed fixed for reproducibility.
# Result tuple: [0] metric table, [1] train predictions, [2] test predictions, [3] fitted forest.
randomForestR1_result = fit_eval_model(RandomForestRegressor(random_state=0,criterion='squared_error'),
                                       x_train, y_train, x_test, y_test, metricList)
randomForestR1_result[0]

In [None]:
# Same forest (100 trees, same seed) but with the Poisson split criterion for comparison.
randomForestR2_result = fit_eval_model(RandomForestRegressor(n_estimators=100,random_state=0,criterion='poisson')
                                       ,x_train, y_train, x_test, y_test, metricList)
randomForestR2_result[0]

In [None]:
# One figure per split: observed counts (x axis) against the predictions of
# the first random forest (y axis).
for split, observed, predicted in (
    ('train', y_train, randomForestR1_result[1]),
    ('test', y_test, randomForestR1_result[2]),
):
    fig, ax = plt.subplots(figsize=(11, 6))
    ax.scatter(observed, predicted)
    ax.set(
        title=f'Scatter plot between y_{split} and y_{split}_pred',
        xlabel=f'total number in {split} section',
        ylabel='total number prediction',
    )

In [None]:
from scipy.stats import spearmanr
spearmanr(randomForestR1_result[1],y_train)

In [None]:
spearmanr(randomForestR1_result[2],y_test)

These two RandomForestRegressor models are better than the previous regressors, but pay attention to the 'MSE': the gap between train and test is a sign of over-fitting.\
I should tune some hyperparameters so that the test and train "MSE" come closer together, i.e. so that the trained model generalizes better to the test data.\
I will use RandomizedSearchCV. Due to hardware limitations, I keep the hyperparameter ranges small.

In [None]:
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Repeated 5-fold cross-validation (3 repeats) with a fixed seed.
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)
model = RandomForestRegressor()

# Candidate values the randomized search samples from.
params = {
    'n_estimators': [2, 5, 10, 30, 100, 200],
    'max_depth': [1, 2, 5, 10, 20, None],
    'min_samples_split': [2, 5, 10, 20, 50, 100],
    'min_samples_leaf': [1, 2, 5, 10, 20, 50, 100],
    'max_features': [1, 2, 5, 10, 12, 14, 16],
    'criterion': ['squared_error', 'poisson'],
}

# Metrics recorded for every sampled candidate.
scoring = [
    'r2',
    'neg_mean_squared_error',
    'neg_mean_absolute_error',
    'neg_mean_poisson_deviance',
]

In [None]:
# Randomized search over `params`: 200 sampled combinations, every metric in
# `scoring` is recorded (train scores included), and the candidate with the best
# negative MSE is refitted. error_score='raise' makes a failing fit raise
# instead of being assigned a placeholder score.
search = RandomizedSearchCV(model, params, n_iter=200, scoring=scoring, n_jobs=-1, cv=cv,
                            random_state=1,return_train_score=True,refit='neg_mean_squared_error',error_score='raise')

In [None]:
# NOTE(review): the search is fitted on the complete data set (df_d / totalNumber),
# which also contains the rows of x_test. Any later evaluation of `result` on
# x_test / y_test is therefore not a true hold-out estimate.
result = search.fit(df_d, totalNumber)

In [None]:
result.scorer_

In [None]:
# The search refits on 'neg_mean_squared_error', so the best score printed here
# is a negated MSE (closer to zero is better).
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

In [None]:
# One figure per split: observed counts (x axis) against the predictions of
# the refitted best model from the randomized search (y axis).
for split, observed, predicted in (
    ('train', y_train, result.predict(x_train)),
    ('test', y_test, result.predict(x_test)),
):
    fig, ax = plt.subplots(figsize=(11, 6))
    ax.scatter(observed, predicted)
    ax.set(
        title=f'Scatter plot between y_{split} and y_{split}_pred',
        xlabel=f'total number in {split} section',
        ylabel='total number prediction',
    )

In [None]:
cv_resultsDf_RF=pd.DataFrame(result.cv_results_)
print(cv_resultsDf_RF.shape)
cv_resultsDf_RF.head()

In [None]:
cv_resultsDf_RF.columns

In [None]:
filter_col = [col for col in cv_resultsDf_RF.columns if col.startswith('mean')]
filter_col

In [None]:
# Keep only the mean train/test score columns. Selecting them by prefix
# (instead of slicing off the first two entries of filter_col) does not rely on
# the timing columns happening to come first in cv_results_.
scoreDF_RF = cv_resultsDf_RF[
    [col for col in filter_col if col.startswith(('mean_train_', 'mean_test_'))]
]
scoreDF_RF

In [None]:
scoreDF_RF[scoreDF_RF.mean_test_neg_mean_squared_error==scoreDF_RF.mean_test_neg_mean_squared_error.max()]

Sounds like a good result.\
It should be noted that there are other models, but we will suffice with just a few models and assign the last model to the Poisson model.

### Poisson regression
For this goal there are many packages and classes that support this type of regression.\
In addition, we can use common regressors like linear regression (depending on what our evaluation criteria are).\
The following packages and classes also have Poisson criteria.\
- sklearn -> RandomForestRegressor
- sklearn -> HistGradientBoostingRegressor
- statsmodels -> statsmodels.api.GLM  (Generalized Linear Models)
- xgboost -> XGBRegressor 

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_poisson_deviance


In [None]:
# From here on the mean Poisson deviance (MPD) is reported in addition to R2 and MSE.
metricList = [('r2_score',r2_score),('MSE',mean_squared_error),('MPD',mean_poisson_deviance)]

# Min-max scaling -> degree-1 polynomial features -> Poisson regression.
poissonModel=linear_model.PoissonRegressor(max_iter=100)
pipeline_polyPoissonR = Pipeline(steps=[('ct_norm',ct_norm),('poly',PolynomialFeatures(degree=1  )),('model',poissonModel )])

# Result tuple: [0] metric table, [1] train predictions, [2] test predictions, [3] fitted pipeline.
polyPoissonR_result = fit_eval_model(pipeline_polyPoissonR,x_train, y_train, x_test, y_test, metricList)
polyPoissonR_result[0]

In [None]:
# Same Poisson pipeline with degree-2 polynomial features and a higher iteration limit.
# The names poissonModel / pipeline_polyPoissonR / polyPoissonR_result are reused, so the previous run is overwritten.
poissonModel=linear_model.PoissonRegressor(max_iter=300)
pipeline_polyPoissonR = Pipeline(steps=[('ct_norm',ct_norm),('poly',PolynomialFeatures(degree=2  )),('model',poissonModel )])

polyPoissonR_result = fit_eval_model(pipeline_polyPoissonR,x_train, y_train, x_test, y_test, metricList)
polyPoissonR_result[0]

In [None]:
# Same Poisson pipeline with degree-3 polynomial features and a higher iteration limit.
# The names poissonModel / pipeline_polyPoissonR / polyPoissonR_result are reused, so the previous run is overwritten.
poissonModel=linear_model.PoissonRegressor(max_iter=400)
pipeline_polyPoissonR = Pipeline(steps=[('ct_norm',ct_norm),('poly',PolynomialFeatures(degree=3  )),('model',poissonModel )])

polyPoissonR_result = fit_eval_model(pipeline_polyPoissonR,x_train, y_train, x_test, y_test, metricList)
polyPoissonR_result[0]

Well, that seems to be enough; we stay with the random forest model found by the hyperparameter search.\
Now we pre-process the Kaggle test data and make the prediction.

In [None]:
tedf.head()

### Read sampleSubmission.csv and use it to save the results.

In [None]:
resultForSub = pd.read_csv('../input/bike-sharing-demand/sampleSubmission.csv')
print('resultForSub.shape: ',resultForSub.shape)
print('test data shape: ',tedf.shape)
resultForSub.head()

### Convert categorical number to string

In [None]:
# Work on a copy so the raw Kaggle test frame (tedf) stays untouched.
df = tedf.copy()

# Human-readable labels for the integer-coded categorical columns.
weather = ['Clear', 'Little', 'Light_S_R', 'Heavy_S_R']
category_labels = {
    'season': {1: 'Spring', 2: 'Summer', 3: 'Fall', 4: 'Winter'},
    'holiday': {0: 'NotHoliday', 1: 'Holiday'},
    'workingday': {0: 'NotWorkingD', 1: 'WorkingD'},
    'weather': dict(enumerate(weather, start=1)),
}

# Assign the relabelled column back to the frame. The previous form,
# df[col].replace(..., inplace=True), mutates a temporary Series: it raises a
# FutureWarning on recent pandas and leaves df unchanged under Copy-on-Write.
# Codes that are not listed in a mapping are kept as they are.
for column, labels in category_labels.items():
    df[column] = df[column].replace(labels)

print('Converting numberic value of categorical column to string, this action need for dummying process.')
df.head()

### Separate year, month, day and hour so we can use them in the model.

In [None]:
# Append year / month / day / hour columns taken from the timestamp, then
# discard the timestamp column itself.
df = (
    df.assign(**{part: getattr(df['datetime'].dt, part)
                 for part in ('year', 'month', 'day', 'hour')})
      .drop(columns='datetime')
)
df.head()

### Dummy-encode the categorical variables; this is required before feeding them to the model.

drop_first=True -> Drops the first level of each categorical variable, because it is redundant.\
For example, one of the holiday_Holiday / holiday_NotHoliday columns fully describes the holiday status, so the other one is not needed.

In [None]:
#Get k-1 dummies out of k categorical levels by removing the first level.
df_dte = pd.get_dummies(df,drop_first=True)
df_dte.head()

In [None]:
df_dte.info()

In [None]:
# Align the Kaggle test features with the columns the search was fitted on
# (df_d): same column order, and a dummy column that does not occur in the test
# frame (a category level absent from the test data) is added as all zeros.
countPrediction = result.predict(df_dte.reindex(columns=df_d.columns, fill_value=0))

In [None]:
countPrediction.shape

In [None]:
resultForSub['count'] = countPrediction
resultForSub.head()

In [None]:
resultForSub.set_index('datetime',inplace=True)
resultForSub.head()

In [None]:
file_name = 'PredictionTestData_result.csv'
resultForSub.to_csv(file_name)

I will also use the first RandomForest regressor that was fitted above; its result is stored in randomForestR1_result.

In [None]:
# randomForestR1_result[3] is the fitted forest returned by fit_eval_model.
# Align the Kaggle test features with the columns it was trained on (x_train):
# same column order, and a dummy column missing from the test frame is added as
# all zeros.
countPrediction2 = randomForestR1_result[3].predict(
    df_dte.reindex(columns=x_train.columns, fill_value=0)
)
countPrediction2.shape

In [None]:
# Reload the submission template, attach the predictions of the first random
# forest as the 'count' column and index the frame by timestamp.
resultForSub2 = (
    pd.read_csv('../input/bike-sharing-demand/sampleSubmission.csv')
    .assign(count=countPrediction2)
    .set_index('datetime')
)
resultForSub2.head()

In [None]:
file_name = 'PredictionTestData_result2.csv'
resultForSub2.to_csv(file_name)

Kaggle gave the first model a better score.

<img src="./scoreSubmission.png">