# Notebook about bike sharing dataset

## Task Description:
### make an explorative data analysis and build a prediction model for the hourly utilization “cnt” of this data set: 
### https://archive.ics.uci.edu/ml/datasets/Bike+Sharing+Dataset.

### Imports

In [112]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
%matplotlib inline

### Dataset Description:
    Both hour.csv and day.csv have the following fields, except hr which is not available in day.csv

- instant: record index
- dteday : date
- season : season (1:springer, 2:summer, 3:fall, 4:winter)
- yr : year (0: 2011, 1:2012)
- mnth : month ( 1 to 12)
- hr : hour (0 to 23)
- holiday : weather day is holiday or not (extracted from [Web Link])
- weekday : day of the week
- workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
+ weathersit : 
- 1: Clear, Few clouds, Partly cloudy, Partly cloudy
- 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
- 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
- 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog
- temp : Normalized temperature in Celsius. The values are derived via (t-t_min)/(t_max-t_min), t_min=-8, t_max=+39 (only in hourly scale)
- atemp: Normalized feeling temperature in Celsius. The values are derived via (t-t_min)/(t_max-t_min), t_min=-16, t_max=+50 (only in hourly scale)
- hum: Normalized humidity. The values are divided to 100 (max)
- windspeed: Normalized wind speed. The values are divided to 67 (max)
- casual: count of casual users
- registered: count of registered users
- cnt: count of total rental bikes including both casual and registered

### Load the dataset

In [113]:
def load_df(filepath):
    '''
    load the dataset into the system using pandas.read_csv function
    
    Parameters:
    filepath: Path to the dataset
    
    Returns:
    df: Generated dataframe
    '''
    df=pd.read_csv(filepath)
    return df

In [114]:
def data_prep(df):
    '''Preparation of the dataset into train and test data to used in the models
    
        Parameters:
        df: dataframe name

        Returns:
        X_train,X_test,y_train,y_test: train and test dataset

    '''
    
    #check for missing values:
    assert pd.notnull(df).all().all()
    
    #feature generation from date:
    '''We already have features like Month, Year and dayofweek as column of the dataframe.'''
    df['day'] = pd.DatetimeIndex(df['dteday']).day
    
    # Normalization of the right skewed distribution target variable:
    print('Normalizing count column of df')
    df['cnt'] = np.log(df['cnt'])
    print('Normalization done')
    
    # Drop leakage columns as they generate too good to be true prediction score for the models:
    df = df.drop(['dteday', 'atemp', 'casual', 'registered'],axis=1)
    
    #df[['cnt','temp','hum','windspeed']] = df[['cnt','temp','hum','windspeed']].apply(lambda x: x/x.max(), axis=0)
    '''columns = ['season', 'mnth','hr','weekday','weathersit', 'day']
    for col in columns:
    dummies = pd.get_dummies(df[col], prefix=col, drop_first=False)
    df = pd.concat([df, dummies], axis=1)'''
    
    #Divide dataframe into trainset and testset
    X = df.drop(['cnt'], axis=1)
    y = df['cnt']
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)
    
    return X_train, X_test, y_train, y_test

In [115]:
def generate_model(X_train, X_test, y_train, y_test, regressor):
    
    # The parameters of the regressors are generated after applying GridSearchCV
    
    # Random Forest Regressor
    regressor.fit(X = X_train, y = np.log1p(y_train))
    y_preds = regressor.predict(X_test)
    score = regressor.score(X_test,np.log1p(y_test))
    mae = metrics.mean_absolute_error(y_test, np.exp(y_preds))
    
    return score, mae, y_preds

In [128]:
def test_model(filepath):
    """ load dataset, build feature set, and do learning
        Parameters
        ----------
        fn: file name of dataset
        features: a list of list, each of which is a feature list for different models
        type: str for indicating feature set
        
        Returns
        -------
        predictions and feature-engineered dataset are saved to files
    """
    df = load_df(filepath)
    print(df.head())

    X_train, X_test, y_train, y_test = data_prep(df)
    X_test.to_csv('E:\\Personal\\tasks\\Xtest.csv',index=False)
    y_test.to_csv('E:\\Personal\\tasks\\Ytest.csv',index=False)    
    for i, regressor in enumerate((
        RandomForestRegressor(n_estimators=100, max_depth=20),
        GradientBoostingRegressor(n_estimators=150, max_depth=10, min_samples_leaf=20, learning_rate=0.1),
        SVR(kernel='rbf', C=50)
        )):
        score, mae, y_preds = generate_model(X_train, X_test, y_train, y_test, regressor)
        rname = str(regressor).split('(')[0]
        #print(rname, y_preds)
        print(rname, score, mae)
        results = pd.DataFrame({'hr': X_test.loc[:,'hr'], 'cnt': y_test, 'prediction': np.exp(y_preds)})
        results.to_csv('E:\\Personal\\tasks\\results.csv', index = False, columns=['hr', 'cnt', 'prediction'])
        
    

In [129]:
if __name__ == "__main__":
    test_model(filepath='E:\\Personal\\tasks\\hour.csv')

   instant      dteday  season  yr  mnth  hr  holiday  weekday  workingday  \
0        1  2011-01-01       1   0     1   0        0        6           0   
1        2  2011-01-01       1   0     1   1        0        6           0   
2        3  2011-01-01       1   0     1   2        0        6           0   
3        4  2011-01-01       1   0     1   3        0        6           0   
4        5  2011-01-01       1   0     1   4        0        6           0   

   weathersit  temp   atemp   hum  windspeed  casual  registered  cnt  
0           1  0.24  0.2879  0.81        0.0       3          13   16  
1           1  0.22  0.2727  0.80        0.0       8          32   40  
2           1  0.22  0.2727  0.80        0.0       5          27   32  
3           1  0.24  0.2879  0.75        0.0       3          10   13  
4           1  0.24  0.2879  0.75        0.0       0           1    1  
Normalizing count column of df
Normalization done
RandomForestRegressor 0.89332532398 0.99239923003