# DC Bikeshare
## Contents: 
### 1. Exploratory Data Analysis
### 2. Preprocessing
### 3. Machine Learning 

# Exploratory Data Analysis

In [None]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

In [None]:
# Switch the working directory so the CSV files can be opened by bare name.
os.chdir(r'../input/')

# Competition download locations, kept for reference only.
# NOTE(review): test_url repeats the train.csv address - presumably it
# should point at test.csv; confirm before re-enabling.
#train_url = r'https://www.kaggle.com/c/bike-sharing-demand/download/train.csv'
#test_url = r'https://www.kaggle.com/c/bike-sharing-demand/download/train.csv'

# Load the training set and preview its first rows.
df_train = pd.read_csv('train.csv')
df_train.head()


In [None]:
def null_percentage(column):
    """Print how many values of a pandas Series are missing.

    Args:
        column: Series to inspect; its ``name`` is used in the message.

    Prints one line containing the whole-number percentage (rounded down)
    and the absolute count of null entries. Nothing is returned.
    """
    df_name = column.name
    nans = int(column.isnull().sum())
    total = column.size
    # Integer arithmetic avoids float truncation (0.29 * 100 -> 28.99...),
    # and an empty column is reported as 0% instead of dividing by zero.
    perc = nans * 100 // total if total else 0
    print('%d%% of values or %d missing from %s column.' % (perc, nans, df_name))

def check_null(df, columns):
    """Print the missing-value summary of every listed column of ``df``.

    Args:
        df: DataFrame to inspect.
        columns: iterable of column labels present in ``df``.
    """
    for name in columns:
        series = df[name]
        null_percentage(series)


# Report missing values for every column of the training data.
check_null(df_train, df_train.columns)

This dataset is shockingly clean. 

Extract hour and month data from datetime

In [None]:
# The slices below assume 'datetime' holds strings with the month at
# characters 5-6 and the hour at characters 11-12 (e.g. 'YYYY-MM-DD HH:MM:SS').
# The vectorised .str accessor replaces a Python-level call per row.
df_train['month'] = df_train['datetime'].str[5:7].astype(int)
df_train['hour'] = df_train['datetime'].str[11:13].astype(int)

The absolute temperature scale is arbitrary, so convert the "feels like" temperature (`atemp`) into its deviation from the median, about 24, which is a comfortable temperature. 

In [None]:
# Express the "feels like" temperature as its distance from the median.
median_temp = df_train['atemp'].median()
# Column arithmetic is vectorised; no row-wise apply is needed.
df_train['temp_dev'] = df_train['atemp'] - median_temp

Look at ridership by month, and peek at the difference in casual and registered ridership. 

In [None]:
# Ridership by month: one figure per rider group, each with two stacked
# boxplots (split by working day on top, by holiday below).
monthly_figures = (
    ('Daily rides by month', 'count'),
    ('Casual', 'casual'),
    ('Registered', 'registered'),
)

for figure_name, target in monthly_figures:
    plt.figure(figure_name, figsize=(10, 15))
    for row, split_by in enumerate(('workingday', 'holiday'), start=1):
        plt.subplot(2, 1, row)
        sns.boxplot(x='month', y=target, hue=split_by, data=df_train)
    plt.show()

Registered use is higher on working days while casual ridership is higher on weekends. This could suggest tourist traffic and commuter traffic making up a significant portion of usage at different times. 

Five holidays have a significant impact on ridership, and they affect casual and registered ridership differently. Locals leave town on holidays, while tourists come. I want to create a separate feature for each of these holidays because they will have different impacts on the model. 

In [None]:
# One indicator column per holiday month: 1 on holiday rows that fall in
# that month, 0 everywhere else.
for holiday_month in (4, 7, 9, 10, 11):
    flag = 'holiday_%d' % holiday_month
    in_holiday = (df_train['month'] == holiday_month) & (df_train['holiday'] == 1)
    df_train.loc[in_holiday, flag] = 1
    # Rows not touched above are still NaN; turn them into explicit zeros.
    df_train.loc[df_train[flag].isnull(), flag] = 0

Let's look at seasons. They are grouped by threes, so Jan-Mar is 1, Apr-Jun is 2, Jul-Sept is 3, Oct-Dec is 4. 

In [None]:
# Three stacked boxplots per season (total, casual, registered), each split
# by working day.
plt.figure('Total Rides by Season', figsize=(10, 15))
for row, target in enumerate(('count', 'casual', 'registered'), start=1):
    plt.subplot(3, 1, row)
    sns.boxplot(x='season', y=target, hue='workingday', data=df_train)

plt.show()

Look at the impact "feels like" temp has on ridership. The temperature is deviation from the median temperature rather than the actual temperature. 

In [None]:
# Second-order fit of ridership against the temperature deviation, with
# the x values grouped into 10 bins, for each rider group.
plt.figure('Ridership v Temp', figsize=(10, 15))
for row, target in enumerate(('count', 'casual', 'registered'), start=1):
    plt.subplot(3, 1, row)
    sns.regplot(x='temp_dev', y=target, data=df_train, x_bins=10, order=2)
plt.show()

Let's check this against the actual temp.

In [None]:
# Same second-order fit as for temp_dev, but against the measured 'temp'.
plt.figure('Ridership v Actual Temp', figsize=(10, 15))
for row, target in enumerate(('count', 'casual', 'registered'), start=1):
    plt.subplot(3, 1, row)
    sns.regplot(x='temp', y=target, data=df_train, x_bins=10, order=2)
plt.show()

Humidity: 

In [None]:
# Second-order fit of ridership against humidity for each rider group.
plt.figure('Ridership v Humidity', figsize=(10, 15))
for row, target in enumerate(('count', 'casual', 'registered'), start=1):
    plt.subplot(3, 1, row)
    sns.regplot(x='humidity', y=target, data=df_train, x_bins=10, order=2)
plt.show()

Ridership is more strongly correlated with humidity than I expected. It drops off to just about nothing as humidity approaches 100%. 

Now let's look at wind speed and ridership. 

In [None]:
# Third-order fit of ridership against wind speed (15 bins) per rider group.
plt.figure('Ridership v Wind', figsize=(10, 15))
for row, target in enumerate(('count', 'casual', 'registered'), start=1):
    plt.subplot(3, 1, row)
    sns.regplot(x='windspeed', y=target, data=df_train, x_bins=15, order=3)
plt.show()

This is very unexpected, there are more riders on windy days... 

In [None]:
# Distribution of wind speed within each month, to see whether the wind
# effect is seasonal.
plt.figure('Wind by month')
sns.boxplot(x='month', y='windspeed', data=df_train)
plt.show()

This suggests wind speed is a day-to-day variation rather than a seasonal trend. 

Let's check out the weather. 1 is sunny to cloudy, 2 is mist, 3 is light rain, and 4 is heavy rain.  

In [None]:
# Number of rows recorded under each weather category.
df_train.weather.value_counts()

In two years, only one observation was recorded with heavy rain (weather category 4). 

In [None]:
# Ridership per weather category for total, casual and registered riders.
plt.figure('Weather and Ridership', figsize=(10, 15))
for row, target in enumerate(('count', 'casual', 'registered'), start=1):
    plt.subplot(3, 1, row)
    sns.boxplot(x='weather', y=target, data=df_train)
plt.show()

Mist has a small effect on ridership, but rain drops it by about half among both casual and registered riders. With a sample size of one, heavy rain is useless so let's group that in with light rain. 

In [None]:
# Fold weather category 4 into category 3, then re-check the counts.
is_category_4 = df_train['weather'] == 4
df_train.loc[is_category_4, 'weather'] = 3
df_train['weather'].value_counts()

Now let's look at all of the features together: 

In [None]:
def heatmap(df):
    """Show a heatmap of the pairwise correlations of ``df``'s numeric columns.

    Args:
        df: DataFrame to analyse; non-numeric columns are ignored.

    The colour scale is capped at 0.6 and cells are drawn square, without
    annotations.
    """
    plt.figure('heatmap')
    # Keep numeric columns only: string columns such as 'datetime' cannot be
    # correlated, and pandas >= 2.0 raises on them instead of dropping them.
    df_corr = df.select_dtypes(include='number').corr()
    sns.heatmap(df_corr, vmax=0.6, square=True, annot=False)
    plt.yticks(rotation=0)
    plt.xticks(rotation=90)
    plt.show()


heatmap(df_train)

# Preprocessing

Clear EDA dataframe from memory and load the combined data for feature engineering. 

In [None]:
import pandas as pd

# Reload the raw CSVs so none of the EDA-time columns carry over.
df_train = pd.read_csv('train.csv', header=0)
df_test = pd.read_csv('test.csv', header=0)

# Mark where each row came from so the two sets can be separated again
# after the shared feature engineering.
df_train['train'] = 1
df_test['train'] = 0

# ignore_index=True gives every row of the combined frame a unique label;
# keeping both original indexes would leave duplicate labels behind.
df = pd.concat([df_train, df_test], ignore_index=True, axis=0)
del df_test, df_train

#check_null(df, df.columns)

Looks good. 63% is training data and 37% is test data. 

In [None]:
# Month and hour are sliced out of the 'datetime' strings (month at
# characters 5-6, hour at 11-12). The .str accessor is vectorised, unlike a
# row-wise apply.
df['month'] = df['datetime'].str[5:7].astype(int)
df['hour'] = df['datetime'].str[11:13].astype(int)

# "Feels like" temperature expressed as its distance from the median.
median_temp = df['atemp'].median()
df['temp_dev'] = df['atemp'] - median_temp

# One indicator column per holiday month: 1 on holiday rows that fall in
# that month, 0 everywhere else.
for holiday_month in (4, 7, 9, 10, 11):
    flag = 'holiday_%d' % holiday_month
    df.loc[(df['month'] == holiday_month) & (df['holiday'] == 1), flag] = 1
    # Rows not set above are still NaN; make them explicit zeros.
    df.loc[df[flag].isnull(), flag] = 0

I'm going to load all columns into a list so they can be easily commented out for testing. 

In [None]:
# Columns currently selected for modelling. Each entry sits on its own line
# so that it can be switched on or off by (un)commenting it.
columns_used = [
    'weather',
    # 'datetime',
    'season',
    # 'holiday',
    'workingday',
    # 'month',
    # 'temp',
    # 'atemp',
    # 'humidity',
    # 'windspeed',
    'casual',
    'registered',
    # 'count',
    'hour',
    'temp_dev',
    # 'holiday_4',
    # 'holiday_7',
    # 'holiday_9',
    # 'holiday_10',
    # 'holiday_11',
    # The origin flag column is named 'train' (not 'training').
    'train',
]

Break up into testing, training, and submission datasets. 

In [None]:
# Rows flagged train == 0 form the submission set: drop the flag and the
# three ridership columns, then the timestamp, to get the feature matrix.
target_columns = ['casual', 'registered', 'count']
df_submission = df.loc[df['train'] == 0].drop(['train'] + target_columns, axis=1)
X_submission = df_submission.drop(['datetime'], axis=1)

# Rows flagged train == 1 supply the targets and the training features.
df = df.loc[df['train'] == 1]
y_casual, y_count, y_reg = df['casual'], df['count'], df['registered']

X = df.drop(['datetime', 'train'] + target_columns, axis=1)

### Strategy

Because the correlation between features and ridership varies significantly between casual and registered riders, my strategy is going to be creating separate predictive models then summing the results. 

In [None]:
from sklearn.model_selection import train_test_split

# Split once and share the partition between both targets. Two independent
# random splits would put different rows in the casual and registered
# hold-out sets, so adding their predictions and true values row by row
# would combine unrelated observations.
X_train, X_test, yc_train, yc_test, yr_train, yr_test = train_test_split(
    X, y_casual, y_reg, test_size=0.2)

# Each model gets its own copy so later in-place changes cannot interact.
Xc_train, Xr_train = X_train, X_train.copy()
Xc_test, Xr_test = X_test, X_test.copy()

In [None]:
# Feature scaling
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()

# Learn the mean/variance once, from training rows only.
Xc_train = sc_X.fit_transform(Xc_train)

# Everything else is only transformed: re-fitting on each set would map the
# same raw value to different scaled values in train, test and submission.
# NOTE(review): a single scaler is shared so that X_submission stays usable
# by both models; if the casual and registered splits hold different rows,
# some registered hold-out rows contribute to these statistics.
Xr_train = sc_X.transform(Xr_train)
Xc_test = sc_X.transform(Xc_test)
Xr_test = sc_X.transform(Xr_test)
X_submission = sc_X.transform(X_submission)

In [None]:
# Random Forest: one regressor per rider group, identically configured.
from sklearn.ensemble import RandomForestRegressor

forest_settings = dict(n_estimators=50, bootstrap=True)

# Casual-rider model, fitted first.
cRegRF = RandomForestRegressor(**forest_settings)
# Registered-rider model.
rRegRF = RandomForestRegressor(**forest_settings)

cRegRF.fit(Xc_train, yc_train)
rRegRF.fit(Xr_train, yr_train)

# Hold-out predictions for each group.
yc_pred = cRegRF.predict(Xc_test)
yr_pred = rRegRF.predict(Xr_test)


In [None]:
def RMSLE(y_true, y_pred):
    """Return the root mean squared logarithmic error.

    Computes ``sqrt(mean((log(pred + 1) - log(true + 1)) ** 2))``.

    Args:
        y_true: sequence or array of observed values.
        y_pred: sequence or array of predicted values, same shape as
            ``y_true``. Values are paired by position.

    Returns:
        The error as a NumPy float.

    Raises:
        ValueError: if the inputs are empty or their shapes differ (pairing
            them would otherwise silently drop the unmatched tail).
    """
    true = np.asarray(y_true, dtype=float)
    pred = np.asarray(y_pred, dtype=float)
    if true.shape != pred.shape:
        raise ValueError('y_true and y_pred must have the same shape, got %s and %s'
                         % (true.shape, pred.shape))
    if true.size == 0:
        raise ValueError('RMSLE is undefined for empty input')
    # log1p(x) == log(x + 1), evaluated element-wise without a Python loop.
    log_diff = np.log1p(pred) - np.log1p(true)
    return np.sqrt(np.mean(log_diff ** 2))

# Total ridership is the element-wise sum of the casual and registered values.
# NOTE(review): this assumes both hold-out sets list the same rows in the
# same order - confirm against how the splits were made.
y_pred_tot = np.asarray(yc_pred) + np.asarray(yr_pred)
y_true_tot = np.asarray(yc_test) + np.asarray(yr_test)

# Print a label followed by the RMSLE for each rider group and the total.
for label, observed, predicted in (
        ('casual', yc_test, yc_pred),
        ('registered', yr_test, yr_pred),
        ('total', y_true_tot, y_pred_tot)):
    print(label)
    print(RMSLE(np.array(observed), np.array(predicted)))


This was an okay first attempt I think, but the submission score was terrible. I'm going to try some different engineered features next, like "weekday rush hour" and "weekend afternoon."

Unfortunately, I didn't realize until attempting backward elimination that the odd split between training and test data renders a few features like "holiday" useless. I will need to start again from the beginning. 