# 1. Importing stuff

In [1]:
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import numpy as np

# 2. Reading data

In [39]:
df = pd.read_csv('train (1).csv')
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [3]:
df['datetime'] = pd.to_datetime(df['datetime'])
df['month'] = df['datetime'].dt.month
df['hour'] = df['datetime'].dt.hour
df['weekday'] = df['datetime'].dt.weekday
df.head(0)
df.corr()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,month,hour,weekday
season,1.0,0.029368,-0.008126,0.008879,0.258689,0.264744,0.19061,-0.147121,0.096758,0.164011,0.163439,0.971524,-0.006546,-0.010553
holiday,0.029368,1.0,-0.250491,-0.007074,0.000295,-0.005215,0.001929,0.008409,0.043799,-0.020956,-0.005393,0.001731,-0.000354,-0.191832
workingday,-0.008126,-0.250491,1.0,0.033772,0.029966,0.02466,-0.01088,0.013373,-0.319111,0.11946,0.011594,-0.003394,0.00278,-0.704267
weather,0.008879,-0.007074,0.033772,1.0,-0.055035,-0.055376,0.406244,0.007261,-0.135918,-0.10934,-0.128655,0.012144,-0.02274,-0.047692
temp,0.258689,0.000295,0.029966,-0.055035,1.0,0.984948,-0.064949,-0.017852,0.467097,0.318571,0.394454,0.257589,0.14543,-0.038466
atemp,0.264744,-0.005215,0.02466,-0.055376,0.984948,1.0,-0.043536,-0.057473,0.462067,0.314635,0.389784,0.264173,0.140343,-0.040235
humidity,0.19061,0.001929,-0.01088,0.406244,-0.064949,-0.043536,1.0,-0.318607,-0.348187,-0.265458,-0.317371,0.204537,-0.278011,-0.026507
windspeed,-0.147121,0.008409,0.013373,0.007261,-0.017852,-0.057473,-0.318607,1.0,0.092276,0.091052,0.101369,-0.150192,0.146631,-0.024804
casual,0.096758,0.043799,-0.319111,-0.135918,0.467097,0.462067,-0.348187,0.092276,1.0,0.49725,0.690414,0.092722,0.302045,0.246959
registered,0.164011,-0.020956,0.11946,-0.10934,0.318571,0.314635,-0.265458,0.091052,0.49725,1.0,0.970948,0.169451,0.38054,-0.084427


In [4]:
categoryVariableList = ["hour","weekday","month","season","weather","holiday","workingday"]
for var in categoryVariableList:
    df[var] = df[var].astype("category")

# 3. X, y values

In [5]:
X = df[['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'month', 'hour',
       'weekday']]
y = df['count']

# 4. Train/Test Split

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# 5. Exploratory Data Analysis

In [7]:
df.corr()

Unnamed: 0,temp,atemp,humidity,windspeed,casual,registered,count
temp,1.0,0.984948,-0.064949,-0.017852,0.467097,0.318571,0.394454
atemp,0.984948,1.0,-0.043536,-0.057473,0.462067,0.314635,0.389784
humidity,-0.064949,-0.043536,1.0,-0.318607,-0.348187,-0.265458,-0.317371
windspeed,-0.017852,-0.057473,-0.318607,1.0,0.092276,0.091052,0.101369
casual,0.467097,0.462067,-0.348187,0.092276,1.0,0.49725,0.690414
registered,0.318571,0.314635,-0.265458,0.091052,0.49725,1.0,0.970948
count,0.394454,0.389784,-0.317371,0.101369,0.690414,0.970948,1.0


# Building/Training a Model

In [8]:
from sklearn.tree import DecisionTreeRegressor

In [9]:
dtr = DecisionTreeRegressor()

In [10]:
dtr.fit(X_train, np.log(y_train))

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [11]:
ypred = dtr.predict(X_train)
ypred_test = dtr.predict(X_test)

In [12]:
from sklearn.metrics import mean_squared_log_error

In [13]:
np.sqrt(mean_squared_log_error(y_train, np.exp(ypred)))

0.0037865764284111605

In [14]:
np.sqrt(mean_squared_log_error(y_test, np.exp(ypred_test)))

0.5324448191694592

In [15]:
from sklearn.ensemble import RandomForestRegressor

In [16]:
rfr = RandomForestRegressor()

In [17]:
rfr.fit(X_train, y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [18]:
ypred1 = rfr.predict(X_train)

In [19]:
ypred2 = rfr.predict(X_test)

In [20]:
np.sqrt(mean_squared_log_error(y_train, ypred1))

0.20281569720831089

In [21]:
np.sqrt(mean_squared_log_error(y_test, ypred2))

0.4211485881865081

In [22]:
df_test = pd.read_csv('test.csv')

In [23]:
df_test['datetime'] = pd.to_datetime(df_test['datetime'])

In [24]:
df_test['month'] = df_test['datetime'].dt.month
df_test['hour'] = df_test['datetime'].dt.hour
df_test['weekday'] = df_test['datetime'].dt.weekday
df_test.head(0)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,month,hour,weekday


In [25]:
categoryVariableList = ["hour","weekday","month","season","weather","holiday","workingday"]
for var in categoryVariableList:
    df_test[var] = df_test[var].astype("category")

In [26]:
df_test.drop(['datetime'], inplace=True, axis=1)

In [27]:
df_test.head(0)

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,month,hour,weekday


In [28]:
X_train.head(0)

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,month,hour,weekday


In [29]:
rfr.predict(df_test)

array([ 14.7,   4.7,   4.7, ..., 116.2, 109.2,  49.1])

In [30]:
pred_list = (rfr.predict(df_test)).tolist()

In [31]:
get_datetime = pd.read_csv('test.csv')

In [32]:
csv_final = {'datetime': get_datetime['datetime'],
             'count': pred_list}

In [33]:
csv_ultrafinal = pd.DataFrame(csv_final,columns= ['datetime', 'count'])

In [34]:
csv_ultrafinal.set_index('datetime', inplace=True)

In [35]:
csv_ultrafinal.to_csv(r'carol_bikesharing_prediction_second.csv')

# 7. Cross Validation

In [36]:
from sklearn.model_selection import cross_val_score

In [37]:
acc = cross_val_score(rfr, X_test, y_test, cv=10)
acc

array([0.81880121, 0.81625912, 0.80336831, 0.84213702, 0.79709522,
       0.79135898, 0.76312377, 0.81292626, 0.82339954, 0.79174047])

In [38]:
#Which factors contribute most to the number of bicycles being checkout out over the course of a given day?
#R: Temp/atemp, humidity, hour
#How much of an impact does weather have on demand?
#R: The worst the weather, the less people taking bikes.
#Which stations see the most traffic? (total number of pick-ups and drop-offs)
#Which stations are the most under-utilized?
#How does the behavior of casual users differ from members / subscribers? (e.g. seasonal variation, average duration of each ride)
#How much money is made from casual users (i.e. anything including single-trips to 5-day passes) in each region, per month?
#Any other analysis / visualizations that you think will help them in understanding rider / customer behavior better!