In [18]:
import csv as csv
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from time import time

In [19]:
train = pd.read_csv('train.csv',header=0)
test = pd.read_csv('test.csv',header = 0)
print (train.shape)
print (test.shape)
train.drop(['casual','registered'],axis=1, inplace=True)
targets = train['count']
combined = train.drop('count',axis=1).append(test)
combined.reset_index(inplace = True, drop =True)
print (combined.shape)
print (targets.shape)

(10886, 12)
(6493, 9)
(17379, 9)
(10886,)


In [20]:
# Creating Hour (nominal categorical feature) and converting it into grouped boolean feature
combined['hour'] = combined.datetime.map(lambda date: date.split(':')[0].split(' ')[1])
combined['hour'] = combined['hour'].astype(int)
combined = combined.drop(['datetime'], axis =1)
combined['is_earlyMorning'] = ((combined['hour']>=0) &(combined['hour']<=6)).astype(int)
combined['is_morning'] = ((combined['hour']>6)&(combined['hour']<=10)).astype(int)
combined['is_day'] = ((combined['hour']>10)&(combined['hour']<=15)).astype(int)
combined['is_evening'] = ((combined['hour']>15)&(combined['hour']<=20)).astype(int)
combined['is_night'] = ((combined['hour']>20)&(combined['hour']<=23)).astype(int)
combined.drop('hour',axis=1,inplace=True)

# Creating humidity groups boolean features
combined['is_VeryLowHumidity'] = ((combined['humidity']>=0)&(combined['humidity']<=19)).astype(int)
combined['is_LowHumidity'] = ((combined['humidity']>=20)&(combined['humidity']<=40)).astype(int)
combined['is_MediumHumidity'] = ((combined['humidity']>40)&(combined['humidity']<=60)).astype(int)
combined['is_HighHumidity'] = ((combined['humidity']>60)&(combined['humidity']<=75)).astype(int)
combined['is_VeryHumidity'] = (combined['humidity']>75).astype(int)
combined.drop('humidity',axis=1,inplace=True)

# Creating windspeed groups boolean features
# combined['is_LowWindSpeed'] = ((combined['windspeed']>=0)&(combined['windspeed']<=10)).astype(int)
# combined['is_MediumWindSpeed'] = ((combined['windspeed']>10)&(combined['windspeed']<=42)).astype(int)
# combined['is_HighWindSpeed'] = (combined['windspeed']>42).astype(int)
# combined.drop('windspeed',axis=1,inplace=True)

In [83]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 19 columns):
season                17379 non-null int64
holiday               17379 non-null int64
workingday            17379 non-null int64
weather               17379 non-null int64
temp                  17379 non-null float64
atemp                 17379 non-null float64
is_earlyMorning       17379 non-null int32
is_morning            17379 non-null int32
is_day                17379 non-null int32
is_evening            17379 non-null int32
is_night              17379 non-null int32
is_VeryLowHumidity    17379 non-null int32
is_LowHumidity        17379 non-null int32
is_MediumHumidity     17379 non-null int32
is_HighHumidity       17379 non-null int32
is_VeryHumidity       17379 non-null int32
is_LowWindSpeed       17379 non-null int32
is_MediumWindSpeed    17379 non-null int32
is_HighWindSpeed      17379 non-null int32
dtypes: float64(2), int32(13), int64(4)
memory usage: 1.7 MB


In [84]:
# categories_windspeed = combined.windspeed.unique()
# categories_windspeed = np.sort(categories_windspeed)
# MeanCountwindspeed = []
# train = combined.loc[0:10885,:]
# train = pd.concat([targets,train],axis=1)
# for i in categories_windspeed:
#     MeanCountwindspeed.append(train.loc[train['windspeed'] == i,'count'].mean())
# plt.plot(categories_windspeed, MeanCountwindspeed)
# plt.show()
# categories_windspeed.size

In [85]:
# categories_humidity = combined.humidity.unique()
# categories_humidity = np.sort(categories_humidity)
# MeanCountHumidity = []
# train = combined.loc[0:10885,:]
# train = pd.concat([targets,train],axis=1)
# for i in categories_humidity:
#     MeanCountHumidity.append(train.loc[train['humidity'] == i,'count'].mean())
# plt.plot(categories_humidity, MeanCountHumidity)
# plt.show()
# categories_humidity.size

In [86]:
# categories_atemp = combined.atemp.unique()
# categories_atemp = np.sort(categories_atemp)
# MeanCountAtemp = []
# train = combined.loc[0:10885,:]
# train = pd.concat([targets,train],axis=1)
# for i in categories_atemp:
#     MeanCountAtemp.append(train.loc[train['atemp'] == i,'count'].mean())
# plt.plot(categories_atemp, MeanCountAtemp)

# categories_temp = combined.temp.unique()
# categories_temp = np.sort(categories_temp)
# MeanCounttemp=[]
# train = combined.loc[0:10885,:]
# train = pd.concat([targets,train],axis=1)
# for i in categories_temp:
#     MeanCounttemp.append(train.loc[train['temp'] == i,'count'].mean())
# plt.plot(categories_temp, MeanCounttemp)

# plt.show()

In [87]:
# MeanCountHourly = []
# train = combined.loc[0:10885,:]
# train = pd.concat([targets,train],axis=1)
# train.info()
# for i in range(0,24):
#     MeanCountHourly.append(train.loc[train['hour'] == i,'count'].mean())
# plt.plot(range(0,24),MeanCountHourly)
# plt.show()

In [21]:
#Recombining
train = combined.loc[0:10885,:]
test = combined.loc[10886:17378,:]

In [22]:
FeatureImp = pd.DataFrame()
FeatureImp['Feature'] = train.columns

train = preprocessing.scale(train)
test = preprocessing.scale(test)

clf  = RandomForestRegressor()

In [23]:
# Grid Search 
parameter_grid = {
                 'n_estimators' : [200, 210, 230, 240],
                 'max_depth': [4,5,6,7,9,10]
                 }
grid_search = GridSearchCV(clf, param_grid=parameter_grid, cv=5)
grid_search.fit(train, targets)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

Best score: 0.2719287150790515
Best parameters: {'max_depth': 9, 'n_estimators': 230}


In [24]:
## Training Accuracy
clf  = RandomForestRegressor(n_estimators = 230, max_depth =9)
clf.fit(train, targets)
print ('Training Error: ', mean_squared_error(targets, clf.predict(train)))

Training Error:  8410.82736943


In [25]:
## Feature Importances
FeatureImp['Importance'] = clf.feature_importances_
FeatureImp.sort(['Importance'],ascending = False)

  app.launch_new_instance()


Unnamed: 0,Feature,Importance
7,is_earlyMorning,0.425899
4,temp,0.119744
10,is_evening,0.109827
11,is_night,0.053813
5,atemp,0.048901
0,season,0.045764
2,workingday,0.041371
9,is_day,0.040696
8,is_morning,0.028604
16,is_VeryHumidity,0.028564


In [None]:
print (np.mean(cross_val_score(clf, train, targets, cv = 5)))

In [26]:
pred = clf.predict(test)
pred[pred<0] = 0
dfPred = pd.DataFrame()
test1 = pd.read_csv('test.csv',header = 0)
dfPred['datetime'] = test1.datetime
dfPred['count'] = pd.DataFrame(pred)
dfPred.to_csv('mySubmissionBSD.csv', index=False)

In [17]:
dfPred

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,12.283810
1,2011-01-20 01:00:00,10.861005
2,2011-01-20 02:00:00,10.861005
3,2011-01-20 03:00:00,12.298192
4,2011-01-20 04:00:00,12.298192
5,2011-01-20 05:00:00,12.096622
6,2011-01-20 06:00:00,15.655072
7,2011-01-20 07:00:00,164.907655
8,2011-01-20 08:00:00,164.907655
9,2011-01-20 09:00:00,170.229278
