In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import MeasurePreprocess
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Load the training data; "train.csv" is resolved relative to the notebook's working directory.
dataset=pd.read_csv("train.csv")

In [3]:
# Peek at the first three rows to check the column layout.
dataset.head(n=3)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0,5,27,32


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
dataset.describe()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
count,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0
mean,2.506614,0.028569,0.680875,1.418427,20.23086,23.655084,61.88646,12.799395,36.021955,155.552177,191.574132
std,1.116174,0.166599,0.466159,0.633839,7.79159,8.474601,19.245033,8.164537,49.960477,151.039033,181.144454
min,1.0,0.0,0.0,1.0,0.82,0.76,0.0,0.0,0.0,0.0,1.0
25%,2.0,0.0,0.0,1.0,13.94,16.665,47.0,7.0015,4.0,36.0,42.0
50%,3.0,0.0,1.0,1.0,20.5,24.24,62.0,12.998,17.0,118.0,145.0
75%,4.0,0.0,1.0,2.0,26.24,31.06,77.0,16.9979,49.0,222.0,284.0
max,4.0,1.0,1.0,4.0,41.0,45.455,100.0,56.9969,367.0,886.0,977.0


Extract the hour and month features from the datetime column.

In [5]:
# Derive integer hour-of-day and month features from the timestamp strings
# (e.g. "2011-01-01 00:00:00"). Parsing once and using the vectorised .dt
# accessor replaces the per-row string splitting and the redundant
# pd.DataFrame wrapper around each extracted column.
timestamps = pd.to_datetime(dataset["datetime"])
dataset["hour"] = timestamps.dt.hour.astype("int64")
dataset["month"] = timestamps.dt.month.astype("int64")

# Keep only the model inputs plus the target. 'count' must remain the last
# column because the feature/target split further down is positional.
# .copy() makes the result an independent frame rather than a slice of the
# original, so later column assignments cannot trigger SettingWithCopyWarning.
feature_columns = ['season','holiday','workingday','weather','hour','month','temp','humidity','windspeed']
dataset = dataset[feature_columns + ['count']].copy()
dataset.describe()

Unnamed: 0,season,holiday,workingday,weather,hour,month,temp,humidity,windspeed,count
count,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0
mean,2.506614,0.028569,0.680875,1.418427,11.541613,6.521495,20.23086,61.88646,12.799395,191.574132
std,1.116174,0.166599,0.466159,0.633839,6.915838,3.444373,7.79159,19.245033,8.164537,181.144454
min,1.0,0.0,0.0,1.0,0.0,1.0,0.82,0.0,0.0,1.0
25%,2.0,0.0,0.0,1.0,6.0,4.0,13.94,47.0,7.0015,42.0
50%,3.0,0.0,1.0,1.0,12.0,7.0,20.5,62.0,12.998,145.0
75%,4.0,0.0,1.0,2.0,18.0,10.0,26.24,77.0,16.9979,284.0
max,4.0,1.0,1.0,4.0,23.0,12.0,41.0,100.0,56.9969,977.0


Filter out outliers: drop rows whose count lies more than three standard deviations from the mean.

In [6]:
# Three-sigma rule on the target: both bounds are computed from the
# unfiltered data, then applied together as one strict (exclusive) mask.
count_mean = dataset['count'].mean()
count_std = dataset['count'].std()
upper_bound = count_mean + 3 * count_std
lower_bound = count_mean - 3 * count_std

within_bounds = (dataset["count"] < upper_bound) & (dataset["count"] > lower_bound)
dataset = dataset[within_bounds]

In [7]:
# Positional split: every column except the last is a feature,
# the last column ('count') is the regression target.
feature_frame = dataset.iloc[:, :-1]
target_series = dataset.iloc[:, -1]
X = feature_frame.values
y = target_series.values

Add dummy variables for the categorical features 'season', 'holiday', 'workingday', 'weather', 'hour' and 'month'.

In [8]:
# labelencoder is instantiated here but not used in this cell.
labelencoder = LabelEncoder()

# The first six columns of X (season, holiday, workingday, weather, hour,
# month) are categorical and get one-hot encoded; the remaining columns
# are left as they are.
# NOTE(review): the `categorical_features` argument was deprecated in
# scikit-learn 0.20 and later removed (ColumnTransformer is the documented
# replacement) -- confirm the installed version before upgrading.
categorical_columns = range(6)
onehotencoder = OneHotEncoder(categorical_features=categorical_columns)
X_encoded_sparse = onehotencoder.fit_transform(X)
X = X_encoded_sparse.toarray()

Split the data into training and test sets.

In [9]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

Use GridSearchCV to search for the optimal hyperparameters.

In [10]:
# Stage 1: 5-fold grid search over the number of trees (50, 60, ..., 100),
# with every other forest setting left at its default.
forest_reg01 = RandomForestRegressor(random_state=42)
param_test01 = {'n_estimators': list(range(50, 110, 10))}
gsearch1 = GridSearchCV(forest_reg01, param_test01, cv=5, return_train_score=True)
gsearch1.fit(X_train, y_train)
# %-formatting keeps the printed text identical under Python 2 while also
# being valid Python 3 (the bare print statement is a SyntaxError there).
print("%s %s" % (gsearch1.best_params_, gsearch1.best_score_))

{'n_estimators': 90} 0.819624009469


In [11]:
# Stage 2: with the number of trees fixed at 100, run a 5-fold grid search
# over tree depth and the fraction of features considered at each split.
forest_reg02 = RandomForestRegressor(random_state=42, n_estimators=100)
param_test02 = {
    'max_depth': [18, 20, 22, 24, 26],
    # All entries must be floats to be read as fractions. An integer 1 means
    # "exactly one feature per split", not 100% of the features, so the last
    # value is written as 1.0.
    'max_features': [0.6, 0.7, 0.8, 0.9, 1.0],
}
gsearch02 = GridSearchCV(forest_reg02, param_test02, cv=5, return_train_score=True)
gsearch02.fit(X_train, y_train)
# %-formatting keeps the printed text identical under Python 2 while also
# being valid Python 3 (the bare print statement is a SyntaxError there).
print("%s %s" % (gsearch02.best_params_, gsearch02.best_score_))

{'max_features': 0.6, 'max_depth': 26} 0.822663541485


In [12]:
# Final model: 20-fold cross-validation on the training split, then a fit on
# the full training split and an evaluation on the held-out test split.
# NOTE(review): the grid search above reported max_depth=26 as best, while 25
# is used here and was not part of the searched grid -- confirm this is intentional.
clf = RandomForestRegressor(n_estimators=100, max_features=0.6, max_depth=25, random_state=42)
scores = cross_val_score(clf, X_train, y_train, cv=20)
print("Cross validation scores: %s" % (scores,))

clf = clf.fit(X_train, y_train)
# Rental counts cannot be negative, so floor the predictions at zero.
y_pred = np.maximum(clf.predict(X_test), 0)

# Compute the test MSE once; the original called mean_squared_error twice
# and discarded the first result.
mse = metrics.mean_squared_error(y_test, y_pred)
print("MSE: %s" % mse)

Cross validation scores: [ 0.83060772  0.82928133  0.80844567  0.8461105   0.84119236  0.81526789
  0.83497142  0.82066906  0.81958444  0.81363871  0.81556917  0.83357631
  0.82532069  0.80869001  0.84894884  0.80133625  0.80941508  0.83019554
  0.8190883   0.85632771]
MSE: 4833.25709699


In [13]:
def ErrorRate(test,pred):
    """Return the mean relative absolute error of ``pred`` against ``test``.

    For every position ``i`` the error is
    ``abs(test[i] - pred[i]) / test[i]``. A position whose actual value is
    zero cannot be scaled; it is counted with an error of 1 and a message is
    printed, matching the previous fallback behaviour.

    Parameters
    ----------
    test : indexable sequence of numbers
        Actual values. Its length determines how many positions are compared.
    pred : indexable sequence of numbers
        Predicted values; must be at least as long as ``test``.

    Returns
    -------
    float
        Mean of the per-position errors, or NaN when ``test`` is empty.
    """
    count = len(test)
    if count == 0:
        # Nothing to average; return NaN explicitly instead of letting numpy
        # emit a "mean of empty slice" warning.
        return np.nan

    errors = []
    for i in range(count):
        actual = float(test[i])
        predicted = float(pred[i])
        if actual == 0.0:
            # Explicit check replaces the Python-2-only
            # `except ZeroDivisionError, e` / `e.message` handling.
            print("float division by zero")
            errors.append(1.0)
        else:
            errors.append(abs(actual - predicted) / actual)
    return np.array(errors).mean()

In [14]:
# %-formatting prints the same text under Python 2 and is also valid Python 3.
print("Error rate: %s" % ErrorRate(y_test, y_pred))

Error rate: 0.659506859071
