In [25]:
from __future__ import division
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, Normalizer, PolynomialFeatures
from sklearn.svm import SVR
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn import cross_validation, linear_model
import matplotlib.pyplot as plt
from sklearn.lda import LDA
from sklearn import metrics
import numpy as np
from sklearn.lda import LDA
from sklearn.pipeline import Pipeline

In [26]:
bike_sharing_demand = pd.read_csv('./data/input/train.csv')
prediction_data = pd.read_csv('data/input/test.csv')
train_data, train_labels = bike_sharing_demand.ix[:, 'datetime':'windspeed'], bike_sharing_demand.ix[:, 'casual':]
prediction_data = prediction_data.ix[:, 'datetime':'windspeed']
np.random.seed(0)
shuffle = np.random.permutation(np.arange(train_data.shape[0]))
mini_bike_sharing = bike_sharing_demand.ix[shuffle[:100], :]
mini_train_data, mini_train_labels = mini_bike_sharing.ix[:, 'datetime':'windspeed'], mini_bike_sharing.ix[:, 'casual':]
# Let's extract the information
for dataset in (train_data, prediction_data):
    dataset['hour'] = dataset['datetime'].map(lambda x: (datetime.strptime(x, "%Y-%m-%d %H:%M:%S")).hour)
    dataset['weekday'] = dataset['datetime'].map(lambda x: (datetime.strptime(x, "%Y-%m-%d %H:%M:%S")).weekday())
    dataset['month'] = dataset['datetime'].map(lambda x: (datetime.strptime(x, "%Y-%m-%d %H:%M:%S")).month)
    dataset['year'] = dataset['datetime'].map(lambda x: (datetime.strptime(x, "%Y-%m-%d %H:%M:%S")).year)

cv = cross_validation.ShuffleSplit(train_data.shape[0], n_iter=10, random_state=0)
np.random.seed(0)

In [27]:
# Define the Root-Mean-Squared-Log Error function for scoring predictions
def rmsle(actual_values, predicted_values):
    squared_log_errors = (np.log(np.array(predicted_values) + 1) - np.log(np.array(actual_values) + 1)) ** 2
    mean_squared_errors = np.nansum(squared_log_errors) / len(squared_log_errors)
    return np.sqrt(mean_squared_errors)

# LDA 

In [28]:
clf_lda = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('normalizer', Normalizer()),
    ('model', LDA())
])

for i, (train_index, test_index) in enumerate(cv):
    x_train, y_train = train_data.ix[train_index, 'season':], train_labels.ix[train_index, 'count']
    x_test, y_test = train_data.ix[test_index, 'season':], train_labels.ix[test_index, 'count']
    clf_lda.fit(x_train, y_train)
    predicted_values = clf_lda.predict(x_test)
    #print("Cross-Validated Error Loop LDA  {0}: {1}".format(i, bikeStandardizeData.rmsle(y_test, predicted_values)))
    scores.append(clf_lda.score(x_test, y_test))

#print scores
#print [score for score in scores]
print("Mean Score for LDA model : {}".format(np.mean(scores)))


Mean Score for LDA model : 0.499600147744


# Linear Model

In [29]:
clf_linear = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('normalizer', Normalizer()),
    ('model', LinearRegression())
])

scores = []
#cv, train_data, train_labels = bikeStandardizeData.readData()
for i, (train_index, test_index) in enumerate(cv):
    x_train, y_train = train_data.ix[train_index, 'season':], train_labels.ix[train_index, 'count']
    x_test, y_test = train_data.ix[test_index, 'season':], train_labels.ix[test_index, 'count']
    clf_linear.fit(x_train, y_train)
    predicted_values = clf_linear.predict(x_test)
    # print("Cross-Validated Error Loop Linear  {0}: {1}".format(i, bikeStandardizeData.rmsle(y_test, predicted_values)))
    scores.append(clf_linear.score(x_test, y_test))

# print scores
#print [score for score in scores]
print("Mean Score for linear model : {}".format(np.mean(scores)))

Mean Score for linear model : 0.385857244125


#Polynomial Regression with degree 3

In [30]:
clf_poly = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('normalizer', Normalizer()),
    ('poly', PolynomialFeatures(degree=3)),
    ('model', LinearRegression())
])

for i, (train_index, test_index) in enumerate(cv):
    x_train, y_train = train_data.ix[train_index, 'season':], train_labels.ix[train_index, 'count']
    x_test, y_test = train_data.ix[test_index, 'season':], train_labels.ix[test_index, 'count']
    clf_poly.fit(x_train, y_train)
    predicted_values = clf_poly.predict(x_test)
    #print("Cross-Validated Error Loop KNN  {0}: {1}".format(i, bikeStandardizeData.rmsle(y_test, predicted_values)))
    scores.append(clf_poly.score(x_test, y_test))

#print scores
#print [score for score in scores]
print("Mean Score for Polynomial  model : {}".format(np.mean(scores)))

Mean Score for Polynomial  model : 0.500183337095


#KNN Regressor

In [31]:
clf_knn = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('normalizer', Normalizer()),
    ('model', KNeighborsRegressor(n_neighbors=5))
])

for i, (train_index, test_index) in enumerate(cv):
    x_train, y_train = train_data.ix[train_index, 'season':], train_labels.ix[train_index, 'count']
    x_test, y_test = train_data.ix[test_index, 'season':], train_labels.ix[test_index, 'count']
    clf_knn.fit(x_train, y_train)
    predicted_values = clf_knn.predict(x_test)
    #print("Cross-Validated Error Loop KNN  {0}: {1}".format(i, bikeStandardizeData.rmsle(y_test, predicted_values)))
    scores.append(clf_knn.score(x_test, y_test))

#print scores
#print [score for score in scores]
print("Mean Score for KNN model : {}".format(np.mean(scores)))


Mean Score for KNN model : 0.543411500089


# Ridge Regression with polynomial features with degree 3

In [32]:
clf_ridge = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('normalizer', Normalizer()),
    ('poly', PolynomialFeatures(degree=3)),
    ('model', Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
                    normalize=False, solver='auto', tol=0.001))
])

for i, (train_index, test_index) in enumerate(cv):
    x_train, y_train = train_data.ix[train_index, 'season':], train_labels.ix[train_index, 'count']
    x_test, y_test = train_data.ix[test_index, 'season':], train_labels.ix[test_index, 'count']
    clf_ridge.fit(x_train, y_train)
    predicted_values = clf_ridge.predict(x_test)
    #print("Cross-Validated Error Loop KNN  {0}: {1}".format(i, bikeStandardizeData.rmsle(y_test, predicted_values)))
    scores.append(clf_ridge.score(x_test, y_test))

#print scores
#print [score for score in scores]
print("Mean Score for Ridge  model : {}".format(np.mean(scores)))

Mean Score for Ridge  model : 0.560505452821


# Lasso with Polynomial features with degree 3

In [33]:
clf_lasso = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('normalizer', Normalizer()),
    ('poly', PolynomialFeatures(degree=3)),
    ('model', Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
                    normalize=False, positive=False, precompute=False,
                    tol=0.0001, warm_start=False))
])

for i, (train_index, test_index) in enumerate(cv):
    x_train, y_train = train_data.ix[train_index, 'season':], train_labels.ix[train_index, 'count']
    x_test, y_test = train_data.ix[test_index, 'season':], train_labels.ix[test_index, 'count']
    clf_lasso.fit(x_train, y_train)
    predicted_values = clf_lasso.predict(x_test)
    #print("Cross-Validated Error Loop KNN  {0}: {1}".format(i, bikeStandardizeData.rmsle(y_test, predicted_values)))
    scores.append(clf_lasso.score(x_test, y_test))

#print scores
#print [score for score in scores]
print("Mean Score for Lasso  model : {}".format(np.mean(scores)))


Mean Score for Lasso  model : 0.566113620985


   # Support Vector Regression - RBF

In [34]:

clf_svrbf = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('normalizer', Normalizer()),
    ('model', SVR(kernel='rbf', C=1e3, gamma=0.1))
])
for i, (train_index, test_index) in enumerate(cv):
    x_train, y_train = train_data.ix[train_index, 'season':], train_labels.ix[train_index, 'count']
    x_test, y_test = train_data.ix[test_index, 'season':], train_labels.ix[test_index, 'count']
    clf_svrbf.fit(x_train, y_train)
    predicted_values = clf_svrbf.predict(x_test)
    #print("Cross-Validated Error Loop KNN  {0}: {1}".format(i, bikeStandardizeData.rmsle(y_test, predicted_values)))
    scores.append(clf_svrbf.score(x_test, y_test))

#print scores
#print [score for score in scores]
print("Mean Score for clf_svrbf : {}".format(np.mean(scores)))

Mean Score for clf_svrbf : 0.555016156543


#Support Vector Regression - Linear Kernel

In [35]:
clf_svrlin = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('normalizer', Normalizer()),
    ('model', SVR(kernel='linear', C=1e3))
])

for i, (train_index, test_index) in enumerate(cv):
    x_train, y_train = train_data.ix[train_index, 'season':], train_labels.ix[train_index, 'count']
    x_test, y_test = train_data.ix[test_index, 'season':], train_labels.ix[test_index, 'count']
    clf_svrlin.fit(x_train, y_train)
    predicted_values = clf_svrlin.predict(x_test)
    #print("Cross-Validated Error Loop KNN  {0}: {1}".format(i, bikeStandardizeData.rmsle(y_test, predicted_values)))
    scores.append(clf_svrlin.score(x_test, y_test))

#print scores
#print [score for score in scores]
print("Mean Score for clf_svrlin : {}".format(np.mean(scores)))

Mean Score for clf_svrlin : 0.522593321198


#Support Vector Regression - Polynomial Kernel

In [36]:
clf_svrpoly = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('normalizer', Normalizer()),
    ('model', SVR(kernel='poly', C=1e3, degree=3))
])

for i, (train_index, test_index) in enumerate(cv):
    x_train, y_train = train_data.ix[train_index, 'season':], train_labels.ix[train_index, 'count']
    x_test, y_test = train_data.ix[test_index, 'season':], train_labels.ix[test_index, 'count']
    clf_svrpoly.fit(x_train, y_train)
    predicted_values = clf_svrpoly.predict(x_test)
    #print("Cross-Validated Error Loop KNN  {0}: {1}".format(i, bikeStandardizeData.rmsle(y_test, predicted_values)))
    scores.append(clf_svrpoly.score(x_test, y_test))

#print scores
#print [score for score in scores]
print("Mean Score for clf_svrpoly : {}".format(np.mean(scores)))

Mean Score for clf_svrpoly : 0.493094327572


#Random Forest Regressor

In [37]:
rf = RandomForestRegressor(random_state=0, n_estimators=100)
clf_rf = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('normalizer', Normalizer()),
    ('model', rf)
])

for i, (train_index, test_index) in enumerate(cv):
    x_train, y_train = train_data.ix[train_index, 'season':], train_labels.ix[train_index, 'count']
    x_test, y_test = train_data.ix[test_index, 'season':], train_labels.ix[test_index, 'count']
    clf_rf.fit(x_train, y_train)
    predicted_values = clf_rf.predict(x_test)
    #print("Cross-Validated Error Loop KNN  {0}: {1}".format(i, bikeStandardizeData.rmsle(y_test, predicted_values)))
    scores.append(clf_rf.score(x_test, y_test))

#print scores
#print [score for score in scores]
print("Mean Score for clf_rf : {}".format(np.mean(scores)))


Mean Score for clf_rf : 0.53968546637


# Gradient Boosting Regressor

In [38]:
gbm = GradientBoostingRegressor(loss='ls', alpha=0.95, n_estimators=500, max_depth=4, learning_rate=.01,
                                min_samples_leaf=9, min_samples_split=9)

clf_gbm = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('normalizer', Normalizer()),
    ('model', gbm)
])

for i, (train_index, test_index) in enumerate(cv):
    x_train, y_train = train_data.ix[train_index, 'season':], train_labels.ix[train_index, 'count']
    x_test, y_test = train_data.ix[test_index, 'season':], train_labels.ix[test_index, 'count']
    clf_gbm.fit(x_train, y_train)
    predicted_values = clf_gbm.predict(x_test)
    #print("Cross-Validated Error Loop KNN  {0}: {1}".format(i, bikeStandardizeData.rmsle(y_test, predicted_values)))
    scores.append(clf_gbm.score(x_test, y_test))

#print scores
#print [score for score in scores]
print("Mean Score for clf_gbm : {}".format(np.mean(scores)))


Mean Score for clf_gbm : 0.565670349171
