In [89]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
color = sns.color_palette()
from matplotlib import pyplot as plt
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler, FunctionTransformer, PolynomialFeatures, RobustScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

In [109]:
def prepare_data(df):
    df['datetime'] = pd.to_datetime(df['datetime'])
    df['hour'] = df['datetime'].apply(lambda row: row.hour)
    df['day'] = df['datetime'].apply(lambda row: row.day)
    df['month'] = df['datetime'].apply(lambda row: row.month)
    df['year'] = df['datetime'].apply(lambda row: row.year)
    df['weekday'] = df['datetime'].apply(lambda row: row.weekday())
    return df

In [3]:
def rmsle(pred, target):
    log1 = np.nan_to_num(np.log1p(pred))
    log2 = np.nan_to_num(np.log1p(target))
    calc = (log1 - log2) ** 2
    return np.sqrt(np.mean(calc))

rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

In [4]:
train = pd.read_csv('data/train.csv.gz', compression='gzip')
train = prepare_data(train).drop(['casual', 'registered'], axis=1)
train.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,hour,day,month,year,weekday
0,1,0,0,1,9.84,14.395,81,0.0,16,0,1,1,2011,5
1,1,0,0,1,9.02,13.635,80,0.0,40,1,1,1,2011,5
2,1,0,0,1,9.02,13.635,80,0.0,32,2,1,1,2011,5
3,1,0,0,1,9.84,14.395,75,0.0,13,3,1,1,2011,5
4,1,0,0,1,9.84,14.395,75,0.0,1,4,1,1,2011,5


In [110]:
test = pd.read_csv('data/test.csv.gz', compression='gzip')
test = prepare_data(test)
test.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,hour,day,month,year,weekday
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,0,20,1,2011,3
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0,1,20,1,2011,3
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0,2,20,1,2011,3
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014,3,20,1,2011,3
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014,4,20,1,2011,3


In [5]:
binary = ['holiday', 'workingday']
categorical = ['season', 'weather', 'hour', 'day', 'month', 'year', 'weekday']
numeric = ['temp', 'atemp', 'humidity', 'windspeed']

In [6]:
def means(df, columns, target = 'count'):
    means_ = {}
    for col in columns:
        means_[col]=np.round(df.groupby([col])[target].mean())
    return means_

In [7]:
means(train, categorical)

{'day': day
 1     180.0
 2     184.0
 3     195.0
 4     196.0
 5     190.0
 6     190.0
 7     184.0
 8     179.0
 9     188.0
 10    195.0
 11    196.0
 12    191.0
 13    194.0
 14    196.0
 15    202.0
 16    191.0
 17    206.0
 18    193.0
 19    192.0
 Name: count, dtype: float64, 'hour': hour
 0      55.0
 1      34.0
 2      23.0
 3      12.0
 4       6.0
 5      20.0
 6      76.0
 7     213.0
 8     363.0
 9     222.0
 10    175.0
 11    211.0
 12    257.0
 13    258.0
 14    243.0
 15    254.0
 16    316.0
 17    469.0
 18    431.0
 19    315.0
 20    229.0
 21    173.0
 22    134.0
 23     90.0
 Name: count, dtype: float64, 'month': month
 1      90.0
 2     110.0
 3     148.0
 4     184.0
 5     219.0
 6     242.0
 7     235.0
 8     234.0
 9     234.0
 10    228.0
 11    194.0
 12    176.0
 Name: count, dtype: float64, 'season': season
 1    116.0
 2    215.0
 3    234.0
 4    199.0
 Name: count, dtype: float64, 'weather': weather
 1    205.0
 2    179.0
 3    119.0
 4   

In [8]:
means(train, binary)

{'holiday': holiday
 0    192.0
 1    186.0
 Name: count, dtype: float64, 'workingday': workingday
 0    189.0
 1    193.0
 Name: count, dtype: float64}

In [99]:
def means_encoder(data, columns):
    df = pd.DataFrame(index=data.index)
    t = means(train, columns) # yes, train!!!
    for col in columns:
        df[col+'_mean'] = data[col].apply(lambda c: t[col][c])
    return df

In [43]:
def to_categorical(data, columns):
    df = data.copy()[columns]
    for col in columns:
        df[col] = df[col].astype('category')
    return df

In [58]:
def poly(data, columns):
    return pd.DataFrame(RobustScaler().fit_transform(PolynomialFeatures(degree=2).fit_transform(data[columns])), index=data.index)

In [59]:
categ = ['hour', 'day', 'month', 'weekday']
categ_m = ['season', 'weather', 'year']

y = train['count']
X = pd.concat([means_encoder(train, categ_m), train[categ],train[binary].astype('bool'), poly(train, numeric)], axis=1)
# X = pd.concat([train[categorical],train[binary].astype('bool'), train[numeric]], axis=1)
X.head()

Unnamed: 0,season_mean,weather_mean,year_mean,hour,day,month,weekday,holiday,workingday,0,...,5,6,7,8,9,10,11,12,13,14
0,116.0,205.0,144.0,0,1,1,5,False,False,0.0,...,-0.654422,-0.626388,-0.425785,-0.925551,-0.553655,-0.245368,-0.929607,0.730376,-1.264428,-0.704221
1,116.0,205.0,144.0,1,1,1,5,False,False,0.0,...,-0.685714,-0.659286,-0.503817,-0.925551,-0.584663,-0.314815,-0.929607,0.687097,-1.264428,-0.704221
2,116.0,205.0,144.0,2,1,1,5,False,False,0.0,...,-0.685714,-0.659286,-0.503817,-0.925551,-0.584663,-0.314815,-0.929607,0.687097,-1.264428,-0.704221
3,116.0,205.0,144.0,3,1,1,5,False,False,0.0,...,-0.654422,-0.626388,-0.486853,-0.925551,-0.553655,-0.325135,-0.929607,0.478763,-1.264428,-0.704221
4,116.0,205.0,144.0,4,1,1,5,False,False,0.0,...,-0.654422,-0.626388,-0.486853,-0.925551,-0.553655,-0.325135,-0.929607,0.478763,-1.264428,-0.704221


In [80]:
regressor = RandomForestRegressor(n_estimators=50, random_state=42)
score = cross_val_score(regressor, X, y, cv=5, scoring=rmsle_scorer)
score.mean()

-0.48588259762661307

In [60]:
regressor = linear_model.LinearRegression()
score = cross_val_score(regressor, pd.get_dummies(X, columns=categ), y, cv=5, scoring=rmsle_scorer)
score.mean()

  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()


-1.202774091100405

In [75]:
regressor = XGBRegressor(n_estimators=500, max_depth=10)
score = cross_val_score(regressor, X, y, cv=5, scoring=rmsle_scorer)
score.mean()

  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()


-0.47837759976496202

In [88]:
regressor = KNeighborsRegressor(n_neighbors=3, metric='manhattan', weights='distance', algorithm='kd_tree')
score = cross_val_score(regressor, X, y, cv=5, scoring=rmsle_scorer)
score.mean()

-0.83835810842185976

In [68]:
regressor = SVR()
score = cross_val_score(regressor, X, y, cv=5, scoring=rmsle_scorer)
score.mean()

-1.4590169164894751

In [65]:
from mlxtend.regressor import StackingRegressor

In [93]:
regressor = StackingRegressor(regressors=[
        XGBRegressor(n_estimators=500, max_depth=10), RandomForestRegressor(n_estimators=50), RandomForestRegressor(n_estimators=200)
    ], meta_regressor=RandomForestRegressor(n_estimators=200))

score = cross_val_score(regressor, X, y, cv=5, scoring=rmsle_scorer)
score.mean()

-0.47148786303008483

In [100]:
X_test = pd.concat([means_encoder(test, categ_m), test[categ], test[binary].astype('bool'), poly(test, numeric)], axis=1)
X_test.head()

Unnamed: 0,season_mean,weather_mean,year_mean,hour,day,month,weekday,holiday,workingday,0,...,5,6,7,8,9,10,11,12,13,14
0,116.0,205.0,144.0,0,20,1,3,False,True,0.0,...,-0.63375,-0.662308,-0.672065,0.230764,-0.721741,-0.737764,0.150055,-0.261779,1.243215,2.313847
1,116.0,205.0,144.0,1,20,1,3,False,True,0.0,...,-0.63375,-0.623395,-0.672065,-0.852988,-0.639135,-0.6299,-0.873973,-0.261779,-1.160572,-0.504489
2,116.0,205.0,144.0,2,20,1,3,False,True,0.0,...,-0.63375,-0.623395,-0.672065,-0.852988,-0.639135,-0.6299,-0.873973,-0.261779,-1.160572,-0.504489
3,116.0,205.0,144.0,3,20,1,3,False,True,0.0,...,-0.63375,-0.636337,-0.672065,-0.394467,-0.668275,-0.665776,-0.382966,-0.261779,-0.143562,0.0
4,116.0,205.0,144.0,4,20,1,3,False,True,0.0,...,-0.63375,-0.636337,-0.672065,-0.394467,-0.668275,-0.665776,-0.382966,-0.261779,-0.143562,0.0


In [101]:
regressor.fit(X, y)
pred = regressor.predict(X_test)

In [113]:
prediction = pd.DataFrame(index=test.index)
prediction['datetime'] = test['datetime']
prediction['count'] = pred

prediction.to_csv('submission.csv', index=False)
prediction.head()

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,13.085
1,2011-01-20 01:00:00,6.36
2,2011-01-20 02:00:00,4.865
3,2011-01-20 03:00:00,1.97
4,2011-01-20 04:00:00,2.23
