In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor as RFR

train = pd.read_csv('./data/input/train.csv')
test = pd.read_csv('./data/input/test.csv')

In [3]:
def rmsle(y, y_):
    log1 = np.nan_to_num(np.log(y + 1))
    log2 = np.nan_to_num(np.log(y_ + 1))
    calc = (log1 - log2) ** 2
    return np.sqrt(np.mean(calc))

In [4]:
train['year'] = train.datetime.apply(lambda x: x.split()[0].split('-')[0]).astype('int')
train['month'] = train.datetime.apply(lambda x: x.split()[0].split('-')[1]).astype('int')
train['day'] = train.datetime.apply(lambda x: x.split()[0].split('-')[2]).astype('int')
train['hour'] = train.datetime.apply(lambda x: x.split()[1].split(':')[0]).astype('int')

test['year'] = test.datetime.apply(lambda x: x.split()[0].split('-')[0]).astype('int')
test['month'] = test.datetime.apply(lambda x: x.split()[0].split('-')[1]).astype('int')
test['day'] = test.datetime.apply(lambda x: x.split()[0].split('-')[2]).astype('int')
test['hour'] = test.datetime.apply(lambda x: x.split()[1].split(':')[0]).astype('int')

In [5]:
drop_cols = ['datetime', 'count', 'casual', 'registered']
test_use_cols = [col for col in train.columns if col not in drop_cols]

train_X, valid_X, train_y, valid_y = train_test_split(
    train.drop(columns=drop_cols),
    train.loc[:, 'count'],
    test_size=0.3, random_state=0)

In [6]:
model_rf = RFR(random_state=0)
model_rf.fit(train_X, train_y)

train_pred = model_rf.predict(train_X)
print('train score: ', rmsle(train_y, train_pred))

valid_pred = model_rf.predict(valid_X)
print('valid score: ', rmsle(valid_y, valid_pred))

train score:  0.15852735466708315
valid score:  0.35234408097265035


In [7]:
# GridSearchCV
if False:
    params = [{
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30],
        'min_samples_leaf': [5, 10, 15]
    }]

    gscv = GridSearchCV(
        RFR(random_state=0, max_features='log2'),
        param_grid=params,
        cv=3,
        verbose=2
        )

    gscv.fit(train_X, train_y)

In [8]:
# GridSearchCV check params
if False:
    model_rf_best = gscv.best_estimator_
    print(gscv.best_params_)

    train_pred = model_rf_best.predict(train_X)
    print('train score: ', rmsle(train_y, train_pred))

    valid_pred = model_rf_best.predict(valid_X)
    print('valid score: ', rmsle(valid_y, valid_pred))

    print(pd.DataFrame(gscv.cv_results_).sort_values('rank_test_score').T)

In [9]:
model_rf_best = RFR(random_state=0, max_depth=20, min_samples_leaf=5, n_estimators=200)
model_rf_best.fit(train_X, train_y)

pred = model_rf_best.predict(train_X)
print('train score: ', rmsle(train_y, pred))

pred = model_rf_best.predict(valid_X)
print('valid score: ', rmsle(valid_y, pred))

train score:  0.28304786806763416
valid score:  0.3629093510948012


In [10]:
model_rf_test = RFR(random_state=0, max_depth=20, min_samples_leaf=5, n_estimators=200)
model_rf_test.fit(train.drop(columns=drop_cols), train.loc[:, 'count'])

pred = model_rf_test.predict(test.loc[:, test_use_cols])
df_submit = pd.DataFrame({'datetime': test.datetime, 'count': pred})
df_submit.to_csv('./data/output/submittion_RFR.csv', index=False)

In [11]:
# StandardScaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(train_X)
train_X_scaled = scaler.transform(train_X)
valid_X_scaled = scaler.transform(valid_X)

In [30]:
# Support Vector Machine Regressor
from sklearn import svm
model_svm = svm.SVR(kernel='rbf', C=1, epsilon=0.1, gamma='auto')
model_svm.fit(train_X_scaled, train_y)

In [31]:
pred = model_svm.predict(train_X_scaled)
print('train score: ', rmsle(train_y, pred))

pred = model_svm.predict(valid_X_scaled)
print('valid score: ', rmsle(valid_y, pred))

linear score:  1.2172661085927017
poly score:  1.3758377659944667
rbf score:  1.0613217756749298


  log2 = np.nan_to_num(np.log(y_ + 1))
  log2 = np.nan_to_num(np.log(y_ + 1))
  log2 = np.nan_to_num(np.log(y_ + 1))


In [45]:
print(train_y.mean()+train_y.std()*3)
print(train_y.mean()-train_y.std()*3)

734.9419583778013
-354.2422208449928


In [51]:
if False:
    gscv = GridSearchCV(
        svm.SVR(kernel='rbf', epsilon=70, gamma='auto'),
        param_grid=[{
            'C': [1600, 3200, 6400]
        }],
        cv=3,
        verbose=1
        )

    gscv.fit(train_X_scaled, train_y)

    model_svm_best = gscv.best_estimator_
    print(gscv.best_params_)

    pred = model_svm_best.predict(train_X_scaled)
    print('train score: ', rmsle(train_y, pred))

    pred = model_svm_best.predict(valid_X_scaled)
    print('valid score: ', rmsle(valid_y, pred))

Fitting 3 folds for each of 3 candidates, totalling 9 fits
{'C': 3200}


  log2 = np.nan_to_num(np.log(y_ + 1))


train score:  1.1063882836679946
valid score:  1.139361511280924


  log2 = np.nan_to_num(np.log(y_ + 1))


In [53]:
model_svm = svm.SVR(kernel='rbf', C=700, epsilon=70, gamma='auto')
model_svm.fit(train_X_scaled, train_y)

pred = model_svm.predict(train_X_scaled)
print('train score: ', rmsle(train_y, pred))

pred = model_svm.predict(valid_X_scaled)
print('valid score: ', rmsle(valid_y, pred))

  log2 = np.nan_to_num(np.log(y_ + 1))


train score:  1.103979978443115
valid score:  1.1180522383132159


  log2 = np.nan_to_num(np.log(y_ + 1))


In [54]:
model_svm_final = svm.SVR(kernel='rbf', C=700, epsilon=70, gamma='auto')
model_svm_final.fit(train.drop(columns=drop_cols), train.loc[:, 'count'])

pred = model_svm_final.predict(test.loc[:, test_use_cols])
df_submit = pd.DataFrame({'datetime': test.datetime, 'count': pred})
df_submit.to_csv('./data/output/submittion_SVM.csv', index=False)