# Bike demand prediction

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Load the train and test CSVs into DataFrames.
train = pd.read_csv("../input/bike-sharing-demand/train.csv")
test = pd.read_csv("../input/bike-sharing-demand/test.csv")
# Preview the first rows of the training frame.
train.head()

In [None]:
# Column dtypes and non-null counts for the training frame.
train.info()

In [None]:
# Column dtypes and non-null counts for the test frame.
test.info()

In [None]:
# Summary statistics of the training columns.
train.describe()

In [None]:
# Summary statistics of the test columns, for comparison with train.
test.describe()

The ranges of the variables in train and test are similar, so for now I do not remove outliers.

<br><br>

## Create variables and visualization
Create new columns from `datetime`.

In [None]:
# Derive calendar features from `datetime` on both frames, in place.
# NOTE: this cell drops `datetime`, so it cannot be re-run without reloading the data.
for df in (train, test):
    # Parse once and use the vectorised .dt accessor instead of per-row Python loops.
    stamps = pd.to_datetime(df["datetime"])
    df["hour"] = stamps.dt.hour
    df["weekday"] = stamps.dt.dayofweek
    df["month"] = stamps.dt.month
    df["year"] = stamps.dt.year
    # Combined "<year>_<season>" key, used later to join a per-season statistic.
    df["year_season"] = df["year"].astype(str) + "_" + df["season"].astype(str)
    # Encode the year as a binary flag (2011 -> 1, 2012 -> 0).
    df["year"] = df["year"].map({2011: 1, 2012: 0})
    df.drop("datetime", axis=1, inplace=True)

Look at each variable's distribution with distplot and countplot.

In [None]:
# Grid of per-column distributions for the first 11 columns of `train`.
sns.set_style("darkgrid")
plt.figure(figsize=(15, 10))
plt.suptitle('variables distribution')
plt.subplots_adjust(hspace=0.5, wspace=0.3)
for i, col in enumerate(train.columns[:11]):
    plt.subplot(3, 4, i + 1)
    # Integer columns with at most 5 distinct values are treated as categorical.
    is_small_int = train[col].dtype.kind == "i" and len(train[col].unique()) <= 5
    if is_small_int:
        # Pass the series by keyword: newer seaborn does not read a positional
        # first argument as `x`.
        sns.countplot(x=train[col])
    else:
        sns.distplot(train[col])
    plt.ylabel(col)

See the relation between the categorical predictors and the outcomes with bar plots.

In [None]:
# 9 x 3 grid: one row per categorical predictor, one column per outcome.
plt.figure(figsize=(13, 20))
plt.suptitle('casual vs registered vs count')
plt.subplots_adjust(hspace=0.5, wspace=0.3)
col_list = ["season", "holiday", "workingday", "weather", "year", "year_season", "month", "weekday", "hour"]
count_list = ["casual", "registered", "count"]

for i, col in enumerate(col_list):
    for j, con in enumerate(count_list):
        plt.subplot(len(col_list), len(count_list), len(count_list) * i + j + 1)
        # x/y must be keywords: positional x/y fail on seaborn >= 0.12.
        sns.barplot(x=train[col], y=train[con])

For the total count, there is little difference across the categories of holiday, workingday and weekday.
For registered and casual, however, the values do depend on the category, so these need to be examined separately.

See the relationship between weekday and each count, split by workingday and holiday.

In [None]:
# Side-by-side bar plots of casual / registered rentals per weekday, split by workingday.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
for ax, outcome in zip(axes, ("casual", "registered")):
    sns.barplot(x="weekday", y=outcome, hue="workingday", data=train, ax=ax)

There are no holidays on Tuesday and Thursday.
And there are differences on Monday, Wednesday, and Friday.

In [None]:
# Preview the training frame again, now including the derived calendar columns.
train.head()

See the relationship between hour and each count, split by workingday and holiday.

In [None]:
# 2 x 2 grid of hourly profiles: rows are casual / registered, columns are the
# workingday / holiday split.
fig, axes = plt.subplots(2, 2, figsize=(18, 11))
panels = [(outcome, split)
          for outcome in ("casual", "registered")
          for split in ("workingday", "holiday")]
for ax, (outcome, split) in zip(axes.ravel(), panels):
    sns.pointplot(x="hour", y=outcome, hue=split, data=train, ax=ax)

The numbers of registered and casual rentals show opposite patterns with respect to workingday and holiday.
There are also differences in the number of registered rentals by workingday around the hours when people go to and leave the office.
So many registered users can be expected to be commuting workers.

## correlation

In [None]:
# Correlation heatmap of the numeric columns.
plt.figure(figsize=(11, 11))
# `train` still holds the string column `year_season`; restrict to numeric columns
# explicitly, because pandas >= 2.0 raises on non-numeric data in DataFrame.corr().
numeric_train = train.select_dtypes(include="number")
sns.heatmap(numeric_train.corr(), annot=True, cmap="Blues")

temp and atemp are highly correlated, and so are registered and count.
Windspeed has a low correlation (<= 0.1) with the outcomes.
See the scatterplot of temp and atemp.

In [None]:
# temp vs. atemp scatter for train (left panel) and test (right panel).
for i, df in enumerate((train, test)):
    panel = plt.subplot(1, 2, i + 1)
    sns.scatterplot(x='temp', y='atemp', data=df, ax=panel)

In the train data there is a strange pattern that does not appear in the test data.
It seems that atemp contains some wrong values.
So, based on the correlation and the scatterplot, I decided to remove atemp.

Based on the above results, create new variables.

In [None]:
# Engineer interaction and rush-hour features on both frames, in place.
# NOTE: not idempotent. Re-running log-transforms windspeed again, and dropping
# `atemp` a second time raises. Reload the data before re-running.
for df in (train, test):
    df['windspeed'] = np.log1p(df['windspeed'])
    df["weekday_working"] = df["weekday"] * df["workingday"]
    df["weekday_holiday"] = df["weekday"] * df["holiday"]

    # Vectorised boolean masks replace the row-wise apply(axis=1) lambdas.
    # Series.between is inclusive on both ends.
    hour = df["hour"]
    work_day = df["workingday"] == 1
    off_day = df["workingday"] == 0
    holiday = df["holiday"] == 1
    non_holiday = df["holiday"] == 0

    # Hour windows in which casual rentals are flagged.
    df['casual_workhour'] = (off_day & hour.between(10, 19)).astype(int)
    df['casual_holi_hour'] = (holiday & hour.between(9, 22)).astype(int)

    # Hour windows in which registered rentals are flagged: 6-8 and 17-20 on
    # working days, 10-15 on non-working days.
    commute_hours = hour.between(6, 8) | hour.between(17, 20)
    df['register_workhour'] = (
        (work_day & commute_hours) | (off_day & hour.between(10, 15))
    ).astype(int)
    df['register_holi_hour'] = (
        non_holiday & (hour.between(7, 8) | hour.between(17, 18))
    ).astype(int)

    df.drop('atemp', axis=1, inplace=True)

# Median training count per year-season, joined onto both frames as a feature.
by_season = train.groupby('year_season')[['count']].median()
by_season.columns = ['count_season']
train1 = train.join(by_season, on='year_season').drop('year_season', axis=1)
test1 = test.join(by_season, on='year_season').drop('year_season', axis=1)

#### Divide predictors and outcomes, and log-transform the outcomes to normalize them.

In [None]:
from sklearn.model_selection import train_test_split
# Outcome columns; every other column of train1 is a predictor.
y_list = ["casual", "registered", "count"]
train_x = train1.drop(y_list, axis=1)
# log(y + 1) transform of the three outcomes.
train_y = np.log(train1[y_list] + 1)

## Modeling

#### - 1. lightgbm + cross validation

Use a LightGBM model with K-fold cross-validation to guard against overfitting.

In [None]:
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

folds = KFold(n_splits=5, shuffle=True, random_state=123)
rms1, rms2 = [], []        # per-fold validation RMSE on the log scale: registered, casual
models1, models2 = [], []  # per-fold boosters: registered, casual

# The hyper-parameters are identical for every fold and target, so build them once.
lgb_param = {
    'boosting_type': 'gbdt',
    'num_leaves': 45,
    'max_depth': 30,
    'learning_rate': 0.01,
    'bagging_fraction': 0.9,
    'bagging_freq': 20,
    'colsample_bytree': 0.9,
    'metric': 'rmse',
    'min_child_weight': 1,
    'min_child_samples': 10,
    'zero_as_missing': True,
    'objective': 'regression',
}


def _fit_lgb_fold(x_trn, y_trn, x_vl, y_vl):
    """Train one booster on a fold, early-stopping on the fold's validation split."""
    train_set = lgb.Dataset(x_trn, y_trn, silent=False)
    valid_set = lgb.Dataset(x_vl, y_vl, silent=False)
    # NOTE(review): `silent`, `early_stopping_rounds` and `verbose_eval` belong to the
    # older LightGBM API this notebook was written for; confirm before upgrading.
    return lgb.train(params=lgb_param, train_set=train_set, num_boost_round=5000,
                     early_stopping_rounds=100, verbose_eval=500, valid_sets=valid_set)


for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train1)):
    # KFold yields positional indices, so select rows with .iloc
    # (.ix has been removed from pandas).
    x_train, y_train = train_x.iloc[trn_idx], train_y.iloc[trn_idx]
    x_val, y_val = train_x.iloc[val_idx], train_y.iloc[val_idx]

    lgb_model1 = _fit_lgb_fold(x_train, y_train["registered"], x_val, y_val["registered"])
    lgb_model2 = _fit_lgb_fold(x_train, y_train["casual"], x_val, y_val["casual"])
    models1.append(lgb_model1)
    models2.append(lgb_model2)

    # Record out-of-fold RMSE so the CV quality of each target is visible.
    rms1.append(np.sqrt(mean_squared_error(y_val["registered"], lgb_model1.predict(x_val))))
    rms2.append(np.sqrt(mean_squared_error(y_val["casual"], lgb_model2.predict(x_val))))

print("CV RMSE (log scale) - registered: %.4f, casual: %.4f" % (np.mean(rms1), np.mean(rms2)))

See the feature importance.

In [None]:
# Feature importance of the last fold's `registered` booster.
# Use train_x / models1 directly rather than variables leaked from the CV loop.
last_registered_model = models1[-1]
tmp = pd.DataFrame({'Feature': train_x.columns,
                    'Feature importance': last_registered_model.feature_importance()})
tmp = tmp.sort_values(by='Feature importance', ascending=False)
plt.figure(figsize=(15, 15))
plt.title('Features importance', fontsize=14)
s = sns.barplot(x='Feature', y='Feature importance', data=tmp)
s.set_xticklabels(s.get_xticklabels(), rotation=90);

In [None]:
# Average the fold models' predictions on the test set (log scale).
# Select the columns in training order so the features always line up.
test_features = test1[train_x.columns]

# models1 was trained on `registered`, models2 on `casual`.
fin_regi = np.mean([model.predict(test_features) for model in models1], axis=0)
fin_casual = np.mean([model.predict(test_features) for model in models2], axis=0)

# Invert the log(y + 1) transform; clip at zero because a rental count cannot be negative.
count_pred1 = (np.clip(np.expm1(fin_casual), 0, None)
               + np.clip(np.expm1(fin_regi), 0, None))

#### - 2. lgbmRegressor + crossvalidation + Bayesian optimization

In [None]:
from sklearn.model_selection import KFold 
from sklearn.metrics import mean_squared_error
def lgb_cv(num_leaves, learning_rate, n_estimators, reg_alpha, reg_lambda, min_split_gain, min_child_weight,min_child_samples, colsample_bytree, x_data=None, y_data=None, n_splits=5, output='score'):
    """Cross-validate an LGBMRegressor for one hyper-parameter setting.

    The hyper-parameters may arrive as floats (for example from a Bayesian
    optimiser); `num_leaves`, `n_estimators` and `min_child_samples` are cast
    to int and `colsample_bytree` is clipped to [0, 1].

    Parameters
    ----------
    num_leaves, learning_rate, n_estimators, reg_alpha, reg_lambda,
    min_split_gain, min_child_weight, min_child_samples, colsample_bytree
        Forwarded to `lgb.LGBMRegressor`.
    x_data : DataFrame
        Predictors; rows are selected positionally with `.iloc`.
    y_data : array-like
        Target aligned row-for-row with `x_data`.
    n_splits : int
        Number of (unshuffled) K-fold splits.
    output : {'score', 'model'}
        'score' returns the negative mean validation MSE (higher is better);
        'model' returns the list of fitted fold models.

    Raises
    ------
    ValueError
        If `output` is not 'score' or 'model'.
    """
    # Fail fast rather than silently returning None after an expensive training run.
    if output not in ('score', 'model'):
        raise ValueError("output must be 'score' or 'model', got %r" % (output,))

    # Index the target positionally, like x_data.iloc, so a non-default
    # Series index cannot misalign rows.
    y_values = np.asarray(y_data)

    score = 0
    kf = KFold(n_splits=n_splits)
    models = []
    for train_index, valid_index in kf.split(x_data):
        x_train, y_train = x_data.iloc[train_index], y_values[train_index]
        x_valid, y_valid = x_data.iloc[valid_index], y_values[valid_index]

        model = lgb.LGBMRegressor(
            num_leaves = int(num_leaves),
            learning_rate = learning_rate,
            n_estimators = int(n_estimators),
            reg_alpha = reg_alpha,
            reg_lambda = reg_lambda,
            min_split_gain= min_split_gain,
            min_child_weight = min_child_weight,
            min_child_samples = int(min_child_samples),
            colsample_bytree = np.clip(colsample_bytree, 0, 1),
        )

        model.fit(x_train, y_train)
        models.append(model)

        pred = model.predict(x_valid)
        # Negated so that a maximiser minimises the mean validation MSE.
        score -= mean_squared_error(y_valid, pred)/n_splits

    if output == 'score':
        return score
    return models

In [None]:
from functools import partial 
from bayes_opt import BayesianOptimization
# Objectives with the data bound: one for `casual`, one for `registered`.
func_fixed1 = partial(lgb_cv, x_data=train_x, y_data=train_y["casual"], n_splits=5, output='score')
func_fixed2 = partial(lgb_cv, x_data=train_x, y_data=train_y["registered"], n_splits=5, output='score')

# Both targets share one search space; define it once instead of copy-pasting it.
lgb_search_space = {
    'num_leaves': (30, 100),
    'learning_rate': (0.001, 0.015),
    'n_estimators': (1000, 3000),
    'reg_alpha': (0.0001, 1),
    'reg_lambda': (0.0001, 1),
    'min_split_gain': (0.001, 0.1),
    'min_child_weight': (0.001, 0.1),
    'min_child_samples': (10, 25),
    'colsample_bytree': (0.85, 1.0),
}

# Optimiser for `casual`. Each optimiser gets its own copy of the bounds.
lgbBO = BayesianOptimization(func_fixed1, dict(lgb_search_space), random_state=4321)
lgbBO.maximize(init_points=5, n_iter=20)

# Optimiser for `registered`.
lgbB1 = BayesianOptimization(func_fixed2, dict(lgb_search_space), random_state=4321)
lgbB1.maximize(init_points=5, n_iter=20)

In [None]:
# Best hyper-parameters found for `casual` (params1) and `registered` (params2).
params1 = lgbBO.max['params']
params2 = lgbB1.max['params']

# The optimiser's parameter keys match lgb_cv's argument names, so unpack them
# rather than listing every argument twice.
lgb_models1 = lgb_cv(**params1, x_data=train_x, y_data=train_y["casual"], n_splits=5, output='model')
lgb_models2 = lgb_cv(**params2, x_data=train_x, y_data=train_y["registered"], n_splits=5, output='model')

# Average the fold models' predictions on the test set (log scale).
test_features = test1[train_x.columns]
casual_pred = np.mean([model.predict(test_features) for model in lgb_models1], axis=0)
registered_pred = np.mean([model.predict(test_features) for model in lgb_models2], axis=0)

# Invert the log(y + 1) transform; clip at zero because a rental count cannot be negative.
count_pred2 = (np.clip(np.expm1(casual_pred), 0, None)
               + np.clip(np.expm1(registered_pred), 0, None))

#### - 3. randomforest and gradientboostingregressor

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# Predicted total count on the test set, per model name.
preds = {}
# Hyper-parameters go straight into the constructors; the repeated set_params
# calls before each fit were redundant.
regs = {"gbdt": GradientBoostingRegressor(random_state=0, n_estimators=1500, min_samples_leaf=6),
        "rf": RandomForestRegressor(random_state=0, n_jobs=-1, n_estimators=1500, min_samples_leaf=2)}
for name, reg in regs.items():
    target_preds = {}
    # Fit `casual` first, then `registered`. After the loop `reg` is left
    # fitted on `registered`.
    for target in ("casual", "registered"):
        reg.fit(train_x, train_y[target])
        # Invert the log(y + 1) transform and floor at zero.
        target_preds[target] = np.clip(np.expm1(reg.predict(test1)), 0, None)
    pred_casual = target_preds["casual"]
    pred_registered = target_preds["registered"]
    preds[name] = pred_casual + pred_registered

In [None]:
# Feature importance of the random forest, whose last fit was on `registered`.
# Reference train_x and regs["rf"] explicitly rather than loop variables left
# over from earlier cells.
rf_model = regs["rf"]
tmp = pd.DataFrame({'Feature': train_x.columns, 'Feature importance': rf_model.feature_importances_})
tmp = tmp.sort_values(by='Feature importance', ascending=False)
plt.figure(figsize=(15, 15))
plt.title('Features importance', fontsize=14)
s = sns.barplot(x='Feature', y='Feature importance', data=tmp)
s.set_xticklabels(s.get_xticklabels(), rotation=90);

In [None]:
# Equal-weight blend of the four models' count predictions.
blend_members = [count_pred1, count_pred2, preds['gbdt'], preds['rf']]
pred_mean = sum(blend_members) / 4

# Write the blended counts into the sample submission and save it.
sample = pd.read_csv("../input/bike-sharing-demand/sampleSubmission.csv")
sample["count"] = pred_mean
sample.to_csv("sample.csv", index=False)

The resulting RMSLE is 0.38081.