In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns

In [None]:
%config Completer.use_jedi = False

In [None]:
# Load the training split and preview its first rows
df = pd.read_csv('../input/bike-sharing-demand/train.csv')
df.head()

In [None]:
# Parse the timestamp column so that the `.dt` accessor can be used on it
df['datetime'] = pd.to_datetime(df['datetime'])

In [None]:
def time_features(df):
    """Add calendar columns derived from ``df['datetime']``.

    The frame is modified in place and also returned. Columns added, in
    order: ``year``, ``month``, ``week`` (ISO week number, stored as int32),
    ``date`` (day of month), ``hour`` and ``weekday``.
    """
    stamp = df['datetime'].dt
    df['year'] = stamp.year
    df['month'] = stamp.month
    # isocalendar() yields a nullable unsigned dtype; store a plain int32 instead
    df['week'] = stamp.isocalendar().week.astype('int32')
    df['date'] = stamp.day
    df['hour'] = stamp.hour
    df['weekday'] = stamp.weekday
    return df

In [None]:
def cat_features(df):
    """Replace the season/weather codes with labels and cast columns to category.

    The frame is modified in place and also returned. ``season`` and
    ``weather`` codes 1-4 are mapped to descriptive strings (codes outside
    the mapping become NaN), after which ``season``, ``holiday``,
    ``workingday``, ``weather``, ``month``, ``hour`` and ``weekday`` are
    converted to the pandas ``category`` dtype.
    """
    season_labels = {1: "Spring", 2: "Summer", 3: "Fall", 4: "Winter"}
    weather_labels = {
        1: " Clear + Few clouds + Partly cloudy + Partly cloudy",
        2: " Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist ",
        3: " Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds",
        4: " Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog ",
    }
    df['season'] = df['season'].map(season_labels)
    df['weather'] = df['weather'].map(weather_labels)

    for column in ('season', 'holiday', 'workingday', 'weather', 'month', 'hour', 'weekday'):
        df[column] = df[column].astype('category')
    return df

In [None]:
# Both helpers modify the frame in place and return it; rebinding keeps the flow explicit
df = time_features(df)
df = cat_features(df)

In [None]:
# The raw timestamp is no longer needed once the calendar columns exist
df = df.drop(columns=['datetime'])

In [None]:
# Column dtypes and non-null counts after feature engineering
df.info()

In [None]:
# Re-assert dtypes in one pass: the listed columns as category, 'week' as int32
category_features = ['season','holiday','workingday','weather','month','hour','weekday']
df = df.astype({**dict.fromkeys(category_features, 'category'), 'week': 'int32'})

In [None]:
# Confirm the dtypes after the explicit recast
df.info()

In [None]:
# 2x2 grid of box plots: overall count, then count split by season, hour and working day
fig, axes = plt.subplots(nrows=2,ncols=2)
fig.set_size_inches(12, 10)

# (axis, grouping column or None, axis labels/title) for each panel
panels = [
    (axes[0][0], None, dict(ylabel='Count', title="Box Plot On Count")),
    (axes[0][1], "season", dict(xlabel='Season', ylabel='Count', title="Box Plot On Count Across Season")),
    (axes[1][0], "hour", dict(xlabel='Hour Of The Day', ylabel='Count', title="Box Plot On Count Across Hour Of The Day")),
    (axes[1][1], "workingday", dict(xlabel='Working Day', ylabel='Count', title="Box Plot On Count Across Working Day")),
]
for panel_ax, group_col, text in panels:
    sns.boxplot(data=df, y="count", x=group_col, orient="v", ax=panel_ax)
    panel_ax.set(**text)

In [None]:
# Keep only rows whose count lies within three standard deviations of the mean
count_mean = df["count"].mean()
count_std = df["count"].std()
df = df[(df["count"] - count_mean).abs() <= 3 * count_std]

In [None]:
# Row/column count after the outlier filter
df.shape

In [None]:
# Pairwise correlations between the numeric columns only (categoricals are excluded)
corr_df = df.select_dtypes('number').corr()
sns.heatmap(corr_df)

In [None]:
# Frequency of each recorded windspeed value.
# The Series is passed as `x=` explicitly: the first positional argument of
# countplot is `data`, which is not what is intended here.
plt.figure(figsize=(15,8))
sns.countplot(x=df['windspeed'])

## Filling 0's In windspeed Using Random Forest ##

In [None]:
# Fresh copies of the raw splits, used only for the windspeed imputation model
wind_tr = pd.read_csv("../input/bike-sharing-demand/train.csv")
wind_te = pd.read_csv("../input/bike-sharing-demand/test.csv")

In [None]:
# Parse timestamps so time_features can derive the calendar columns
wind_tr['datetime'] = pd.to_datetime(wind_tr['datetime'])

In [None]:
# Adds year/month/week/date/hour/weekday, which are among the predictors in wind_col
wind_tr = time_features(wind_tr)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Split the rows by whether windspeed was recorded as exactly 0
wind0 = wind_tr[wind_tr['windspeed']==0]
wind1 = wind_tr[wind_tr['windspeed']!=0]
# Fixed seed (same value the train/test split uses) so the imputed values are reproducible
rf_wind = RandomForestRegressor(random_state=42)
wind_col = ['season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity','year', 'month', 'week', 'date', 'hour', 'weekday']
# Fit on the rows with a non-zero reading only
rf_wind.fit(wind1[wind_col],wind1['windspeed'])
# R^2 on the same rows the model was fitted on, i.e. an in-sample (optimistic) figure
rf_wind.score(wind1[wind_col],wind1['windspeed'])

In [None]:
# Overwrite the zero readings with the forest's estimates; rows are selected by boolean mask
zero_wind = wind_tr['windspeed'] == 0
wind_tr.loc[zero_wind, 'windspeed'] = rf_wind.predict(wind0[wind_col])

In [None]:
# Bring the imputed windspeed into df. Values are aligned on the row index, so the
# rows removed from df by the outlier filter are simply not picked up.
# `assign` builds a new frame instead of setting an attribute on a filtered slice.
df = df.assign(windspeed=wind_tr['windspeed'])

# Evaluation metrics

In [None]:
def rmsle(y,y_,converExp=True):
    """Root mean squared logarithmic error between ``y`` and ``y_``.

    Parameters
    ----------
    y : array-like or scalar
        Reference values.
    y_ : array-like or scalar
        Predicted values, same length as ``y``.
    converExp : bool, default True
        When true, both inputs are passed through ``np.exp`` first, i.e. they
        are treated as log-scale values. Note this is a plain ``exp``, not
        ``expm1``.

    Returns
    -------
    float
        ``sqrt(mean((log(y + 1) - log(y_ + 1)) ** 2))``; NaN/inf log terms are
        replaced via ``np.nan_to_num`` before differencing.
    """
    # Work on plain float arrays: values are compared by position, not by pandas index
    actual = np.asarray(y, dtype=float)
    predicted = np.asarray(y_, dtype=float)
    if converExp:
        actual = np.exp(actual)
        predicted = np.exp(predicted)
    log_actual = np.nan_to_num(np.log1p(actual))
    log_predicted = np.nan_to_num(np.log1p(predicted))
    return np.sqrt(np.mean((log_actual - log_predicted) ** 2))

# Model building

In [None]:
# Raw splits for the modelling section (read again, independent of the EDA frame `df`)
data = pd.read_csv('../input/bike-sharing-demand/train.csv')
test = pd.read_csv('../input/bike-sharing-demand/test.csv')

In [None]:
# Parse timestamps, then add the year/month/week/date/hour/weekday columns
data['datetime'] = pd.to_datetime(data['datetime'])
data = time_features(data)

In [None]:
# Columns cast to the category dtype before modelling
category_feature = ['season','holiday','workingday','weather','year','month','week','date','hour','weekday']
# Columns dropped from the feature matrix (includes the target `count`)
remove_feature = ['casual', 'registered', 'count','atemp','datetime']
num_feature = ['temp','humidity']
# NOTE(review): this replaces EVERY windspeed value with the forest's prediction,
# not only the zero readings that were imputed in wind_tr — confirm this is intended
data['windspeed'] = rf_wind.predict(data[wind_col])

In [None]:
# Convert every column listed in category_feature to the category dtype in one call
data = data.astype(dict.fromkeys(category_feature, 'category'))

In [None]:
# Feature matrix: everything except the columns listed in remove_feature
X = data.drop(remove_feature,axis=1)
# Target: log1p of the rental count (models are fitted on this log scale)
y = np.log1p(data['count'])

In [None]:
# Display the feature matrix
X

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.model_selection import GridSearchCV

# Baseline: ordinary least squares fitted on the log1p-scaled target
lModel = LinearRegression().fit(X_train, y_train)
pred = lModel.predict(X_test)
print('RMSLE value for linear regression is ',rmsle(y_test,pred))

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest on the same split; seeded (same value as the split) so the score is repeatable
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train,y_train)
pred = rf.predict(X_test)
# Label corrected: this score belongs to the random forest, not the linear model
print('RMSLE value for random forest is ',rmsle(y_test,pred))

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# NOTE(review): in scikit-learn `alpha` is only used by the huber/quantile losses;
# with the default loss it should have no effect — `learning_rate` may have been intended, confirm
gbm = GradientBoostingRegressor(n_estimators=4000,alpha=0.01); ### Test 0.41
gbm.fit(X_train,y_train)
preds = gbm.predict(X_test)
# Score the GBM's own predictions (`preds`), not `pred` left over from the random-forest cell
print ("RMSLE Value For Gradient Boost: ",rmsle(y_test,preds))

In [None]:
import xgboost as xgb
# Booster hyper-parameters passed to xgb.train
params = {
    'max_depth': 7,
    'gamma' :0,
    'eta':.03,
    'subsample': 1,
    'colsample_bytree': 0.9,
    # 'reg:linear' is the deprecated alias of the squared-error objective
    'objective':'reg:squarederror',
    'eval_metric':'rmse',
    # replaces the removed 'silent' flag; 1 = warnings only (the library default)
    'verbosity': 1
}
def XGBmodel(X_train,X_test,y_train,y_test,params):
    """Train an XGBoost booster with early stopping and return it.

    ``X_train``/``y_train`` are used for fitting. ``X_test``/``y_test`` form
    the single evaluation set (named ``'test'``) that drives early stopping:
    training runs for at most 5000 rounds and stops once the evaluation
    metric has not improved for 10 consecutive rounds. Both matrices are
    built with ``enable_categorical=True``.
    """
    dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
    dvalid = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)
    watchlist = [(dvalid, 'test')]
    return xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=5000,
        early_stopping_rounds=10,
        evals=watchlist,
    )

# Train the booster; the held-out split doubles as the early-stopping evaluation set
model = XGBmodel(X_train,X_test,y_train,y_test,params)

In [None]:
# Keep the timestamp column as read from the CSV; it is written into the submission later
time = test.datetime

In [None]:
# Apply to the test split the same steps that were applied to `data`
test['datetime'] = pd.to_datetime(test['datetime'])
test = time_features(test)
# Every test windspeed value is replaced by the forest's prediction
test['windspeed'] = rf_wind.predict(test[wind_col])
# NOTE(review): categories are inferred from the values present in `test` alone; if they
# differ from the values seen in training, the integer category codes may not line up — confirm
for fea in category_feature:
    test[fea] = test[fea].astype('category')
test = test.drop(['datetime','atemp'],axis=1)

In [None]:
# Predictions are on the model's training scale, i.e. log1p(count)
prediction = model.predict(xgb.DMatrix(test,enable_categorical=True))

In [None]:
# The target was fitted as log1p(count), so the inverse is expm1;
# a plain exp would overstate every prediction by exactly 1
prediction = np.expm1(prediction)

In [None]:
# Sample submission file, used as the template for the output CSV
sub = pd.read_csv('../input/bike-sharing-demand/sampleSubmission.csv')

In [None]:
# Use item assignment for both columns. `sub.count = ...` would only shadow the
# DataFrame.count method with an instance attribute and leave the 'count' column
# (and therefore the written CSV) at the sample file's values.
sub['datetime'] = time
sub['count'] = prediction

In [None]:
# Write the submission without the index column
sub.to_csv('xgb_bike_sharing_inv_log.csv',index=False)

In [None]:
# Side-by-side distributions of the training target and the test predictions, both on the count scale
fig,(ax1,ax2)= plt.subplots(ncols=2)
fig.set_size_inches(12,5)
# y_train holds log1p(count); expm1 is its exact inverse.
# histplot(kde=True, stat='density') is the replacement for the deprecated distplot.
sns.histplot(x=np.expm1(y_train),ax=ax1,bins=50,kde=True,stat='density')
sns.histplot(x=prediction,ax=ax2,bins=50,kde=True,stat='density')
# Titles instead of `label=`, which is never displayed without a legend
ax1.set(title='Train')
ax2.set(title='pred');

In [None]:
# Number of predictions that are at most 1
(prediction <= 1).sum()

In [None]:
# Kernel density estimate of the predicted values
sns.kdeplot(x=prediction)

In [None]:
# Display the prediction array
prediction

In [None]:
# Displays the predictions floored at 0; the result is not assigned, so `prediction` itself is unchanged
[max(0, x) for x in prediction]