# 1. 首先导入所用的库

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import missingno as msno


# 2. 查看数据集

In [None]:
# Load the training split of the Kaggle bike-sharing-demand data,
# then show its (rows, columns) shape and a preview of the first rows.
bike_df = pd.read_csv(
    '/kaggle/input/bike-sharing-demand/train.csv'
)
print(bike_df.shape)
bike_df.head()

In [None]:
# Load the test split of the same data set,
# then show its (rows, columns) shape and a preview of the first rows.
bike_ts = pd.read_csv(
    '/kaggle/input/bike-sharing-demand/test.csv'
)
print(bike_ts.shape)
bike_ts.head()

In [None]:
# Print the training frame's summary (columns, non-null counts, dtypes).
bike_df.info()

In [None]:
msno.matrix(bike_df) # Check whether the training data has missing values

In [None]:
# Count plot: number of training records per season.
sns.catplot(
    data=bike_df,
    x='season',
    kind='count',
    height=5,
    aspect=1.5,
)

In [None]:
# Count plot: number of training records per value of the holiday flag.
sns.catplot(
    data=bike_df,
    x='holiday',
    kind='count',
    height=5,
    aspect=1,
)

In [None]:
# Count plot: number of training records per value of the workingday flag.
sns.catplot(
    data=bike_df,
    x='workingday',
    kind='count',
    height=5,
    aspect=1,
)

In [None]:
# Count plot: number of training records per weather category.
# Weather codes:
#   1: Clear, Few clouds, Partly cloudy, Partly cloudy
#   2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
#   3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
#   4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog
sns.catplot(
    data=bike_df,
    x='weather',
    kind='count',
    height=5,
    aspect=1.5,
)

In [None]:
# Correlation matrix, drawn as a lower-triangle heatmap.
# Only numeric columns are correlated: at this point `datetime` still holds
# raw strings, and pandas >= 2.0 raises on non-numeric columns instead of
# silently skipping them.
cor_mat = bike_df.select_dtypes(include='number').corr()
# Boolean mask (True = hide the cell): hide everything strictly above the
# diagonal. Building it from booleans rather than from the correlation values
# means a correlation of exactly 0 is hidden too.
mask = np.triu(np.ones(cor_mat.shape, dtype=bool), k=1)
fig = plt.gcf()
fig.set_size_inches(30, 12)
sns.heatmap(data=cor_mat, mask=mask, square=True, annot=True, cbar=True)

# 3. 预处理数据集

## 3.1. 将时间特征datetime拆分为年、月、日、时4个特征

In [None]:
# Parse the timestamp column in one vectorised call rather than once per row.
bike_df['datetime'] = pd.to_datetime(bike_df['datetime'])

# Derive the calendar features through the .dt accessor (vectorised) instead
# of a Python-level lambda per element.
bike_df['year'] = bike_df['datetime'].dt.year
bike_df['month'] = bike_df['datetime'].dt.month
bike_df['day'] = bike_df['datetime'].dt.day
bike_df['hour'] = bike_df['datetime'].dt.hour

bike_df.head(5)

In [None]:
# Parse the timestamp column in one vectorised call rather than once per row.
bike_ts['datetime'] = pd.to_datetime(bike_ts['datetime'])

# Derive the calendar features through the .dt accessor (vectorised) instead
# of a Python-level lambda per element.
bike_ts['year'] = bike_ts['datetime'].dt.year
bike_ts['month'] = bike_ts['datetime'].dt.month
bike_ts['day'] = bike_ts['datetime'].dt.day
bike_ts['hour'] = bike_ts['datetime'].dt.hour

bike_ts.head(5)

## 3.2. 删除datetime, casual, registered特征

In [None]:
# Only the total count under given conditions is of interest, so the
# casual/registered breakdown is removed along with the datetime column.
drop_columns = ['datetime', 'casual', 'registered']
bike_df.drop(columns=drop_columns, inplace=True)
bike_df.head(5)

In [None]:
# Remove the datetime column from the test frame in place.
bike_ts.drop(columns='datetime', inplace=True)
bike_ts.head(5)

# 4. 训练模型及测试

## 4.1. 定义误差函数

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Root Mean Squared Log Error
def rmsle(y, pred):
    """Return the root mean squared logarithmic error of `pred` against `y`.

    Both inputs are converted to float arrays and compared element by element
    (positionally). Negative predictions are clipped to 0 first: log1p is
    undefined at or below -1, so a single such prediction would otherwise turn
    the whole metric into NaN/inf.

    Args:
        y: array-like of true values.
        pred: array-like of predicted values, same length as `y`.

    Returns:
        The RMSLE as a float.
    """
    actual = np.asarray(y, dtype=float)
    predicted = np.clip(np.asarray(pred, dtype=float), 0, None)
    log_diff = np.log1p(actual) - np.log1p(predicted)
    return np.sqrt(np.mean(log_diff ** 2))
    
def rmse(y, pred):
    """Return the square root of the mean squared error of `pred` against `y`."""
    mse = mean_squared_error(y, pred)
    return np.sqrt(mse)

def evaluate_regr(y, pred):
    """Print RMSLE, RMSE and MAE of `pred` against `y` on one line.

    Returns None; the metrics are only printed.
    """
    metrics = (
        rmsle(y, pred),
        rmse(y, pred),
        mean_absolute_error(y, pred),
    )
    report = 'RMSLE: {0:.3f}, RMSE: {1:.3F}, MAE: {2:.3F}'.format(*metrics)
    print(report)

## 4.2. 尝试使用线性回归

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# The target is the `count` column; all remaining columns are features.
y_target = bike_df['count']
X_features = bike_df.drop(columns=['count'])
# log1p-transformed copy of the target (not used by this cell's model).
y_target_log = np.log1p(y_target)

# 70/30 hold-out split on the untransformed target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    y_target,
    test_size=0.3,
    random_state=0,
)

# Baseline: ordinary least squares on the raw features.
lr_reg = LinearRegression()
lr_reg.fit(X_train, y_train)
pred = lr_reg.predict(X_test)

evaluate_regr(y_test, pred)

In [None]:
# Use the sample submission file as the template for the output frame.
result1 = pd.read_csv(
    '/kaggle/input/bike-sharing-demand/sampleSubmission.csv'
)
result1.head(5)

In [None]:
# Fill the submission's `count` column with the linear model's predictions
# for the test features.
result1['count'] = lr_reg.predict(bike_ts)

In [None]:
# The raw linear-regression output contains negative counts, which cannot be
# scored; it is saved as-is for reference.
result1.to_csv('/kaggle/working/result1.csv', index=False)
# Zero out the negatives with a single .loc assignment. The chained form
# result1[mask]['count'] = 0 assigns into a temporary copy and leaves
# result1 unchanged.
result1.loc[result1['count'] < 0, 'count'] = 0
# Score noted in the original notebook: 3.11841 (recorded before this fix,
# so it may differ now that negatives are really zeroed).
result1.to_csv('/kaggle/working/lr_reg.csv', index=False)

In [None]:
def get_top_error_data(y_test, pred, n_tops=5):
    """Print the `n_tops` rows with the largest absolute prediction error.

    Builds a frame with the real values (`y_test.values`), the predictions
    rounded with np.round, and their absolute difference, then prints the
    first `n_tops` rows after sorting by that difference in descending order.

    Returns None; the table is only printed.
    """
    errors = pd.DataFrame({'real_count': y_test.values})
    errors['predicted_count'] = np.round(pred)
    gap = errors['real_count'] - errors['predicted_count']
    errors['diff'] = gap.abs()
    worst_first = errors.sort_values('diff', ascending=False)
    print(worst_first.iloc[:n_tops])

# Print the five rows with the largest absolute error between y_test and pred.
get_top_error_data(y_test,pred,n_tops=5)

In [None]:
# Label each fitted coefficient with its feature name, sort from largest to
# smallest, and draw the result as a horizontal bar chart.
coef = pd.Series(data=lr_reg.coef_, index=X_features.columns)
coef_sort = coef.sort_values(ascending=False)
sns.barplot(
    x=coef_sort.values,
    y=coef_sort.index,
)

## 4.3. 尝试使用独热编码

In [None]:
# One-hot encode the calendar and categorical columns of the training features.
X_features_ohe = pd.get_dummies(
    X_features,
    columns=[
        'year', 'month', 'day', 'hour',
        'holiday', 'workingday', 'season', 'weather',
    ],
)
print(X_features_ohe.shape)
X_features_ohe.head(5)

In [None]:
# One-hot encode the same columns of the test features.
bike_ts_ohe = pd.get_dummies(
    bike_ts,
    columns=[
        'year', 'month', 'day', 'hour',
        'holiday', 'workingday', 'season', 'weather',
    ],
)
bike_ts_ohe.head(5)

In [None]:
# Union of the one-hot column names of the train and test frames.
a = list(X_features_ohe.columns)
b = list(bike_ts_ohe.columns)
a.extend(b)
# dict.fromkeys removes duplicates while keeping first-seen order. A set would
# give a different column order from run to run (string hashing is randomised
# per process), making the resulting feature matrices non-reproducible.
ind = list(dict.fromkeys(a))

In [None]:
# Build zero-filled frames over the full column union, then copy in the
# columns each encoded frame actually has; the rest stay 0.
ptrain = pd.DataFrame(
    data=np.zeros((X_features_ohe.shape[0], len(ind))),
    columns=ind,
)
ptest = pd.DataFrame(
    data=np.zeros((bike_ts_ohe.shape[0], len(ind))),
    columns=ind,
)
ptrain[X_features_ohe.columns] = X_features_ohe
ptest[bike_ts_ohe.columns] = bike_ts_ohe
print(ptrain.shape)
print(ptest.shape)
ptrain.head(5)

In [None]:
# 70/30 hold-out split of the aligned one-hot features against the
# log1p-transformed target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    ptrain,
    y_target_log,
    test_size=0.3,
    random_state=0,
)

def get_model_predict(model,X_train,X_test,y_train,y_test,is_expm1=False):
    """Fit `model`, predict on `X_test`, and print its evaluation metrics.

    Args:
        model: estimator exposing `fit(X, y)` and `predict(X)`.
        X_train, y_train: data the model is fitted on.
        X_test, y_test: data the model is evaluated on.
        is_expm1: when True, both `y_test` and the predictions are passed
            through np.expm1 before scoring, so a model trained on a
            log1p-transformed target is evaluated on the original scale.
            Previously this flag was accepted but ignored.

    Returns None; the model is fitted in place and the metrics are printed.
    """
    model.fit(X_train,y_train)
    pred = model.predict(X_test)
    if is_expm1:
        # Undo the log1p transform on both sides of the comparison.
        y_test = np.expm1(y_test)
        pred = np.expm1(pred)
    print("###",model.__class__.__name__,"###")
    evaluate_regr(y_test,pred)
    print("\n")
    
# Plain, L2-regularised and L1-regularised linear models.
lr_reg = LinearRegression()
ridge_reg = Ridge(alpha=10)
lasso_reg = Lasso(alpha=0.01)

# Fit each model and print its evaluation in turn.
for model in (lr_reg, ridge_reg, lasso_reg):
    get_model_predict(
        model,
        X_train, X_test,
        y_train, y_test,
        is_expm1=True,
    )

In [None]:
# LinearRegression submission: predictions are mapped through expm1 before
# being written. Score noted in the original notebook: 0.61926
result2 = pd.read_csv(
    '/kaggle/input/bike-sharing-demand/sampleSubmission.csv'
)
result2['count'] = np.expm1(lr_reg.predict(ptest))
result2.to_csv('/kaggle/working/LinearRegression.csv', index=False)

In [None]:
# Ridge submission: predictions are mapped through expm1 before being written.
# Score noted in the original notebook: 0.61678
result3 = pd.read_csv(
    '/kaggle/input/bike-sharing-demand/sampleSubmission.csv'
)
result3['count'] = np.expm1(ridge_reg.predict(ptest))
result3.to_csv('/kaggle/working/ridge_reg.csv', index=False)

In [None]:
# Lasso submission: predictions are mapped through expm1 before being written.
# Score noted in the original notebook: 0.64221
result4 = pd.read_csv(
    '/kaggle/input/bike-sharing-demand/sampleSubmission.csv'
)
result4['count'] = np.expm1(lasso_reg.predict(ptest))
result4.to_csv('/kaggle/working/lasso_reg.csv', index=False)

## 4.4. 使用较为复杂的回归器模型

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Four ensemble regressors, each with 500 estimators.
rf_reg = RandomForestRegressor(n_estimators=500)
gbm_reg = GradientBoostingRegressor(n_estimators=500)
xgb_reg = XGBRegressor(n_estimators=500)
lgbm_reg = LGBMRegressor(n_estimators=500)

# Fit and evaluate each one on the underlying arrays of the split
# (`.values`) rather than on the pandas objects.
for model in (rf_reg, gbm_reg, xgb_reg, lgbm_reg):
    get_model_predict(
        model,
        X_train.values, X_test.values,
        y_train.values, y_test.values,
        is_expm1=True,
    )

In [None]:
# Random-forest submission: predictions are mapped through expm1 before being
# written. Score noted in the original notebook: 0.43141
result5 = pd.read_csv(
    '/kaggle/input/bike-sharing-demand/sampleSubmission.csv'
)
result5['count'] = np.expm1(rf_reg.predict(ptest))
result5.to_csv('/kaggle/working/rf_reg.csv', index=False)

In [None]:
# Gradient-boosting submission: predictions are mapped through expm1 before
# being written. Score noted in the original notebook: 0.41277
result6 = pd.read_csv(
    '/kaggle/input/bike-sharing-demand/sampleSubmission.csv'
)
result6['count'] = np.expm1(gbm_reg.predict(ptest))
result6.to_csv('/kaggle/working/gbm_reg.csv', index=False)

In [None]:
# XGBoost submission: predictions are mapped through expm1 before being
# written. Score noted in the original notebook: 0.41950
result7 = pd.read_csv(
    '/kaggle/input/bike-sharing-demand/sampleSubmission.csv'
)
result7['count'] = np.expm1(xgb_reg.predict(ptest))
result7.to_csv('/kaggle/working/xgb_reg.csv', index=False)

In [None]:
# LightGBM submission: predictions are mapped through expm1 before being
# written. Score noted in the original notebook: 0.41248
result8 = pd.read_csv(
    '/kaggle/input/bike-sharing-demand/sampleSubmission.csv'
)
result8['count'] = np.expm1(lgbm_reg.predict(ptest))
result8.to_csv('/kaggle/working/lgbm_reg.csv', index=False)

## 4.5. 选择随机森林算法进行参数调整

In [None]:
# Grid-search the number of trees and the per-split feature budget of a
# random forest.
no_of_test = [10, 30, 50, 100, 300, 500, 1000]
params_dict = {
    'n_estimators': no_of_test,
    'n_jobs': [-1],
    # 1.0 (consider all features) replaces the string "auto", which meant the
    # same thing for regressors but was removed in scikit-learn 1.3.
    'max_features': [1.0, 'sqrt', 'log2'],
}
# y_train is already log1p-transformed, so plain MSE on it is the squared log
# error of the counts. 'neg_mean_squared_log_error' would apply the log a
# second time and optimise a different quantity.
clf_rf = GridSearchCV(
    estimator=RandomForestRegressor(),
    param_grid=params_dict,
    scoring='neg_mean_squared_error',
)
clf_rf.fit(X_train, y_train)


In [None]:
# Show the parameter combination the grid search selected.
clf_rf.best_params_

In [None]:
# Fit a 1000-tree random forest, print its evaluation, then write its
# expm1-transformed test predictions as a submission.
rf1_reg = RandomForestRegressor(n_estimators=1000)
get_model_predict(
    rf1_reg,
    X_train.values, X_test.values,
    y_train.values, y_test.values,
    is_expm1=True,
)
result9 = pd.read_csv(
    '/kaggle/input/bike-sharing-demand/sampleSubmission.csv'
)
result9['count'] = np.expm1(rf1_reg.predict(ptest))
# Score noted in the original notebook: 0.43100
result9.to_csv('/kaggle/working/rf1_reg.csv', index=False)