In [None]:
import numpy as np
import pandas as pd

# Load the Kaggle bike-sharing-demand train and test sets. The raw frames are
# kept untouched; all further work happens on copies.
original_dataset = pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv')
train = original_dataset.copy()

original_dataset_test = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv')
test = original_dataset_test.copy()

# Stack train and test so both go through the same preprocessing.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Columns present in only one of the frames are NaN for the other frame's
# rows; the later train/test split relies on `count` being null for test rows.
df = pd.concat([train, test], ignore_index=True)

In [None]:
# Preview the first rows of the combined train + test frame.
df.head()

In [None]:
# Show column dtypes and non-null counts of the combined frame.
df.info()

In [None]:
import calendar
from datetime import datetime
# Split `datetime` into year, month, date (day of month), hour and weekday.
# pd.to_datetime is used because casting to the unit-less 'datetime64' dtype
# is rejected by pandas 2.x; it also accepts an already-converted column, so
# this cell can be re-run safely.
df['datetime'] = pd.to_datetime(df['datetime'])
df['year'] = df['datetime'].dt.year
df['month'] = df['datetime'].dt.month
df['date'] = df['datetime'].dt.day  # day of month, despite the column name
df['hour'] = df['datetime'].dt.hour

# Encode the weekday as a number (Monday=1 ... Sunday=7).
weekday_dict = {"Monday": 1, "Tuesday": 2, "Wednesday": 3, "Thursday": 4,
                "Friday": 5, "Saturday": 6, "Sunday": 7}
df['weekday'] = df['datetime'].dt.day_name().map(weekday_dict)

# Keep only the modelling columns, in a fixed order. The explicit .copy()
# makes the result an independent frame, so the column assignments below
# cannot trigger SettingWithCopyWarning.
df = df[['datetime', 'year', 'month', 'date', 'hour', 'weekday', 'season', 'holiday',
         'workingday', 'weather', 'temp', 'atemp',
         'humidity', 'windspeed', 'casual', 'registered', 'count']].copy()

# Store the categorical / discrete columns uniformly as int64.
categoryList = ['season', 'holiday', 'workingday', 'weather', 'month', 'hour']
for i in categoryList:
    df[i] = df[i].astype("int64")

# Check the data again.
df.head()


In [None]:
# Key step: the earlier model used the raw data as-is, where windspeed has a
# large number of zero readings, and that hurt its performance.
# Following an approach from other solutions, the zero readings are treated as
# missing and predicted from the other features; this performed better than
# the original model.

from sklearn.ensemble import RandomForestClassifier

# .copy() so the windspeed assignment below writes to an independent frame
# rather than to a slice of df (avoids SettingWithCopyWarning).
wind_0 = df[df['windspeed'] == 0].copy()
wind_not0 = df[df['windspeed'] != 0]
y_label = wind_not0['windspeed']

# Assumption: wind speed is related to both weather and time.
# random_state is fixed so the imputed values are reproducible across runs.
model = RandomForestClassifier(random_state=42)
windcolunms = ['season', 'weather', 'temp', 'atemp', 'humidity', 'hour', 'month']
# The classifier needs discrete labels, so the observed speeds are truncated
# to integers; the imputed values are therefore whole numbers.
model.fit(wind_not0[windcolunms], y_label.astype('int'))

# Fill the zero readings with the predictions. The guard keeps the cell
# runnable when no zero rows are left (e.g. when re-run after df has been
# replaced by the imputed frame).
if not wind_0.empty:
    pred_y = model.predict(wind_0[windcolunms])
    wind_0['windspeed'] = pred_y

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# reset_index() keeps the old row labels in an 'index' column, exactly as the
# previous in-place call did; that column is dropped in the next cell.
df_rfw = pd.concat([wind_not0, wind_0]).reset_index()
df_rfw.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Remove the helper 'index' column left behind by reset_index().
# errors='ignore' keeps this cell re-runnable once the column is already gone.
df_rfw = df_rfw.drop(columns='index', errors='ignore')
# Inspect the windspeed distribution after imputation.
f, ax = plt.subplots(figsize=(8, 5))
# histplot with a KDE overlay on a density scale is the replacement for the
# deprecated sns.distplot.
sns.histplot(df_rfw['windspeed'], kde=True, stat='density', ax=ax)
ax.set_title('Distribution of handled windspeed')


In [None]:
# Summary statistics of df.
# NOTE(review): df is only replaced by the imputed frame (df_rfw) in a later
# cell, so this still describes the data before windspeed imputation — confirm
# that is intended.
df.describe()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# From here on, work on a copy of the windspeed-imputed frame.
df = df_rfw.copy()

In [None]:
# Exploratory data analysis: box plots of `count` against each discrete feature.

import matplotlib.pyplot as plt
import seaborn as sns

fig, axes = plt.subplots(5, 2)
fig.set_size_inches(10, 8)

# One box plot per feature, filled into the 5x2 grid row by row.
# The trailing comments record what the author observed in each panel.
boxplot_features = [
    'season',      # more rentals in summer and autumn
    'holiday',     # roughly the same
    'workingday',  # roughly the same
    'weather',     # the worse the weather, the fewer rentals
    'year',        # more rentals in 2012
    'month',       # like season: more rentals in summer and autumn
    'date',        # roughly the same
    'hour',        # more during the day, concentrated in the morning/evening rush hours
    'weekday',     # roughly the same
]
for feature, panel in zip(boxplot_features, axes.flat):
    sns.boxplot(data=df, y='count', x=feature, orient='v', ax=panel)

# Last panel: overall distribution — fairly concentrated, with some outliers.
sns.boxplot(y="count", data=df, orient="v", ax=axes[4][1])

# Five stacked rows in an 8-inch-tall figure are cramped; tight_layout
# reduces overlap between the panels' labels.
fig.tight_layout()


In [None]:
fig, axes = plt.subplots(2, 2)
fig.set_size_inches(10, 8)

# Scatter plot with a fitted regression line of `count` against each
# continuous feature, filled into the 2x2 grid row by row.
# Author's observations:
#   temp      - the higher the temperature, the more rentals
#   atemp     - (feels-like temperature) similar to temp
#   humidity  - the higher the humidity, the fewer rentals
#   windspeed - the higher the wind speed, the more rentals
continuous_features = ['temp', 'atemp', 'humidity', 'windspeed']
for feature, panel in zip(continuous_features, axes.ravel()):
    sns.regplot(x=feature, y='count', data=df, ax=panel)

In [None]:
# Correlation matrix over the numeric columns only. The frame still contains
# the `datetime` column, which recent pandas versions no longer drop silently
# inside corr(), so it is excluded explicitly.
corrMatrix = df.select_dtypes(include='number').corr()

# Visualise. Hide the strict upper triangle (it mirrors the lower one) with an
# explicit boolean mask; the diagonal and lower triangle stay visible.
mask = np.triu(np.ones(corrMatrix.shape, dtype=bool), k=1)
fig, ax = plt.subplots()
fig.set_size_inches(20, 10)
sns.heatmap(corrMatrix, mask=mask, vmax=0.8, square=True, annot=True, ax=ax)

# Reading the last row (`count`): year, month, hour, season, weather, temp,
# atemp, humidity and windspeed correlate comparatively strongly with the
# number of rentals.

#看最后一行，year、month、hour、season、weather、temp、atemp、humidity，windspeed和使用次数相关较高。

In [None]:
# One-hot encode the categorical columns. The original columns stay in df for
# now; the indicator columns are appended to the right.
season_dummy, weather_dummy, month_dummy = (
    pd.get_dummies(df[column], prefix=column)
    for column in ('season', 'weather', 'month')
)
encoded_parts = [df, season_dummy, weather_dummy, month_dummy]
df = pd.concat(encoded_parts, axis=1)


In [None]:
# Preprocessing is finished: separate the rows again. Training rows are the
# ones with a known `count`, test rows the ones where it is missing; both are
# put in chronological order.
has_target = df['count'].notnull()
df_train = df[has_target].sort_values('datetime')
df_test = df[~has_target].sort_values('datetime')

# Target vector for the training rows.
count = df_train['count']

# Drop the originals of the one-hot encoded columns, the target-related
# columns and the timestamp. atemp is too strongly correlated with temp, so
# only temp is kept.
drop_columns = ['datetime', 'casual', 'registered', 'count', 'atemp', 'season', 'weather', 'month']
df_train = df_train.drop(columns=drop_columns)
df_test = df_test.drop(columns=drop_columns)

df_train.head()


In [None]:
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the labelled rows as a validation set.
x_train, x_test, y_train, y_test = train_test_split(df_train, count, test_size=.2, random_state=2)

# Feature scaling (standardisation); the scaler is fitted on the training
# split only and then applied to the validation split.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# Hyper-parameter search: validation MAE for each candidate number of trees.
# The grid is named once so the progress message is derived from it instead of
# a hard-coded upper bound.
n_estimators_grid = range(50, 1050, 50)
MAE_list = []
print('The program is finding the best number for trees in the model. Please wait.')
for step, i in enumerate(n_estimators_grid, start=1):
    # n_jobs=-1 trains the trees on all available cores; random_state stays
    # fixed so the runs remain reproducible.
    regressor = RandomForestRegressor(n_estimators=i, random_state=42, n_jobs=-1)
    regressor.fit(x_train, y_train)
    y_predicted = regressor.predict(x_test)
    MAE_list.append(metrics.mean_absolute_error(y_test, y_predicted))
    print("({}% of the program completed)".format(100 * step // len(n_estimators_grid)))

In [None]:
# Plot the MAE recorded for each forest size to choose a suitable number of
# trees (MAE_list holds the scores measured on the held-out split).
tree_counts = range(50, 1050, 50)
mae_fig, mae_ax = plt.subplots(figsize=(10, 6))
mae_ax.plot(
    tree_counts,
    MAE_list,
    color='blue',
    linestyle='dashed',
    marker='o',
    markerfacecolor='red',
    markersize=10,
)
mae_ax.set_title('MAE vs. number of estimators(trees)')
mae_ax.set_xlabel('Trees')
mae_ax.set_ylabel('MAE')

In [None]:
# Final model: refit on all labelled rows and predict the Kaggle test rows.
from sklearn.ensemble import RandomForestRegressor

# Refit the scaler on the full training frame. The scaled arrays get their own
# names so df_train / df_test remain DataFrames and are not silently replaced
# by numpy arrays when this cell runs.
sc = StandardScaler()
train_scaled = sc.fit_transform(df_train)
test_scaled = sc.transform(df_test)

# NOTE(review): n_estimators=350 is presumably the value picked from the MAE
# curve of the search above — confirm against that plot.
# n_jobs=-1 trains the trees on all available cores.
model = RandomForestRegressor(n_estimators=350, random_state=42, n_jobs=-1)
model.fit(train_scaled, count)
pred = model.predict(test_scaled)

In [None]:
# Save the result: put the predictions into the `count` column of the sample
# submission template and write it out without the row index.
submission_template = pd.read_csv('/kaggle/input/bike-sharing-demand/sampleSubmission.csv')
submission = submission_template.assign(count=pred)

submission.to_csv('atlast.csv', index=False)