In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training and test sets.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/bike-sharing-demand/train.csv",
        "/kaggle/input/bike-sharing-demand/test.csv",
    )
)

# Print dtypes and non-null counts for both frames.
for frame in (train, test):
    frame.info()
# Judging from the info() output, neither frame contains null values.

In [None]:
# Column names of the raw training data, grouped by kind:
# numeric columns first, then the single non-numeric (timestamp) column.
numerical_list = [
    'season',
    'holiday',
    'workingday',
    'weather',
    'temp',
    'atemp',
    'humidity',
    'windspeed',
    'casual',
    'registered',
    'count',
]
object_list = ["datetime"]

In [None]:
# Date and time are expected to affect rentals, so year, month, day, hour,
# quarter, day of year, day of week and week of year are later derived as features.
# That adds 8 features to the original 8, for 16 in total.

# train and test are processed together so both receive the same derived features.
combine_list = [train, test]
for frame in combine_list:
    # Parse the raw timestamp strings into datetime values, in place.
    frame["datetime"] = pd.to_datetime(frame["datetime"])
   

In [None]:
import calendar

# Derive calendar features from the parsed "datetime" column of every frame in
# combine_list. The vectorised .dt accessor replaces eight per-row Python loops
# over Timestamp objects; each result is cast to int64 so the new columns keep
# the dtype that the plain-int list comprehensions produced.
for combine in combine_list:
    stamps = combine["datetime"].dt
    combine["month"] = stamps.month.astype("int64")
    combine["year"] = stamps.year.astype("int64")
    combine["day"] = stamps.day.astype("int64")
    combine["hour"] = stamps.hour.astype("int64")
    combine["day_of_year"] = stamps.dayofyear.astype("int64")
    combine["dayofweek"] = stamps.dayofweek.astype("int64")
    combine["quarter"] = stamps.quarter.astype("int64")
    # ISO week number; isocalendar() returns an unsigned nullable dtype, hence the cast.
    combine["weekofyear"] = stamps.isocalendar().week.astype("int64")

In [None]:
# Windspeed has a suspiciously large number of zero readings (treated as gaps),
# so they are later filled with a random forest (approach borrowed from user SKOG-ART).
# Split the training rows by whether the windspeed reading is zero.
is_zero_wind = train['windspeed'] == 0
wind_0 = train[is_zero_wind]
wind_not0 = train[~is_zero_wind]
# Training target for the imputation model: the known non-zero windspeeds.
y_label = wind_not0['windspeed']

In [None]:
# Working assumption: windspeed depends on the weather and on the time of the observation.
from sklearn.ensemble import RandomForestClassifier
# Fixed seed so the imputed windspeeds are reproducible across runs.
model = RandomForestClassifier(random_state=42)
windcolunms = ['season', 'weather', 'temp', 'atemp', 'humidity', 'hour', 'month']
# A classifier needs discrete labels, so the non-zero windspeeds are truncated to integers.
model.fit(wind_not0[windcolunms], y_label.astype('int'))
pred_y = model.predict(wind_0[windcolunms])
# Fill the zero readings with the predictions. assign() builds a new frame, so
# nothing is written into a slice of train (avoids SettingWithCopyWarning).
wind_0 = wind_0.assign(windspeed=pred_y)
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
train_p = pd.concat([wind_not0, wind_0])
# Moves the old row labels into an "index" column and renumbers the rows.
train_p.reset_index(inplace=True)
# NOTE(review): only train is imputed here; zero windspeeds in test are left as-is.
train_p.head()

In [None]:
# Move "datetime" into the index so it is not fed to the model as a feature
# (its information has already been split into separate columns).
combine_list = [train_p, test]
for frame in combine_list:
    frame.set_index("datetime", inplace=True)

In [None]:
# Visualisation: box plots of the rental count overall and by season, hour and working day.
import matplotlib.pyplot as plt
import seaborn as sns

fig, axis = plt.subplots(2, 2)
fig.set_size_inches(12, 10)

# One box plot per panel in row-major order; None means no grouping column.
group_columns = (None, "season", "hour", "workingday")
for panel_ax, group_col in zip(axis.ravel(), group_columns):
    sns.boxplot(data=train, y="count", x=group_col, orient="v", ax=panel_ax)

# English labels are used because the font has no glyphs for Chinese text,
# which would otherwise render as empty boxes.
axis[0][0].set(ylabel="count", title="box plot on count")  # overall rental count
axis[0][1].set(xlabel="season", ylabel="count", title="box plot on count across season")  # by season
axis[1][0].set(xlabel="hours", ylabel="count", title="box plot on count across hours")  # by hour of day
axis[1][1].set(xlabel="working day", ylabel="count", title="box plot on count across working day")  # by working day flag

In [None]:
# Correlation analysis: correlation of the 12 features not shown in the box
# plots (plus the rental count itself) with each other.
corrmat = train[["temp","atemp","humidity","windspeed","month","year","day","hour","day_of_year","dayofweek","quarter","weekofyear","count"]].corr()
# Explicit boolean mask that hides the strict upper triangle. It is built from
# the matrix shape rather than from the correlation values, so a coefficient
# that happens to be exactly 0 cannot slip through the mask.
m = np.triu(np.ones(corrmat.shape, dtype=bool), k=1)
fig, ax = plt.subplots()
fig.set_size_inches(20, 10)
# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(corrmat, mask=m, vmax=0.8, square=True, annot=True, ax=ax)

In [None]:
# Data preparation
from sklearn.model_selection import train_test_split
# Per the correlation analysis, temp and atemp are strongly correlated, so they
# are merged into a single feature: their mean.
train_p["tempp"] = (train_p['temp']+train_p['atemp']) / 2
# Drop the columns that are not used as features; "index" is the column left
# behind by the reset_index call after the windspeed imputation.
train_p.drop(["casual", "registered", 'temp', 'atemp', 'index'], axis=1, inplace=True)
y = train_p["count"]
x = train_p.drop(["count"], axis=1)
# Hold out 20% of the training data to evaluate the model. The seed is fixed so
# the split, and therefore the reported score, is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# 2000 trees are slow to fit on one core, so n_jobs=-1 spreads the work over all
# available cores. random_state stays fixed, so the forest is seeded the same way.
model = RandomForestRegressor(n_estimators=2000, random_state=42, n_jobs=-1)
model.fit(x_train, y_train)
# Predictions on the held-out split.
pred=model.predict(x_test)

In [None]:
# Score the model on the held-out split; for a scikit-learn regressor, score() reports R^2.
# NOTE(review): the competition is presumably scored with a different metric (RMSLE) — confirm before relying on this number.
model.score(x_test, y_test)

In [None]:
# Display the training feature matrix to check which columns are fed to the model.
x_train

In [None]:
# Use the trained model to predict the rental count for the test data.
# Apply the same temperature merge that was applied to the training features.
test['tempp'] = (test['temp'] + test['atemp']) / 2
test.drop(['temp', 'atemp'], axis=1, inplace=True)
# Select the columns by name in the training order, so a missing or reordered
# feature fails loudly instead of silently mis-aligning the inputs.
y_pred = model.predict(test[x_train.columns])

In [None]:
# Display the predicted rental counts for the test set.
y_pred

In [None]:
# Store the predictions, replacing any value at or below zero with 0
# (such predictions are treated as invalid).
test["count"] = np.where(y_pred <= 0, 0, y_pred)
# Bring "datetime" back from the index so it can be written as a column.
test.reset_index(inplace=True)
submission_columns = ['datetime', 'count']
test[submission_columns].to_csv('submission.csv', index=False)

In [None]:
# Display the test frame, which now includes the predicted "count" column.
test