In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import warnings
# Silence every warning for the remainder of the session.
warnings.filterwarnings("ignore")

import os
# Print the full path of each file found anywhere below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 数据预处理

### 导入训练集查看缺失值

In [None]:
# Parse the "datetime" column into timestamps while reading, so that the
# .dt accessors used later work without a separate conversion step.
train = pd.read_csv(
    "/kaggle/input/bike-sharing-demand/train.csv",
    parse_dates=["datetime"],
)
train.info()

未发现缺失值可直接使用

### 导入测试集查看缺失值

In [None]:
# Load the test set the same way as the training set: "datetime" is parsed
# into timestamps at read time.
test = pd.read_csv(
    "/kaggle/input/bike-sharing-demand/test.csv",
    parse_dates=["datetime"],
)
test.info()

未发现缺失值可直接使用

### 检查是否有误差较大的值

In [None]:
# Inspect the density distribution of the rental count.
# sns.distplot is deprecated; histplot with kde=True, stat="density" and
# kde_kws={"cut": 3} is the replacement documented by seaborn for the same
# histogram-plus-KDE view.
figure, ax = plt.subplots(figsize=(6, 5))
sns.histplot(train['count'], kde=True, stat="density", kde_kws={"cut": 3}, ax=ax)
# Set labels after plotting so they are not overwritten by the plotting call.
ax.set(xlabel='count', title='Count Distribution');

可以看出数据分布明显右偏，存在少量数值极大的异常值，因此需要删除这部分数据。

In [None]:
# Keep only rows whose count lies within 3 standard deviations of the mean.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not operate on a slice of the original (SettingWithCopyWarning).
# Note: this reassigns `train` from itself, so re-running the cell trims again
# using the mean/std of the already-filtered data.
count_deviation = (train['count'] - train['count'].mean()).abs()
train = train[count_deviation <= 3 * train['count'].std()].copy()
train.shape

### 将datetime数据分开

In [None]:
# Break "datetime" into year, month, day, hour and day-of-week columns,
# added to `train` in that order.
train_dt = train["datetime"].dt
for part in ("year", "month", "day", "hour", "dayofweek"):
    train[part] = getattr(train_dt, part)

### 打印热力图，观察数据之间的相关性

In [None]:
# Pearson correlation heatmap of the training features and targets.
# `train` still contains the datetime64 "datetime" column here; restrict the
# correlation to numeric columns explicitly, because pandas >= 2.0 no longer
# drops non-numeric columns silently in DataFrame.corr.
fig, ax = plt.subplots(figsize=(15, 15))
corr_matrix = train.select_dtypes(include="number").corr(method="pearson")
sns.heatmap(
    corr_matrix,
    vmin=-1, vmax=1,
    cmap='coolwarm',
    annot=True,
    square=True,
    ax=ax,
);

将day转换为dayofweek，从30个类型转化为7个类型。\
month与season的相关性较高、含义大致相同，可以用season来代替month，从而将month的12个类型化简为4个。\
关于是分别预测casual和registered、还是直接预测count的取舍：\
从上图可以看出，不同特征对casual和registered的影响不同，因此可推测分别预测casual和registered\
再合成结果，要比直接预测count的拟合效果更好，因此本次主要使用casual和registered来表示预测结果。

### test中的datetime数据仅需分出year，hour，dayofweek即可

In [None]:
# Only year, hour and day-of-week are derived for the test set.
test_dt = test["datetime"].dt
for part in ("year", "hour", "dayofweek"):
    test[part] = getattr(test_dt, part)

### 对数据取对数，进一步缩小差异

In [None]:
# log1p-transform the three target columns; predictions are mapped back
# with expm1 when the submission is built.
y_count, y_casual, y_registered = (
    np.log1p(train[target]) for target in ("count", "casual", "registered")
)

### 删除训练集和测试集中重复和不需要的参数

In [None]:
# Remove the raw timestamp, the unused month/day columns and the three target
# columns from the training features; the test set only needs the raw timestamp
# removed. `columns=` is used because passing axis positionally
# (drop(labels, 1)) is no longer accepted by pandas >= 2.0.
train = train.drop(columns=["datetime", "month", "day", "casual", "registered", "count"])
test = test.drop(columns=["datetime"])

## 训练和预测模型

### 使用LGBM算法预测模型

In [None]:
from lightgbm import LGBMRegressor

model = LGBMRegressor()

# Refit the same estimator on each log-transformed target in turn
# (count, casual, registered) and predict on the test features after each fit.
# fit() returns the estimator itself, so predict() can be chained onto it.
pred, pred1, pred2 = [
    model.fit(train, target).predict(test)
    for target in (y_count, y_casual, y_registered)
]

### 结果保存到文件

In [None]:
# Blend weights: the sum of the casual and registered models dominates, with a
# small contribution from the direct count model.
SPLIT_MODEL_WEIGHT = 0.95
COUNT_MODEL_WEIGHT = 0.05

sub = pd.read_csv("/kaggle/input/bike-sharing-demand/sampleSubmission.csv")

# Undo the log1p transform applied to the targets before combining.
split_estimate = np.expm1(pred1) + np.expm1(pred2)
count_estimate = np.expm1(pred)
blended = SPLIT_MODEL_WEIGHT * split_estimate + COUNT_MODEL_WEIGHT * count_estimate

# A regressor can output slightly negative log-values, which expm1 maps to a
# negative count; rental counts cannot be negative, so floor the result at 0.
sub["count"] = np.clip(blended, 0, None)
sub.to_csv("result.csv", index=False)