In [97]:
import arrow
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.cm as cm
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from matplotlib.font_manager import FontManager, FontProperties

# 设置全局字体为中文字体
font_path = '/System/Library/Fonts/PingFang.ttc'
font_prop = FontProperties(fname=font_path)
plt.rcParams['font.family'] = font_prop.get_name()

In [109]:
dtype = {
    '最低温度': float,
    '最高温度': float,
    '雨量等级': int,
}
weather_df = pd.read_csv('./data/北京8月天气.csv', dtype=dtype)
weather_df['温度'] = (weather_df['最低温度'] + weather_df['最高温度']) / 2
weather_df

Unnamed: 0,日期,星期,最高温度,最低温度,天气,风向,风力,雨量等级,温度
0,20230801,2,28.0,25.0,中雨~小雨,东南风,2,4,26.5
1,20230802,3,31.0,26.0,多云~晴,西南风,3,0,28.5
2,20230803,4,32.0,26.0,多云~小雨,西南风,2,1,29.0
3,20230804,5,34.0,26.0,多云,东北风,1,0,30.0
4,20230805,6,36.0,25.0,阴~小雨,东北风,2,1,30.5
5,20230806,7,28.0,23.0,阴~多云,南风,2,0,25.5
6,20230807,1,27.0,23.0,阴~小雨,东南风,1,1,25.0
7,20230808,2,31.0,24.0,多云,南风,2,0,27.5
8,20230809,3,33.0,24.0,多云~小雨,东南风,1,1,28.5
9,20230810,4,32.0,23.0,多云~阴,东南风,2,1,27.5


In [155]:
order_df = pd.read_csv('./data/北京市<20230801-20230831>订单.csv')
date_field = 'accepted_at'
order_df['date'] = pd.to_datetime(order_df[date_field])  # 将日期转换为时间格式
order_df = order_df.rename(columns={'shipping_date': 'book_day'})
order_df['day'] = order_df['date'].dt.day
order_df['hour'] = order_df['date'].dt.hour
order_df['weekday'] = order_df['date'].dt.weekday
order_df

Unnamed: 0,book_day,city_code,city_name,store_id,store_name,operator_id,operator_name,order_id,seller_order_id,distance,...,created_at,confirmed_at,accepted_at,arrived_at,pickup_at,done_at,date,day,hour,weekday
0,20230831,110000,北京市,64c89ed2906b82e00415219f,zpf店铺1,64c89f0f906b82e0041521a3,朱鹏飞,64efec9056602be300212b8f,64efec9056602be300212b89,9014,...,2023-08-31T09:27:44.157000+08:00,2023-08-31T09:27:44.165000+08:00,2023-08-31T09:28:00.992000+08:00,2023-08-31T09:28:08.904000+08:00,2023-08-31T09:28:09.381000+08:00,2023-08-31T09:35:15.123000+08:00,2023-08-31 09:28:00.992000+08:00,31,9,3
1,20230831,110000,北京市,64c89ed2906b82e00415219f,zpf店铺1,64c89f0f906b82e0041521a3,朱鹏飞,64efece056602be300212bae,64efece056602be300212ba8,9014,...,2023-08-31T09:29:04.429000+08:00,2023-08-31T09:29:04.436000+08:00,2023-08-31T09:29:14.736000+08:00,2023-08-31T09:36:11.947000+08:00,2023-08-31T09:36:13.541000+08:00,2023-08-31T09:36:18.774000+08:00,2023-08-31 09:29:14.736000+08:00,31,9,3
2,20230831,110000,北京市,6482bb5277bb7a754600bc8e,菊花,64642ffc8ec1773349f7a665,张浩,64efed9b56602be300212bcc,64efed9b56602be300212bc6,2594,...,2023-08-31T09:32:11.921000+08:00,2023-08-31T09:32:11.931000+08:00,2023-08-31T09:32:25.376000+08:00,2023-09-05T15:42:11.064000+08:00,2023-09-05T15:42:12.100000+08:00,2023-09-05T15:42:22.603000+08:00,2023-08-31 09:32:25.376000+08:00,31,9,3
3,20230831,110000,北京市,64c89ed2906b82e00415219f,zpf店铺1,64c89f0f906b82e0041521a3,朱鹏飞,64efee6c56602be300212bf1,64efee6c56602be300212beb,9014,...,2023-08-31T09:35:40.109000+08:00,2023-08-31T09:35:40.119000+08:00,2023-08-31T09:35:40.187000+08:00,2023-08-31T09:36:14.124000+08:00,2023-08-31T09:36:14.544000+08:00,2023-08-31T09:36:25.317000+08:00,2023-08-31 09:35:40.187000+08:00,31,9,3
4,20230831,110000,北京市,64c89ed2906b82e00415219f,zpf店铺1,64c89f0f906b82e0041521a3,朱鹏飞,64efee7f56602be300212c07,64efee7f56602be300212c01,9014,...,2023-08-31T09:35:59.934000+08:00,2023-08-31T09:35:59.940000+08:00,2023-08-31T09:35:59.991000+08:00,2023-08-31T09:36:15.274000+08:00,2023-08-31T09:36:15.639000+08:00,2023-08-31T09:36:30.525000+08:00,2023-08-31 09:35:59.991000+08:00,31,9,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
882,20230801,110000,北京市,649981f552610a12b0b84eeb,小胖子麻辣烫,64c25a625422b52d30f73654,李姝汶,64c8d857addc5156d8b1f080,64c8d857addc5156d8b1f07a,1959,...,2023-08-01T18:03:03.346000+08:00,2023-08-01T18:03:03.354000+08:00,2023-08-01T18:05:06.800000+08:00,2023-08-01T18:09:30.148000+08:00,2023-08-01T18:09:33.305000+08:00,2023-08-01T18:09:34.794000+08:00,2023-08-01 18:05:06.800000+08:00,1,18,1
883,20230801,110000,北京市,64c89ed2906b82e00415219f,zpf店铺1,64c89f0f906b82e0041521a3,朱鹏飞,64c8dd0eaddc5156d8b1f102,64c8dd0eaddc5156d8b1f0fc,476,...,2023-08-01T18:23:10.512000+08:00,2023-08-01T18:23:10.520000+08:00,2023-08-01T18:27:22.031000+08:00,2023-08-01T18:29:07.056000+08:00,2023-08-01T18:29:07.715000+08:00,2023-08-01T18:29:13.114000+08:00,2023-08-01 18:27:22.031000+08:00,1,18,1
884,20230801,110000,北京市,64c89ed2906b82e00415219f,zpf店铺1,64c89f0f906b82e0041521a3,朱鹏飞,64c8de99addc5156d8b1f151,64c8de99addc5156d8b1f14b,9014,...,2023-08-01T18:29:45.489000+08:00,2023-08-01T18:29:45.502000+08:00,2023-08-01T18:30:48.017000+08:00,2023-08-01T18:32:40.844000+08:00,2023-08-01T18:32:42.218000+08:00,2023-08-01T18:32:43.955000+08:00,2023-08-01 18:30:48.017000+08:00,1,18,1
885,20230801,110000,北京市,64c89ed2906b82e00415219f,zpf店铺1,64c89f0f906b82e0041521a3,朱鹏飞,64c8dea8addc5156d8b1f166,64c8dea8addc5156d8b1f160,9014,...,2023-08-01T18:30:00.407000+08:00,2023-08-01T18:30:00.416000+08:00,2023-08-01T18:30:51.411000+08:00,2023-08-01T18:32:39.847000+08:00,2023-08-01T18:32:40.325000+08:00,2023-08-01T18:33:57.072000+08:00,2023-08-01 18:30:51.411000+08:00,1,18,1


In [110]:
# 给日期设置对应的颜色，进行区分
days = weather_df['日期']
colors = cm.tab20(np.linspace(0, 0.95, len(days)))
color_map = {day: color for day, color in zip(days, colors)}
color_map

{20230801: array([0.12156863, 0.46666667, 0.70588235, 1.        ]),
 20230802: array([0.12156863, 0.46666667, 0.70588235, 1.        ]),
 20230803: array([0.68235294, 0.78039216, 0.90980392, 1.        ]),
 20230804: array([0.68235294, 0.78039216, 0.90980392, 1.        ]),
 20230805: array([1.        , 0.49803922, 0.05490196, 1.        ]),
 20230806: array([1.        , 0.73333333, 0.47058824, 1.        ]),
 20230807: array([1.        , 0.73333333, 0.47058824, 1.        ]),
 20230808: array([0.17254902, 0.62745098, 0.17254902, 1.        ]),
 20230809: array([0.59607843, 0.8745098 , 0.54117647, 1.        ]),
 20230810: array([0.59607843, 0.8745098 , 0.54117647, 1.        ]),
 20230811: array([0.83921569, 0.15294118, 0.15686275, 1.        ]),
 20230812: array([0.83921569, 0.15294118, 0.15686275, 1.        ]),
 20230813: array([1.        , 0.59607843, 0.58823529, 1.        ]),
 20230814: array([0.58039216, 0.40392157, 0.74117647, 1.        ]),
 20230815: array([0.58039216, 0.40392157, 0.7411

In [77]:
# 按小时统计订单量
hour_order_df = order_df.groupby(['book_day', 'hour']).size().reset_index(name='order_count')
hour_order_df

Unnamed: 0,book_day,hour,order_count
0,20230801,13,2
1,20230801,14,11
2,20230801,15,1
3,20230801,16,7
4,20230801,17,3
...,...,...,...
109,20230831,9,9
110,20230831,10,1
111,20230831,11,1
112,20230831,14,1


In [78]:
# 按天和时间展示单量
day_hour_order_df = hour_order_df.set_index(['book_day', 'hour'])['order_count']
day_hour_order_df = day_hour_order_df.unstack('book_day')
day_hour_order_df = day_hour_order_df.fillna(0)
day_hour_order_df

book_day,20230801,20230802,20230803,20230804,20230807,20230808,20230809,20230810,20230811,20230814,20230815,20230816,20230817,20230818,20230822,20230823,20230828,20230829,20230830,20230831
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,5.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,1.0,1.0,0.0,0.0,21.0,0.0,4.0,2.0,9.0
10,0.0,18.0,17.0,3.0,2.0,0.0,0.0,11.0,2.0,10.0,2.0,5.0,0.0,73.0,0.0,52.0,13.0,6.0,0.0,1.0
11,0.0,11.0,12.0,21.0,1.0,0.0,0.0,0.0,0.0,3.0,6.0,2.0,0.0,8.0,0.0,26.0,6.0,1.0,1.0,1.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,2.0,6.0,6.0,10.0,0.0,0.0,0.0,0.0,0.0,6.0,3.0,1.0,0.0,0.0,0.0,3.0,8.0,11.0,2.0,0.0
14,11.0,30.0,2.0,4.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,3.0,0.0,1.0,0.0,4.0,15.0,2.0,5.0,1.0
15,1.0,8.0,8.0,8.0,0.0,3.0,0.0,0.0,2.0,39.0,3.0,0.0,0.0,0.0,0.0,0.0,7.0,1.0,4.0,4.0
16,7.0,9.0,3.0,14.0,0.0,0.0,1.0,0.0,0.0,15.0,3.0,0.0,0.0,0.0,0.0,0.0,8.0,15.0,1.0,0.0
17,3.0,10.0,6.0,5.0,0.0,0.0,0.0,0.0,0.0,2.0,9.0,33.0,5.0,0.0,29.0,0.0,4.0,8.0,1.0,0.0


In [81]:
# 按天统计单量
day_order_df = order_df.groupby('book_day').size().reset_index(name='order_count')
day_order_df = day_order_df.rename(columns={'book_day': '日期', 'order_count': '单量'})
day_order_df

Unnamed: 0,日期,单量
0,20230801,38
1,20230802,105
2,20230803,60
3,20230804,69
4,20230807,4
5,20230808,3
6,20230809,1
7,20230810,12
8,20230811,4
9,20230814,83


In [159]:
# 按门店统计单量
store_day_order_df = order_df.groupby(['book_day', 'store_id']).size().reset_index(name='order_count')
store_day_order_df = store_day_order_df.rename(columns={'book_day': '日期', 'store_id': '门店', 'order_count': '单量'})
store_day_order_df

Unnamed: 0,日期,门店,单量
0,20230801,64744c984b18bad22c97c22c,4
1,20230801,649981f552610a12b0b84eeb,5
2,20230801,64c89ed2906b82e00415219f,26
3,20230801,64c8b7fb126e062808ff2703,3
4,20230802,64744c984b18bad22c97c22c,11
...,...,...,...
73,20230830,64c89ed2906b82e00415219f,15
74,20230831,64744c984b18bad22c97c22c,1
75,20230831,6482bb5277bb7a754600bc8e,3
76,20230831,649981f552610a12b0b84eeb,2


In [160]:
store_weather_day_order_df = store_day_order_df.merge(weather_df, on='日期', how='inner')
store_weather_day_order_df

Unnamed: 0,日期,门店,单量,星期,最高温度,最低温度,天气,风向,风力,雨量等级,温度
0,20230801,64744c984b18bad22c97c22c,4,2,28.0,25.0,中雨~小雨,东南风,2,4,26.5
1,20230801,649981f552610a12b0b84eeb,5,2,28.0,25.0,中雨~小雨,东南风,2,4,26.5
2,20230801,64c89ed2906b82e00415219f,26,2,28.0,25.0,中雨~小雨,东南风,2,4,26.5
3,20230801,64c8b7fb126e062808ff2703,3,2,28.0,25.0,中雨~小雨,东南风,2,4,26.5
4,20230802,64744c984b18bad22c97c22c,11,3,31.0,26.0,多云~晴,西南风,3,0,28.5
...,...,...,...,...,...,...,...,...,...,...,...
73,20230830,64c89ed2906b82e00415219f,15,3,32.0,20.0,晴,西南风,3,0,26.0
74,20230831,64744c984b18bad22c97c22c,1,4,25.0,24.0,晴,东南风,4,0,24.5
75,20230831,6482bb5277bb7a754600bc8e,3,4,25.0,24.0,晴,东南风,4,0,24.5
76,20230831,649981f552610a12b0b84eeb,2,4,25.0,24.0,晴,东南风,4,0,24.5


In [113]:
# 按日期合并天气数据
df = day_order_df.merge(weather_df, on='日期', how='inner')
df

Unnamed: 0,日期,单量,星期,最高温度,最低温度,天气,风向,风力,雨量等级,温度
0,20230801,38,2,28.0,25.0,中雨~小雨,东南风,2,4,26.5
1,20230802,105,3,31.0,26.0,多云~晴,西南风,3,0,28.5
2,20230803,60,4,32.0,26.0,多云~小雨,西南风,2,1,29.0
3,20230804,69,5,34.0,26.0,多云,东北风,1,0,30.0
4,20230807,4,1,27.0,23.0,阴~小雨,东南风,1,1,25.0
5,20230808,3,2,31.0,24.0,多云,南风,2,0,27.5
6,20230809,1,3,33.0,24.0,多云~小雨,东南风,1,1,28.5
7,20230810,12,4,32.0,23.0,多云~阴,东南风,2,1,27.5
8,20230811,4,5,29.0,23.0,阴~中雨,东南风,1,3,26.0
9,20230814,83,1,35.0,24.0,晴,西南风,1,0,29.5


In [132]:
# 待预测数据
features = ['温度', '雨量等级', '风力', '星期']  # 特征值
target = '单量'  # 目标变量
predict_day = 20230901
temperature = 26
rain_level = 0
wind_speed = 3
week_day = 5
predict_data = {
    '温度': [temperature], 
    '雨量等级': [rain_level], 
    '风力': [wind_speed],
    '星期': [week_day],
}

In [161]:
def decision_tree_predict(df):
    """
    使用决策树分析历史天气对单量的影响
    要素：温度、降雨量、风速
    """
    # 划分训练集和测试集
    x_train, x_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2,
                                                        random_state=42)

    # 创建决策树回归模型
    model = DecisionTreeRegressor()
    # 拟合模型
    model.fit(x_train, y_train)
    # 在测试集上进行预测
    y_pred = model.predict(x_test)

    # 评估模型性能
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print("均方误差 (MSE):", mse)
    print("回归评分函数 (R2):", r2)

    # 特征重要性分析
    feature_importance = pd.DataFrame({'Feature': features, 'Importance': model.feature_importances_})
    print('特征重要性分析')
    print(feature_importance)

    # 使用模型进行未来订单量预测
    predict_df = pd.DataFrame(predict_data)
    predict_result = model.predict(predict_df)
    print("明日单量预测:", predict_result)

print('=' * 50, '总体预测', '=' * 50)
decision_tree_predict(df)

print('=' * 50, '按门店预测', '=' * 50)
predict_data = {
    '温度': [temperature], 
    '雨量等级': [rain_level], 
    '风力': [wind_speed],
    '星期': [week_day],
    '门店': []
}
decision_tree_predict(store_weather_day_order_df)

均方误差 (MSE): 5004.75
回归评分函数 (R2): -4.391597091300835
特征重要性分析
  Feature  Importance
0      温度    0.881255
1    雨量等级    0.001180
2      风力    0.040389
3      星期    0.077176
明日单量预测: [25.]
均方误差 (MSE): 618.3034027777778
回归评分函数 (R2): -0.33147435322267094
特征重要性分析
  Feature  Importance
0      温度    0.294531
1    雨量等级    0.654626
2      风力    0.022230
3      星期    0.028613
明日单量预测: [8.]


In [152]:
def random_forest_predict(df):
    """
    使用随机森林模型进行预测
    """
    # 划分训练集和测试集
    X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=42)

    # 创建随机森林回归模型
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)  # 设置100个决策树

    # 在训练集上训练模型
    rf_model.fit(X_train, y_train)

    # 在测试集上进行预测
    y_predict = rf_model.predict(X_test)
    print(y_predict)
    
    # 评估模型
    mse = mean_squared_error(y_test, y_predict)
    print(f"均方误差（MSE）: {mse}")

    # 特征重要性分析
    feature_importance = pd.DataFrame({'Feature': features, 'Importance': rf_model.feature_importances_})
    print(f'特征重要性分析: {feature_importance}')

    # 特征重要性可视化
    # plt.figure(figsize=(10, 6))
    # sns.barplot(x="Importance", y="Feature", data=feature_importance.sort_values(by="Importance", ascending=False))

    # 使用模型进行未来订单量预测
    predict_df = pd.DataFrame(predict_data)
    predict_result = rf_model.predict(predict_df)
    print("明日单量预测:", predict_result)

random_forest_predict(df)

[19.4  37.1   9.51 20.19]
均方误差（MSE）: 4275.45655
特征重要性分析:   Feature  Importance
0      温度    0.762931
1    雨量等级    0.046678
2      风力    0.068592
3      星期    0.121799
明日单量预测: [21.66]
