In [2]:
import numpy as np
import pandas as pd

# 读入整理后的数据

In [10]:
data = pd.read_excel('../data/jinan.xlsx')
data.replace('-', 0, inplace=True) # 缺失值以0填补
data.head()

Unnamed: 0,档位,本期政策,需求满足率,订购率,订足面,订足率(济南),户均条数,下期政策,价位
0,30,8,1.0,0.67,1.0,1.0,8.0,8,583.0
1,29,8,0.84,0.63,0.73,0.87,6.7,8,583.0
2,28,8,0.95,0.38,0.55,0.74,5.8,8,583.0
3,27,7,0.91,0.28,0.55,0.77,5.0,7,583.0
4,26,7,0.95,0.21,0.47,0.73,4.5,7,583.0


In [11]:
data['政策变动'] = data['下期政策'] - data['本期政策']
data.head()

Unnamed: 0,档位,本期政策,需求满足率,订购率,订足面,订足率(济南),户均条数,下期政策,价位,政策变动
0,30,8,1.0,0.67,1.0,1.0,8.0,8,583.0,0
1,29,8,0.84,0.63,0.73,0.87,6.7,8,583.0,0
2,28,8,0.95,0.38,0.55,0.74,5.8,8,583.0,0
3,27,7,0.91,0.28,0.55,0.77,5.0,7,583.0,0
4,26,7,0.95,0.21,0.47,0.73,4.5,7,583.0,0


Unnamed: 0,档位,本期政策,需求满足率,订购率,订足面,订足率(济南),户均条数,价位,政策变动
0,30,8,1.0,0.67,1.0,1.0,8.0,583.0,0
1,29,8,0.84,0.63,0.73,0.87,6.7,583.0,0
2,28,8,0.95,0.38,0.55,0.74,5.8,583.0,0
3,27,7,0.91,0.28,0.55,0.77,5.0,583.0,0
4,26,7,0.95,0.21,0.47,0.73,4.5,583.0,0


## 构建机器学习决策树模型

自变量：'档位','本期政策','需求满足率','订购率','订足面','订足率(济南)','户均条数','价位'

因变量：'政策变动'

每一条观测代表“本投放周期中，某个品规在某个档位上的市场反馈等信息”，根据自变量特征，模型生成下一个投放周期中该品规在该档位上的投放调整量

配置：CART(分类树), 分裂准则：gini, 完全生长不剪枝

In [15]:
df = data[['档位','本期政策','需求满足率','订购率','订足面','订足率(济南)','户均条数','价位','政策变动']]
df.head()

Unnamed: 0,档位,本期政策,需求满足率,订购率,订足面,订足率(济南),户均条数,价位,政策变动
0,30,8,1.0,0.67,1.0,1.0,8.0,583.0,0
1,29,8,0.84,0.63,0.73,0.87,6.7,583.0,0
2,28,8,0.95,0.38,0.55,0.74,5.8,583.0,0
3,27,7,0.91,0.28,0.55,0.77,5.0,583.0,0
4,26,7,0.95,0.21,0.47,0.73,4.5,583.0,0


In [16]:
df.shape

(900, 9)

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer #特征转换器
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn import tree

In [18]:
X = df[['档位','本期政策','需求满足率','订购率','订足面','订足率(济南)','户均条数','价位']]
y = df['政策变动']

In [34]:
# 留出法分割：75%训练集，25%测试集
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=33)  

In [54]:
dtc = DecisionTreeClassifier(random_state=0)
dtc

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [56]:
dtc.fit(X_train,y_train)
y_predict = dtc.predict(X_test)

print('Accuracy: %.4f\n'%dtc.score(X_test,y_test))
print(classification_report(y_predict,y_test)) # 注：support是y_predict的标签频率统计

Accuracy: 0.9422

              precision    recall  f1-score   support

          -3       1.00      1.00      1.00         2
          -2       1.00      1.00      1.00         3
          -1       0.67      0.89      0.76         9
           0       0.97      0.97      0.97       173
           1       0.82      0.74      0.78        19
           2       1.00      0.94      0.97        17
           4       1.00      1.00      1.00         2

   micro avg       0.94      0.94      0.94       225
   macro avg       0.92      0.93      0.92       225
weighted avg       0.95      0.94      0.94       225



In [58]:
# 特征重要程度 Gini importance
# (normalized) total reduction of the gini criterion brought by that feature
dtc.feature_importances_

array([0.19750356, 0.11066863, 0.07132867, 0.13075512, 0.01327509,
       0.0492703 , 0.05229241, 0.37490623])

## 预期下一步：

1.类比不平衡问题，虽然目前的测试集上表现尚可

2.剪枝，防止无法泛化

3.从树中归纳规则

In [59]:
y.value_counts() # 类别不平衡

 0    700
 1    108
 2     38
-1     29
-2     10
-3     10
 4      5
Name: 政策变动, dtype: int64