In [1]:
import numpy as np
import pandas as pd

# 读入整理后的数据

In [2]:
data = pd.read_csv('jinan.csv',encoding='ANSI')
data.head()

Unnamed: 0,档位,本期政策,需求满足率,订购率,订足面,订足率(济南),户均条数,下期政策,价位
0,30,8,100%,67%,100%,100%,8.0,8,583.0
1,29,8,84%,63%,73%,87%,6.7,8,583.0
2,28,8,95%,38%,55%,74%,5.8,8,583.0
3,27,7,91%,28%,55%,77%,5.0,7,583.0
4,26,7,95%,21%,47%,73%,4.5,7,583.0


In [3]:
data1 = data.copy()

删去带‘-’的无效数据，目前认为对这些档位不做调整

In [4]:
data_clean = data1[~data1['订足面'].isin(['-'])]
data_clean.head()

Unnamed: 0,档位,本期政策,需求满足率,订购率,订足面,订足率(济南),户均条数,下期政策,价位
0,30,8,100%,67%,100%,100%,8.0,8,583.0
1,29,8,84%,63%,73%,87%,6.7,8,583.0
2,28,8,95%,38%,55%,74%,5.8,8,583.0
3,27,7,91%,28%,55%,77%,5.0,7,583.0
4,26,7,95%,21%,47%,73%,4.5,7,583.0


In [5]:
data_clean['政策变动'] = data_clean['下期政策']-data_clean['本期政策']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [6]:
data_clean.head()

Unnamed: 0,档位,本期政策,需求满足率,订购率,订足面,订足率(济南),户均条数,下期政策,价位,政策变动
0,30,8,100%,67%,100%,100%,8.0,8,583.0,0
1,29,8,84%,63%,73%,87%,6.7,8,583.0,0
2,28,8,95%,38%,55%,74%,5.8,8,583.0,0
3,27,7,91%,28%,55%,77%,5.0,7,583.0,0
4,26,7,95%,21%,47%,73%,4.5,7,583.0,0


构建机器学习模型，自变量为'档位','需求满足率','订购率','订足面','订足率(济南)','户均条数','价位'，因变量为'政策变动'

In [7]:
df = data_clean[['档位','需求满足率','订购率','订足面','订足率(济南)','户均条数','价位','政策变动']]
df.head()

Unnamed: 0,档位,需求满足率,订购率,订足面,订足率(济南),户均条数,价位,政策变动
0,30,100%,67%,100%,100%,8.0,583.0,0
1,29,84%,63%,73%,87%,6.7,583.0,0
2,28,95%,38%,55%,74%,5.8,583.0,0
3,27,91%,28%,55%,77%,5.0,583.0,0
4,26,95%,21%,47%,73%,4.5,583.0,0


# 数据分析模块（决策树）

In [8]:
df.shape

(685, 8)

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer #特征转换器
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn import tree

In [10]:
X = df[['档位','需求满足率','订购率','订足面','订足率(济南)','户均条数','价位']]
y = df['政策变动']

In [11]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=33)  # 将数据进行分割

In [12]:
vec = DictVectorizer(sparse=False)
X_train = vec.fit_transform(X_train.to_dict(orient='record'))   #对训练数据的特征进行提取
X_test = vec.transform(X_test.to_dict(orient='record'))         #对测试数据的特征进行提取

In [13]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train,y_train)
y_predict = dtc.predict(X_test)

In [14]:
print('Accracy:',dtc.score(X_test,y_test))
print(classification_report(y_predict,y_test))

Accracy: 0.8197674418604651
              precision    recall  f1-score   support

          -3       1.00      1.00      1.00         3
          -2       1.00      1.00      1.00         1
          -1       0.36      0.44      0.40         9
           0       0.92      0.86      0.89       141
           1       0.47      0.67      0.55        12
           2       0.44      0.67      0.53         6

    accuracy                           0.82       172
   macro avg       0.70      0.77      0.73       172
weighted avg       0.85      0.82      0.83       172



In [15]:
y_predict

array([ 0,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  1,  0,
        0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0,  1,
        0,  0,  0,  0,  0, -1,  0,  0,  0,  0, -2,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  2,  0,  2,  0,  0,  0,  2,  0,  1,  1,  0,  1,  0,  0,
        0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0, -3,  0,
        0,  0,  0,  0, -1,  0,  0,  0,  0,  1,  0,  0,  0,  1,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -3,  0,  0,  0,  0,  0,
       -1,  0,  0, -3,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0, -1,  0,
        0,  0,  0,  0, -1,  0,  0, -1,  0,  0,  0,  1,  0,  0,  1,  0,  0,
        0,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0], dtype=int64)

In [17]:
y_test.values

array([ 0,  2, -1,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,
        0,  0,  0,  0,  0, -1,  0,  1,  0,  0,  0,  2,  0,  0,  2,  0,  2,
        0,  0,  2,  0,  0, -1,  0,  0,  0,  0, -2,  0,  0,  0,  1,  0,  0,
        0, -1, -1,  0,  0,  2,  0,  0,  0,  0,  0,  1,  0,  0,  1,  0,  0,
        0,  0,  0,  1,  0,  0,  0,  0,  1,  2,  0,  2,  0,  0,  0, -3,  0,
        0,  0, -1,  2,  0,  0,  0,  0,  0,  1,  1,  0,  0,  1,  0,  0,  0,
        1,  1, -1,  0,  0,  0,  0,  0,  0,  0,  0, -3,  1,  0,  0,  0,  0,
        0,  0,  0, -3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,
        0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  1,  0,  0,  1,  0,  0,
        0,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  1,  0,
        0,  0], dtype=int64)

In [19]:
with open("jueceshu.dot", 'w') as f:
    f = tree.export_graphviz(dtc, out_file = f)

In [28]:
import graphviz 
dot_data = tree.export_graphviz(dtc, out_file=None) 
graph = graphviz.Source(dot_data) 
graph.render("ok")

ExecutableNotFound: failed to execute ['dot', '-Tpdf', '-O', 'ok'], make sure the Graphviz executables are on your systems' PATH