In [15]:
# coding: utf-8
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt


data = pd.read_excel('data5/consumption_data.xls', index_col='Id')
data_zs = 1.0 * (data - data.mean()) / data.std()
k = 3
# 离散点阀值
threshold = 2
interation = 500

model = KMeans(n_clusters=k, n_jobs=4, max_iter=interation)
model.fit(data_zs)
r = pd.concat([data_zs, pd.Series(model.labels_, index=data.index)], axis=1)
r.columns = list(data.columns) + [u'聚类类别']

norm = []
for i in range(k):
    norm_tmp = r[['R', 'F', 'M']][r[u'聚类类别'] == 1] - model.cluster_centers_[i]
    # 求绝对距离
    norm_tmp = norm_tmp.apply(np.linalg.norm, axis=1)
    norm.append(norm_tmp/norm_tmp.median())
norm = pd.concat(norm)

plt.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示负号
norm[norm <= threshold].plot(style='go')  # 正常点
discrete_points = norm[norm > threshold]  # 离散点
discrete_points.plot(style='ro')

for i in range(len(discrete_points)):
    id = discrete_points.index[i]
    n = discrete_points.iloc[i]
    plt.annotate('(%s, %.2f)' % (id, n), xy=(id, n), xytext=(id, n))
plt.xlabel(u'编号')
plt.ylabel(u'相对距离')
plt.show()

In [33]:
# coding: utf-8
"""拉格朗日插值法"""
import pandas as pd
from scipy.interpolate import lagrange


data = pd.read_excel('data6/missing_data.xls', header=None)


def insert_values(s, n, k=5):
    y = s[list(range(n-k, n)) + list(range(n+1, n+k))]
    y = y[y.notnull()]
    return lagrange(y.index, list(y))(n)


for i in data.columns:
    for j in range(len(data)):
        if (data[i].isnull())[j]:
            data[i][j] = insert_values(data[i], j)
print(data)

             0           1           2
0   235.833300  324.034300  478.323100
1   236.270800  325.637900  515.456400
2   238.052100  328.089700  517.090900
3   235.906300  234.760314  514.890000
4   236.760400  268.832400  498.853908
5   237.672503  404.048000  486.091200
6   237.416700  391.265200  516.233000
7   238.656300  380.824100  507.188091
8   237.604200  388.023000  435.350800
9   238.031300  206.434900  487.675000
10  235.072900  206.544787  610.560305
11  235.531300  400.078700  660.234700
12  235.743740  411.206900  621.234600
13  234.468800  395.234300  611.340800
14  235.500000  344.822100  643.086300
15  235.635400  385.643200  642.348200
16  234.552100  401.623400  618.197198
17  236.000000  409.648900  602.934700
18  235.239600  416.879500  589.345700
19  235.489600  420.748600  556.345200
20  236.968800  408.963200  538.347000
