In [5]:
# coding: utf-8
"""数据规范化代码"""
import pandas as pd
import numpy as np


# Load the raw table; the sheet has no header row, so columns are numbered 0..n-1.
data = pd.read_excel('data4/normalization_data.xls', header=None)
print(data)

# Min-max normalization: rescale each column linearly so that its minimum maps
# to 0 and its maximum maps to 1.
print((data - data.min()) / (data.max() - data.min()))

# Zero-mean (z-score) normalization: centre each column on its mean and divide
# by its standard deviation as computed by DataFrame.std().
print((data - data.mean()) / data.std())

# Decimal-scaling normalization: divide each column by 10**j, where j is the
# smallest integer that brings every absolute value strictly below 1.
# floor(log10(max)) + 1 is used rather than ceil(log10(max)): the two agree
# except when the column maximum is an exact power of ten (e.g. 100), where
# ceil would leave that maximum at exactly 1.0 instead of 0.1.
print(data / 10 ** (np.floor(np.log10(data.abs().max())) + 1))

     0    1    2     3
0   78  521  602  2863
1  144 -600 -521  2245
2   95 -457  468 -1283
3   69  596  695  1054
4  190  527  691  2051
5  101  403  470  2487
6  146  413  435  2571
          0         1         2         3
0  0.074380  0.937291  0.923520  1.000000
1  0.619835  0.000000  0.000000  0.850941
2  0.214876  0.119565  0.813322  0.000000
3  0.000000  1.000000  1.000000  0.563676
4  1.000000  0.942308  0.996711  0.804149
5  0.264463  0.838629  0.814967  0.909310
6  0.636364  0.846990  0.786184  0.929571
          0         1         2         3
0 -0.905383  0.635863  0.464531  0.798149
1  0.604678 -1.587675 -2.193167  0.369390
2 -0.516428 -1.304030  0.147406 -2.078279
3 -1.111301  0.784628  0.684625 -0.456906
4  1.657146  0.647765  0.675159  0.234796
5 -0.379150  0.401807  0.152139  0.537286
6  0.650438  0.421642  0.069308  0.595564
       0      1      2       3
0  0.078  0.521  0.602  0.2863
1  0.144 -0.600 -0.521  0.2245
2  0.095 -0.457  0.468 -0.1283
3  0.069  0.596  0.6

In [49]:
# coding: utf-8
"""数据离散化代码"""
import pandas as pd
from sklearn.cluster import KMeans


# Load the sample table and keep only the single column that is discretized
# below (copied so later operations never touch the loaded frame).
data = pd.read_excel('data4/discretization_data.xls')
data = data[u'肝气郁结证型系数'].copy()
print(type(data))
k = 4  # number of bins / clusters shared by all three methods

# Equal-width discretization: split the value range into k intervals of equal
# width and label them 0..k-1.
d1 = pd.cut(data, k, labels=range(k))

# Equal-frequency discretization: bin edges are the 0, 1/k, ..., 1 quantiles.
# pd.qcut computes those edges itself and always includes the minimum value,
# which replaces slicing describe() by position and shrinking the first edge
# by a multiplicative factor (that trick fails when the minimum is <= 0).
d2 = pd.qcut(data, k, labels=range(k))

# Cluster-based discretization with one-dimensional k-means.
# The removed `n_jobs` argument is no longer passed (current scikit-learn
# rejects it); `random_state` makes the bins reproducible between runs and
# `n_init` is pinned because its default differs between scikit-learn releases.
kmodel = KMeans(n_clusters=k, n_init=10, random_state=0)
# KMeans expects a 2-D array, so present the series as a single feature column.
kmodel.fit(data.values.reshape(-1, 1))
# Cluster centres come back in arbitrary order; sort them ascending.
c = pd.DataFrame(kmodel.cluster_centers_).sort_values(by=[0])
# Midpoints of neighbouring centres are the interior bin edges (the first
# rolling value is NaN and is dropped).
w = c.rolling(2).mean().iloc[1:]
# Close the edge list with the observed minimum and maximum. include_lowest
# keeps the minimum itself inside the first bin; a hard-coded lower edge of 0
# would map values at or below 0 to NaN.
w = [data.min()] + list(w[0]) + [data.max()]
d3 = pd.cut(data, w, labels=range(k), include_lowest=True)


# Custom plotting helper that visualises a discretization result (one row of points per bin)
def cluster_plot(d, k, values=None):
    """Plot the samples of every bin on their own horizontal row.

    Parameters
    ----------
    d : pandas.Series
        Bin label of each sample; it is compared against 0..k-1 to select the
        members of each bin.
    k : int
        Number of bins; also fixes the y-axis range.
    values : pandas.Series, optional
        Sample values drawn on the x axis. When omitted, the module-level
        ``data`` series is used, which is what the function always did before
        this parameter existed.

    Returns
    -------
    module
        ``matplotlib.pyplot``, so callers can chain ``.show()``.
    """
    import matplotlib.pyplot as plt

    if values is None:
        # Backward-compatible fallback to the notebook-level series.
        values = data

    plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render Chinese labels
    plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly with that font
    plt.figure(figsize=(8, 3))
    for j in range(k):
        members = values[d == j]
        # All members of bin j share the same height j.
        plt.plot(members, [j] * len(members), 'o')
    plt.ylim(-0.5, k - 0.5)
    return plt


# Show one figure per discretization result, in the order they were computed.
for bin_labels in (d1, d2, d3):
    cluster_plot(bin_labels, k).show()

<class 'pandas.core.series.Series'>


In [53]:
# coding: utf-8
from scipy.io import loadmat
import pywt

# Read the MATLAB file; the printed result is a dict keyed by variable name.
mat = loadmat('data4/leleccum.mat')
print(mat)

# 'leleccum' is stored as a 2-D array with a single row; take that row as the
# 1-D signal.
signal = mat['leleccum'][0]
print(signal)

# Multilevel wavelet decomposition of the signal with the 'bior3.7' wavelet.
# The returned sequence has 6 entries for level=5, as the printed length shows.
result = pywt.wavedec(data=signal, wavelet='bior3.7', level=5)
print(len(result))


{'leleccum': array([[420.20278994, 423.52653517, 423.52271225, ..., 323.96580997,
        323.2400761 , 323.85476049]])}
[420.20278994 423.52653517 423.52271225 ... 323.96580997 323.2400761
 323.85476049]
6


In [58]:
# coding: utf-8
import pandas as pd
from sklearn.decomposition import PCA


# Load the feature table; the sheet has no header row.
data = pd.read_excel('data4/principal_component.xls', header=None)
out_file = 'result1.xls'

# Fit a PCA that keeps every component, to inspect the component vectors and
# the share of variance each one explains.
pca = PCA().fit(data)
print(pca.components_)
print(pca.explained_variance_ratio_)

# Refit keeping only the first three components (together about 97% of the
# variance according to the ratios printed above) and project the data.
pca1 = PCA(n_components=3).fit(data)
low_d = pca1.transform(data)
# NOTE(review): writing the legacy .xls format depends on an Excel writer
# engine that recent pandas releases may no longer provide - confirm against
# the target environment before relying on this export.
pd.DataFrame(low_d).to_excel(out_file)
# Map the reduced data back to the original feature space to see how much of
# the original values the three components reconstruct.
print(pca1.inverse_transform(low_d))

[[ 0.56788461  0.2280431   0.23281436  0.22427336  0.3358618   0.43679539
   0.03861081  0.46466998]
 [ 0.64801531  0.24732373 -0.17085432 -0.2089819  -0.36050922 -0.55908747
   0.00186891  0.05910423]
 [-0.45139763  0.23802089 -0.17685792 -0.11843804 -0.05173347 -0.20091919
  -0.00124421  0.80699041]
 [-0.19404741  0.9021939  -0.00730164 -0.01424541  0.03106289  0.12563004
   0.11152105 -0.3448924 ]
 [-0.06133747 -0.03383817  0.12652433  0.64325682 -0.3896425  -0.10681901
   0.63233277  0.04720838]
 [ 0.02579655 -0.06678747  0.12816343 -0.57023937 -0.52642373  0.52280144
   0.31167833  0.0754221 ]
 [-0.03800378  0.09520111  0.15593386  0.34300352 -0.56640021  0.18985251
  -0.69902952  0.04505823]
 [-0.10147399  0.03937889  0.91023327 -0.18760016  0.06193777 -0.34598258
  -0.02090066  0.02137393]]
[7.74011263e-01 1.56949443e-01 4.27594216e-02 2.40659228e-02
 1.50278048e-03 4.10990447e-04 2.07718405e-04 9.24594471e-05]
[[41.81945026 17.92938537  7.42743613  6.38423781  7.51911186  7.955