In [12]:
import pywt
import pandas as pd
from sklearn.svm import SVR
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing

In [13]:
# 读取 excel 文件，默认返回第一张表 
# 返回类型：<class 'pandas.core.frame.DataFrame'>
def get_data(file):
    sheet = pd.read_excel(io=file)
    return sheet

# 异常值处理：——拉依达法则：数据偏差大于三倍标准差剔除
def remove_outlier(x, y):
    mean = np.mean(y)  # 平均值
    std = np.std(y)  # 标准差
    
    lower_limit = mean-3*std  # 最小值
    upper_limit = mean+3*std  # 最大值
    
    for i in range(y.shape[0]):
        if y[i]<lower_limit or y[i]>upper_limit:
            x = x.drop(i)
            y = y.drop(i)
    return x, y

# 小波去噪
# lv为分解层数；data为最后保存的dataframe便于作图；
# index_list为待处理序列；wavefunc为选取的小波函数；
# m,n则选择了进行阈值处理的小波系数层数
def wt(index_list,wavefunc,lv,m,n):   
   
    # 按 level 层分解，使用pywt包进行计算， cAn是尺度系数 cDn为小波系数
    coeff = pywt.wavedec(index_list,wavefunc,mode='sym',level=lv)   

    sgn = lambda x: 1 if x > 0 else -1 if x < 0 else 0 # sgn函数

    # 去噪过程
    for i in range(m,n+1):   # 选取小波系数层数为 m~n层，尺度系数不需要处理
        cD = coeff[i]
        for j in range(len(cD)):
            Tr = np.sqrt(2*np.log(len(cD)))  # 计算阈值
            if cD[j] >= Tr:
                coeff[i][j] = sgn(cD[j]) - Tr  # 向零收缩
            else:
                coeff[i][j] = 0   # 低于阈值置零

    # 重构
    denoised_index = pywt.waverec(coeff,wavefunc)
    return denoised_index[1:]

# # 输出模型预测率 并写入日志文件
# def write_log(svr_model, x_test, y_test, excel_file):

#     pre_rate = svr_model.score(x_test, y_test)
#     print('决定系数：%.4f' % pre_rate)

#     y_pred = svr_rbf.predict(x_test)
#     mse = mean_squared_error(y_test, y_pred)
#     print('均方误差：%.4f' % mse) 

#     log_file = '/home/solejay/program/undergrauduate_project/log.txt'
#     with open(log_file, 'a') as f:
#         s0 = '预测率：%.4f' % pre_rate + '\n'
#         s1 = '均方误差：%.4f' % mse + '\n'
#         s2 = '读取文件：' + excel_file.split('/')[-1] + '\n'
#         s3 = '模型参数：' + str(svr_model) + '\n'
#         s4 = '=============================================================\n'
#         s = s0 + s1 + s2 + s3 + s4
#         f.write(s)


# # 画出预测值和实际值的图像
# def plot_graph(svr_model, x_test, y_test):
#     sample = [i for i in range(1, len(y_test)+1)]
#     sample = np.reshape(sample, (len(sample), 1))
    
#     y_pred = svr_rbf.predict(x_test)

#     plt.plot(sample, y_test, color='black', label='y_test')
#     plt.plot(sample, y_pred, color='red', label='y_pred')

#     plt.xlabel('sample')
#     plt.ylabel('utilization')
#     plt.title('Support Vector Regression')
#     plt.legend()
#     plt.show()

In [14]:
excel_file = '/home/solejay/program/undergrauduate_project/excel/data.xlsx'
data = get_data(excel_file)

x = data.iloc[:, 0:5]
y = data.iloc[:, 5]

In [15]:
x_train = preprocessing.scale(x_train)
x_test = preprocessing.scale(x_test)

In [16]:
x_train

array([[ 1.68340298,  1.36241089,  1.1955401 ,  2.63392493, -1.79923514],
       [ 0.07104101, -0.22964021,  1.19106378,  2.47035563, -1.57167567],
       [ 1.19379927,  0.81539476,  1.22390153,  2.27797333, -1.36388341],
       ...,
       [ 1.38441569,  1.64079213, -0.14722751,  0.25843711, -0.54672646],
       [ 1.29678684,  1.42987375, -0.14862431,  0.20482043, -1.08527042],
       [ 1.1791301 ,  1.29570569,  0.18503699,  0.18665436, -0.74854689]])

In [31]:
# 核函数
svr_rbf = SVR(kernel='rbf', C=1, gamma='auto')
svr_rbf.fit(x_train, y_train)

SVR(C=1, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [7]:
# 划分训练集和测试集
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=33)


In [10]:
# 小波去噪
for i in range(5):
    x_train.iloc[:, i] = wt(x_train.iloc[:, i], 'db4', 4, 1, 4)
    x_test.iloc[:, i] = wt(x_test.iloc[:, i], 'db4', 4, 1, 4)
y_train = wt(y_train, 'db4', 4, 1, 4)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value
  "boundary effects.").format(level))


In [32]:
RR = svr_rbf.score(x_test, y_test)  # 决定系数
print('决定系数：%.4f' % RR)

y_pred = svr_rbf.predict(x_test)  # 预测值
mse = mean_squared_error(y_test, y_pred)  # 均方误差
print('均方误差：%.4f' % mse)

决定系数：0.0312
均方误差：0.2127


In [11]:
x

Unnamed: 0,风压kPa,顶压,风温,O2_FYL,TT 顶温
0,400.24,235.98,1229.85,2.55,167.25
1,406.05,236.11,1230.12,2.55,173.09
2,407.38,236.04,1229.66,2.54,171.97
3,405.49,236.19,1229.97,2.55,181.01
4,407.02,236.26,1229.83,2.54,184.52
5,408.58,236.01,1230.15,2.53,185.29
6,411.44,236.02,1229.66,2.53,187.57
7,409.36,236.09,1229.98,2.53,178.77
8,407.85,236.02,1229.75,2.53,180.50
9,407.90,236.20,1230.14,2.57,180.08


In [16]:
print(len(x))
print(len(y))

296
296
