In [None]:
import pandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy import *
from sklearn.model_selection import train_test_split

def loadDataSet(filename):
    """Load a two-column, tab-separated data file.

    Each non-blank line is split on tab characters; the first field is
    parsed as the independent variable and the second as the target value.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(dataMat, labelMat)`` of ``numpy.matrix`` column vectors,
        one row per non-blank line of the file.

    Raises:
        ValueError: If a field cannot be parsed as a float.
        IndexError: If a non-blank line has fewer than two fields.
    """
    dataMat = []   # independent variable, one value per sample
    labelMat = []  # target value, one per sample
    # The context manager guarantees the handle is closed even when a line
    # fails to parse.
    with open(filename) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Blank (e.g. trailing) lines carry no sample; skip them
                # instead of failing on float('').
                continue
            lineArr = stripped.split('\t')
            dataMat.append(float(lineArr[0]))
            labelMat.append(float(lineArr[1]))
    # np.asmatrix replaces the `mat` alias that NumPy 2.0 removed.
    return np.asmatrix(dataMat).T, np.asmatrix(labelMat).T


# 返回数据集中每一个数据向量与A的核函数值组成的矩阵
def kernelTrans(X, A, kTup):
    """Evaluate the kernel between every row of ``X`` and the vector ``A``.

    Args:
        X: Feature data set, one sample per row (converted to a matrix).
        A: A single sample as a row vector with the same number of
            columns as ``X``.
        kTup: Kernel description. ``kTup[0]`` is the kernel name, either
            ``'lin'`` or ``'rbf'``; for ``'rbf'`` ``kTup[1]`` is the width
            used as ``exp(-||x - A||^2 / kTup[1]**2)``.

    Returns:
        An ``(m, 1)`` matrix whose i-th entry is the kernel value between
        row ``i`` of ``X`` and ``A``.

    Raises:
        NameError: If ``kTup[0]`` is not a recognised kernel name.
    """
    # np.asmatrix replaces the `mat` alias that NumPy 2.0 removed.
    X = np.asmatrix(X)
    kernel_name = kTup[0]
    if kernel_name == 'lin':
        return X * A.T
    if kernel_name == 'rbf':
        # Broadcasting subtracts A from every row at once, replacing the
        # per-row Python loop; the float cast keeps the result float64 as
        # the zero-initialised accumulator did before.
        delta = (X - A).astype(float)
        sq_dist = np.multiply(delta, delta).sum(axis=1)
        return np.exp(sq_dist / (-1 * kTup[1] ** 2))
    raise NameError('Houston We Have a Problem ,That Kernel is not recognized')


class optStruct:
    """Container for the training data and state of one LSSVM fit.

    Attributes set by ``__init__``:
        X: The feature data set passed in, one sample per row.
        labelMat: The target values passed in.
        C: The regularisation parameter passed in.
        m: Number of samples (rows of ``X``).
        alphas: ``(m, 1)`` matrix of multipliers, initialised to zero.
        b: Bias term, initialised to 0.
        K: ``(m, m)`` kernel matrix; column ``i`` holds the kernel values
            between every sample and sample ``i``.
    """

    def __init__(self, dataMatIn, classLabels, C, kTup):
        self.X = dataMatIn
        self.labelMat = classLabels
        self.C = C
        self.m = np.shape(dataMatIn)[0]
        # np.asmatrix replaces the `mat` alias that NumPy 2.0 removed.
        self.alphas = np.asmatrix(np.zeros((self.m, 1)))
        self.b = 0
        self.K = np.asmatrix(np.zeros((self.m, self.m)))
        # Fill the kernel matrix one column at a time.
        for i in range(self.m):
            self.K[:, i] = kernelTrans(self.X, self.X[i, :], kTup)


def leastSquares(dataMatIn, classLabels, C, kTup):
    """Fit an LSSVM by solving its linear system for the bias and alphas.

    The system assembled and solved here is::

        [ 0   1^T         ] [ b     ]   [ 0 ]
        [ 1   K + I / C   ] [ alpha ] = [ y ]

    where ``K`` is the kernel matrix built by ``optStruct``.

    Args:
        dataMatIn: Feature data set, one sample per row.
        classLabels: Target values as an ``(m, 1)`` column.
        C: Regularisation parameter; ``I / C`` is added to the kernel
            matrix.
        kTup: Kernel name and parameter, forwarded to ``optStruct``.

    Returns:
        A tuple ``(alphas, b, e)``: the ``(m, 1)`` matrix of multipliers,
        the scalar bias, and ``alphas / C``.
    """
    # 1. Assemble the blocks of the linear system.
    oS = optStruct(dataMatIn, classLabels, C, kTup)
    unit = np.asmatrix(np.ones((oS.m, 1)))  # column vector of ones
    I = np.eye(oS.m)
    zero = np.asmatrix(np.zeros((1, 1)))
    upmat = np.hstack((zero, unit.T))
    downmat = np.hstack((unit, oS.K + I / float(C)))

    # 2. Solve the system. linalg.solve avoids forming the explicit inverse,
    #    which is both slower and numerically less accurate.
    completemat = np.vstack((upmat, downmat))  # left-hand side matrix
    rightmat = np.vstack((zero, oS.labelMat))  # right-hand side vector
    b_alpha = np.asmatrix(np.linalg.solve(completemat, rightmat))

    # 3. Split the solution into the bias and the multipliers.
    oS.b = b_alpha[0, 0]
    oS.alphas = b_alpha[1:, :]
    e = oS.alphas / C
    return oS.alphas, oS.b, e


def predict(alphas, b, dataMat, trainMat=None, kTup=None):
    """Evaluate a fitted LSSVM on every row of ``dataMat``.

    The prediction for a sample ``x`` is ``sum_i alphas[i] * k(t_i, x) + b``
    where ``t_i`` are the rows of ``trainMat``.

    Args:
        alphas: ``(m_train, 1)`` matrix of multipliers from the fit.
        b: Bias term from the fit.
        dataMat: Samples to predict, one per row.
        trainMat: Training samples the model was fitted on. When omitted,
            ``dataMat`` itself is used, which is only valid when predicting
            on the training set (kept so existing three-argument calls
            behave as before).
        kTup: Kernel name and parameter used for the fit. When omitted, a
            module-level ``kTup`` is looked up, as the earlier version of
            this function did implicitly.

    Returns:
        An ``(m, 1)`` matrix with one prediction per row of ``dataMat``.

    Raises:
        NameError: If ``kTup`` is omitted and no module-level ``kTup``
            exists.
    """
    if kTup is None:
        # Legacy fallback for callers that relied on the global kernel
        # configuration.
        try:
            kTup = globals()['kTup']
        except KeyError:
            raise NameError(
                "name 'kTup' is not defined; pass kTup explicitly") from None
    # The kernel must be evaluated against the samples that produced alphas,
    # otherwise the kernel vector and alphas differ in length.
    support = dataMat if trainMat is None else trainMat
    m, n = np.shape(dataMat)
    predict_result = np.asmatrix(np.zeros((m, 1)))
    for i in range(m):
        Kx = kernelTrans(support, dataMat[i, :], kTup)
        # Extract the scalar from the 1x1 product before storing it.
        predict_result[i, 0] = (Kx.T * alphas)[0, 0] + b
    return predict_result


def predict_average_error(predict_result, label):
    """Compute the mean absolute error between predictions and targets.

    Args:
        predict_result: ``(m, 1)`` matrix of predicted values.
        label: ``(m, 1)`` matrix of actual values.

    Returns:
        The average of ``|predict_result[i, 0] - label[i, 0]|`` over all
        rows.
    """
    m, n = np.shape(predict_result)
    error = 0.0
    for i in range(m):
        error += abs(predict_result[i, 0] - label[i, 0])
    average_error = error / m
    return average_error


if __name__ == '__main__':
    ## 1. Load data
    print('--------------------Load Data------------------------')
    dataMat, labelMat = loadDataSet('sine1.txt')
    print(type(dataMat),len(dataMat),type(labelMat),len(labelMat))
    # The samples loaded above are replaced by columns A and C of test.csv.
    test=pd.read_csv('test.csv',encoding = "gbk")
    test.columns = ['A','B','C']
    dataMat=test['A'].to_numpy()
    labelMat=test['C'].to_numpy()
    # np.asmatrix replaces the `mat` alias that NumPy 2.0 removed.
    dataMat=np.asmatrix(dataMat.reshape(-1,1))
    labelMat=np.asmatrix(labelMat.reshape(-1,1))
    X_train, X_test, y_train, y_test = train_test_split(   dataMat, labelMat, test_size=0.33, random_state=42)
    print(type(dataMat),len(dataMat),type(labelMat),len(labelMat))
    ## 2. Parameter setup
    print('--------------------Parameter Setup------------------')
    C = 0.6
    k1 = 0.3
    kernel = 'rbf'  # RBF kernel is selected here
    kTup = (kernel, k1)  # k1 = 0.3
    ## 3. Fit the LSSVM model
    print('-------------------Save LSSVM Model-----------------')
    alphas, b, e = leastSquares(X_train, y_train, C, kTup)
    ## 4. Predict on the held-out samples, using the training samples as
    ##    the kernel reference set
    print('------------------Predict Result------------------ -')
    predict_result =  predict(alphas, b, X_test, X_train, kTup)
    ## 5. Average error
    print('-------------------Average Error------------------ -')
    average_error = predict_average_error(predict_result, y_test)

    print('-------------------Average Error------------------ -',average_error)
--------------------Load Data------------------------
<class 'numpy.matrix'> 199 <class 'numpy.matrix'> 199
<class 'numpy.matrix'> 4864 <class 'numpy.matrix'> 4864
--------------------Parameter Setup------------------
-------------------Save LSSVM Model-----------------


In [85]:
from sklearn.model_selection import train_test_split
# Re-split the data 50/50 into train and test parts with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    dataMat,
    labelMat,
    test_size=0.5,
    random_state=42,
)

In [86]:
X_train

matrix([[1313],
        [ 678],
        [3464],
        ...,
        [3092],
        [3772],
        [ 860]], dtype=int64)