In [1]:
# 读取红酒和白酒的原始数据
# 分别对红酒和白酒两个数据集的数据进行抽样显示，观察样本的数据格式和样本大小


import pandas as pd 
import matplotlib.pyplot as plt 

# Load the red-wine and white-wine quality tables; both files are ';'-delimited.
red = pd.read_csv("winequality-red.csv", sep=";", index_col=None)
white = pd.read_csv("winequality-white.csv", sep=";")

## Show basic information about each dataset: its first rows and its size.
# Every print() below receives exactly one argument, so the cell emits the same
# text under Python 2 (where the parentheses are a no-op) and under Python 3
# (where the bare `print x` statement form is a SyntaxError).
print("红酒数据集部分数据显示：")
print(red.head())
print("\n####################################################################\n")
print("红酒数据集的规格显示:")
# DataFrame.shape is (n_rows, n_columns); printed as "rows columns". Unlike
# indexing the first column, this also works for a frame with no columns.
print("%d %d" % red.shape)
print("\n\n####################################################################")
print("####################################################################\n\n")
print("白酒数据集部分数据显示：")
print(white.head())
print("\n####################################################################\n")
print("白酒数据集的规格显示:")
print("%d %d" % white.shape)

红酒数据集部分数据显示：
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      

In [2]:
# Consolidate the datasets.
# Red and white wines arrive as two separate tables; merge them into a single
# frame so the later classification step can work on one dataset.

# Tag each row with its origin before merging: 0 = red, 1 = white.
red["type"] = 0
white["type"] = 1
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. ignore_index=True renumbers the merged rows from 0.
wines = pd.concat([red, white], ignore_index=True)

## Export the merged table so a later session can load it directly.
output_file = "./wines.csv"
wines.to_csv(output_file, index=False)

wines

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.700,0.00,1.90,0.076,11.0,34.0,0.99780,3.51,0.56,9.400000,5,0
1,7.8,0.880,0.00,2.60,0.098,25.0,67.0,0.99680,3.20,0.68,9.800000,5,0
2,7.8,0.760,0.04,2.30,0.092,15.0,54.0,0.99700,3.26,0.65,9.800000,5,0
3,11.2,0.280,0.56,1.90,0.075,17.0,60.0,0.99800,3.16,0.58,9.800000,6,0
4,7.4,0.700,0.00,1.90,0.076,11.0,34.0,0.99780,3.51,0.56,9.400000,5,0
5,7.4,0.660,0.00,1.80,0.075,13.0,40.0,0.99780,3.51,0.56,9.400000,5,0
6,7.9,0.600,0.06,1.60,0.069,15.0,59.0,0.99640,3.30,0.46,9.400000,5,0
7,7.3,0.650,0.00,1.20,0.065,15.0,21.0,0.99460,3.39,0.47,10.000000,7,0
8,7.8,0.580,0.02,2.00,0.073,9.0,18.0,0.99680,3.36,0.57,9.500000,7,0
9,7.5,0.500,0.36,6.10,0.071,17.0,102.0,0.99780,3.35,0.80,10.500000,5,0


In [3]:
# 对数据集进行切分，得到训练集和测试集

import numpy as np 

# X: the feature matrix, taken as the first 11 columns of `wines`
# (column positions 0 through 10).
X = wines.iloc[:, :11]
# y: the `type` column of `wines`, flattened into a 1-D NumPy array.
y = np.ravel(wines["type"])

X

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.90,0.076,11.0,34.0,0.99780,3.51,0.56,9.400000
1,7.8,0.880,0.00,2.60,0.098,25.0,67.0,0.99680,3.20,0.68,9.800000
2,7.8,0.760,0.04,2.30,0.092,15.0,54.0,0.99700,3.26,0.65,9.800000
3,11.2,0.280,0.56,1.90,0.075,17.0,60.0,0.99800,3.16,0.58,9.800000
4,7.4,0.700,0.00,1.90,0.076,11.0,34.0,0.99780,3.51,0.56,9.400000
5,7.4,0.660,0.00,1.80,0.075,13.0,40.0,0.99780,3.51,0.56,9.400000
6,7.9,0.600,0.06,1.60,0.069,15.0,59.0,0.99640,3.30,0.46,9.400000
7,7.3,0.650,0.00,1.20,0.065,15.0,21.0,0.99460,3.39,0.47,10.000000
8,7.8,0.580,0.02,2.00,0.073,9.0,18.0,0.99680,3.36,0.57,9.500000
9,7.5,0.500,0.36,6.10,0.071,17.0,102.0,0.99780,3.35,0.80,10.500000


In [4]:
# 对数据进行切分
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as the test split; random_state=0 makes the
# shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [5]:
# 对数据进行标准化处理
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply that one fitted
# transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [7]:
from sklearn.linear_model import LinearRegression

# Create a linear regression model and fit it on the training split.
# The fit call is left as the cell's final expression so the notebook still
# displays the fitted estimator.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [8]:
# Run the fitted linear regression model on the test split.
y_pred = model.predict(X=X_test)


In [9]:
from sklearn.metrics import mean_squared_error

# Report R^2 (model.score) and mean squared error on the training split and
# on the test split.
# Each print() receives a single pre-formatted string, so the output is
# identical under Python 2 and Python 3; the original two-argument `print a, b`
# statement form is a SyntaxError on Python 3.
print("训练集上的效果评估：")
y_pred_train = model.predict(X_train)
print("R^2系数： %s" % model.score(X_train, y_train))
print("均方误差： %s" % mean_squared_error(y_train, y_pred_train))

print("\n###########################################################\n")

print("测试集上的效果评估：")
y_pred_test = model.predict(X_test)
print("R^2系数： %s" % model.score(X_test, y_test))
print("均方误差： %s" % mean_squared_error(y_test, y_pred_test))

训练集上的效果评估：
R^2系数： 0.8684739470951951
均方误差： 0.024431765423959532

###########################################################

测试集上的效果评估：
R^2系数： 0.8383246580007973
均方误差： 0.029916464808970682
