In [2]:
import pandas as pd


# Load the white-wine quality table; this CSV uses ';' as its field separator.
df = pd.read_csv(
    './dataset/winequality-white.csv',
    sep=';',
)
# Show the first three rows as a quick sanity check of the parsed columns.
df.head(n=3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6


### Data Cleaning

#### Missing Value Handling: Identify and deal with missing values in the data.

In [3]:
# Boolean frame flagging every missing cell (isna is the alias of isnull).
null_values = df.isna()
# Summing the flags down each column gives the per-column count of missing values.
null_values.sum(axis=0)

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [4]:
# Drop every row that contains at least one missing value, modifying df itself.
# The defaults (row-wise, any missing cell) are spelled out for the reader.
df.dropna(axis=0, how='any', inplace=True)

#### Duplicate Data Handling: Identify and deal with duplicate values in the data.

In [5]:
# Print the frame's shape before and after de-duplication so the number of
# removed rows is visible in the cell output.
print(f'去除重复元素前df的形状: {df.shape}')
# Rows identical across all columns are removed in place; the first occurrence
# of each duplicate group is the one kept (pandas' default, made explicit).
df.drop_duplicates(keep='first', inplace=True)
print(f'去除重复元素后df的形状: {df.shape}')

去除重复元素前df的形状: (4898, 12)
去除重复元素后df的形状: (3961, 12)


### Data Integration

#### Combine data with the same attributes from different sources. For this assignment, calculate the “total acidity,” which is the sum of “fixed acidity” and “volatile acidity,” and add it as a new column to the dataset.

In [6]:
# Derived feature: element-wise sum of the fixed and volatile acidity columns.
df['total acidity'] = df['fixed acidity'].add(df['volatile acidity'])
# Preview the frame with the new column appended.
df.head(n=3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,total acidity
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,7.27
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,6.6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6,8.38


### Data Transformation

#### Normalization: Normalize the “quality” data to the [0, 1] range.

In [7]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler


# Create the min-max scaler; (0, 1) is the library default output range,
# written out here because it is the range the exercise asks for.
scaler = MinMaxScaler(feature_range=(0, 1))
# Fit the scaler on the single 'quality' column and store the rescaled values
# in a new column, leaving the original 'quality' untouched.
df['quality_normalize'] = scaler.fit_transform(df[['quality']])
df.head(n=3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,total acidity,quality_normalize
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,7.27,0.5
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,6.6,0.5
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6,8.38,0.5


#### Discretization: Discretize the continuous attribute “fixed acidity” into three levels: “low,” “medium,” and “high.”

In [8]:
import pandas as pd

# df is the working DataFrame and 'fixed acidity' is one of its continuous columns.
# qcut picks the bin edges from the data's quantiles, so the column is split
# into three quantile-based levels labelled low / medium / high.
df['fixed acidity level'] = pd.qcut(
    df['fixed acidity'],
    q=3,
    labels=['low', 'medium', 'high'],
)
df.head(n=3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,total acidity,quality_normalize,fixed acidity level
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,7.27,0.5,medium
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,6.6,0.5,low
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6,8.38,0.5,high


### Data Reduction

#### Feature Selection: Use Analysis of Variance (ANOVA) to select the top three features that have the most significant impact on the quality rating of wine.

In [9]:
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Predictors: every column except the target itself, its rescaled copy, and the
# categorical 'fixed acidity level' labels.
X = df.drop(
    columns=['quality', 'fixed acidity level', 'quality_normalize'],
)
# Target: the quality score, which f_classif treats as the class label.
y = df['quality']

# Keep the k=3 features with the highest ANOVA F-statistic against the target.
selector = SelectKBest(score_func=f_classif, k=3)
X_new = selector.fit_transform(X, y)

# Positions (within X's columns) of the features the selector retained.
selected_indices = selector.get_support(indices=True)

# Map those positions back to column names; names come out in column order,
# not ranked by score.
selected_features = X.columns[selected_indices].tolist()

print('The top three features that have the most significant impact on the quality rating of wine are:')
for feature in selected_features:
    print(feature)

The top three features that have the most significant impact on the quality rating of wine are:
volatile acidity
density
alcohol
