In [1]:
import pandas as pd

# Red-wine quality data hosted by the UCI repository. The file separates its
# fields with ';' rather than ',', so the delimiter is passed explicitly.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
df = pd.read_csv(url, delimiter=';')

# Quick sanity check of the parsed columns: show the first five rows
print(df.head(n=5))


   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8        5 

In [2]:
# Number of wines per quality score, ordered by score rather than by frequency
quality_counts = (
    df['quality']
    .value_counts()
    .sort_index()
)
print("Wine count by quality:\n", quality_counts)

# Share of the whole dataset (in percent) that each quality score represents.
# The result is derived from quality_counts, so it keeps that Series' name.
quality_percentage = quality_counts.div(len(df)).mul(100)
print("\nPercentage of wines by quality:\n", quality_percentage.round(decimals=2))


Wine count by quality:
 quality
3     10
4     53
5    681
6    638
7    199
8     18
Name: count, dtype: int64

Percentage of wines by quality:
 quality
3     0.63
4     3.31
5    42.59
6    39.90
7    12.45
8     1.13
Name: count, dtype: float64


In [3]:
# Pairwise correlations between the numeric columns of the frame
correlation_matrix = df.corr(numeric_only=True)
print("\nCorrelation matrix:\n", correlation_matrix)

# Correlation of every feature with 'quality'. The target's own entry is
# dropped so that it cannot be picked as the strongest correlate.
quality_corr = correlation_matrix.loc[:, 'quality'].drop(labels='quality')
top_corr_value = quality_corr.max()
top_corr_feature = quality_corr.idxmax()
print(
    f"\nFeature most positively correlated with quality: "
    f"{top_corr_feature} ({top_corr_value:.2f})"
)



Correlation matrix:
                       fixed acidity  volatile acidity  citric acid  \
fixed acidity              1.000000         -0.256131     0.671703   
volatile acidity          -0.256131          1.000000    -0.552496   
citric acid                0.671703         -0.552496     1.000000   
residual sugar             0.114777          0.001918     0.143577   
chlorides                  0.093705          0.061298     0.203823   
free sulfur dioxide       -0.153794         -0.010504    -0.060978   
total sulfur dioxide      -0.113181          0.076470     0.035533   
density                    0.668047          0.022026     0.364947   
pH                        -0.682978          0.234937    -0.541904   
sulphates                  0.183006         -0.260987     0.312770   
alcohol                   -0.061668         -0.202288     0.109903   
quality                    0.124052         -0.390558     0.226373   

                      residual sugar  chlorides  free sulfur dioxid

In [4]:
# Boolean flag column: True for wines whose quality score is 7 or higher
df['is_high_quality'] = df['quality'].ge(7)

# How many wines fall on each side of the threshold
print("\nHigh quality wine count:\n", df['is_high_quality'].value_counts())



High quality wine count:
 is_high_quality
False    1382
True      217
Name: count, dtype: int64
