In [1]:
# Exercise 8: Wine

In [2]:
## Q1: Load wine data with proper optimization

In [12]:
import numpy as np

# Parse the semicolon-delimited file into a float32 array (4 bytes per value).
# skip_header=1 drops the first line of the file before parsing
# (presumably the column-name row -- confirm against the CSV).
wine_data = np.genfromtxt(
    'winequality-red.csv',
    dtype=np.float32,
    delimiter=';',
    skip_header=1,
)

# Report the array dimensions and its raw memory footprint.
print("Data shape", wine_data.shape)
print("Data size in bytes:", wine_data.nbytes)

Data shape (1599, 12)
Data size in bytes: 76752


In [13]:
## Q2: Display the 2nd, 7th, and 12th rows (1-based positions 2, 7, 12 correspond to 0-based indices 1, 6, 11)

In [45]:
# Zero-based indices of the 2nd, 7th and 12th rows; indexing with a list
# pulls those rows with every column intact.
row_idx = [1, 6, 11]
selected_rows = wine_data[row_idx]

# Flag each picked row that holds at least one NaN, then keep only the
# unflagged rows.
rows_with_nan = np.any(np.isnan(selected_rows), axis=1)
clean_rows = selected_rows[~rows_with_nan]

print("Selected rows (2nd, 7th, 12th):")
print(clean_rows)

Selected rows (2nd, 7th, 12th):
[[7.800e+00 8.800e-01 0.000e+00 2.600e+00 9.800e-02 2.500e+01 6.700e+01
  9.968e-01 3.200e+00 6.800e-01 9.800e+00 5.000e+00]
 [7.900e+00 6.000e-01 6.000e-02 1.600e+00 6.900e-02 1.500e+01 5.900e+01
  9.964e-01 3.300e+00 4.600e-01 9.400e+00 5.000e+00]
 [7.500e+00 5.000e-01 3.600e-01 6.100e+00 7.100e-02 1.700e+01 1.020e+02
  9.978e-01 3.350e+00 8.000e-01 1.050e+01 5.000e+00]]


In [18]:
## Q3: Any wine with alcohol > 20%?

In [19]:
# Alcohol is read from zero-based column index 10 (i.e. the 11th column).
exceeds_20 = wine_data[:, 10] > 20

# True if at least one wine is above the 20% threshold.
alcohol_gt_20 = np.any(exceeds_20)
print("Alcohol > 20%:", alcohol_gt_20)

Alcohol > 20%: False


In [20]:
## Q4: Mean alcohol percentage (ignore NaN)

In [24]:
# Pull the alcohol column (zero-based index 10) out first, then average it
# with nanmean so NaN entries are left out of the mean.
alcohol_col = wine_data[:, 10]
mean_alcohol = np.nanmean(alcohol_col)

print(f"Mean alcohol: {mean_alcohol:.3f}")

Mean alcohol: 10.423


In [25]:
## Q5: Stats for pH (9th column, i.e. zero-based column index 8)

In [43]:
# pH is read from zero-based column index 8 (i.e. the 9th column).
pH_col = wine_data[:, 8]

# A single nanpercentile call returns all three quartiles at once
# (NaN entries are ignored), instead of one call per percentile.
pH_q25, pH_q50, pH_q75 = np.nanpercentile(pH_col, [25, 50, 75])

# Each NumPy scalar is converted to a built-in float before rounding so the
# printed dict shows plain numbers (e.g. 2.74) on every NumPy version;
# NumPy 2 would otherwise render the values as `np.float32(2.74)`.
stats = {
    'min': round(float(np.nanmin(pH_col)), 2),
    'max': round(float(np.nanmax(pH_col)), 2),
    '25%': round(float(pH_q25), 2),
    '50%': round(float(pH_q50), 2),  # median
    '75%': round(float(pH_q75), 2),
    'mean': round(float(np.nanmean(pH_col)), 2)
}
print("pH Statistics:", stats)

pH Statistics: {'min': 2.74, 'max': 4.01, '25%': 3.21, '50%': 3.31, '75%': 3.4, 'mean': 3.31}


In [28]:
## Q6: Average quality of the wines with the lowest 20% sulphate content

In [39]:
# Sulphates come from zero-based column index 9 (10th column) and quality
# from index 11 (12th column).
sulphates, quality = wine_data[:, 9], wine_data[:, 11]

# 20th-percentile sulphate level, computed with NaN entries ignored.
p20 = np.nanpercentile(sulphates, 20)

# Strict comparison: wines sitting exactly at the threshold are not selected.
mask = sulphates < p20

# NaN-ignoring average of the quality scores of the selected wines.
low_sulphur_quality_mean = np.nanmean(quality[mask])

print(f"Low sulphate avg quality: {low_sulphur_quality_mean:.1f}")

Low sulphate avg quality: 5.2


In [35]:
## Q7: Mean of all variables for best/worst quality

In [37]:
# Quality scores are taken from zero-based column index 11.
quality = wine_data[:, 11]

# Lowest and highest score present in the data, skipping NaN entries.
worst_quality = np.nanmin(quality)
best_quality = np.nanmax(quality)

# Boolean row selectors for the wines at each extreme.
worst_mask = quality == worst_quality
best_mask = quality == best_quality

# Column-wise NaN-ignoring means over each group; every column is averaged,
# the quality column included.
mean_best = np.nanmean(wine_data[best_mask, :], axis=0)
mean_worst = np.nanmean(wine_data[worst_mask, :], axis=0)

print("Mean for best quality wines:\n", mean_best)
print("Mean for worst quality wines:\n", mean_worst)

Mean for best quality wines:
 [ 8.566666    0.4233333   0.39111114  2.5777776   0.06844445 13.277778
 33.444443    0.99521226  3.2672222   0.76777774 12.094444    8.        ]
Mean for worst quality wines:
 [ 8.359999    0.8845      0.17099999  2.6350002   0.12249999 11.
 24.9         0.997464    3.398       0.57000005  9.955       3.        ]
