In [4]:
import numpy as np

# Fetch the UCI red-wine quality table. The file is ';'-separated and its
# first line is a header, which is skipped so every parsed row is numeric.
WINE_CSV_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
wine_data = np.genfromtxt(
    WINE_CSV_URL,
    delimiter=';',
    skip_header=1,
    dtype=np.float32,
)


# Overview of the loaded array: memory footprint, dimensions and a few rows.
total_bytes = wine_data.nbytes
dimensions = wine_data.shape
print("Data size:", total_bytes, "bytes")
print("Data shape:", dimensions)

# Rows are labelled 1-based in the heading, so rows 2, 7 and 12 live at
# 0-based positions 1, 6 and 11.
sample_positions = [1, 6, 11]
print("\nRows 2, 7, 12:")
print(wine_data[sample_positions])

# Alcohol figures are read from column index 10.
alcohol_col = wine_data[:, 10]

print("\nAlcohol Analysis:")
exceeds_twenty = (alcohol_col > 20).any()
print("Any wine with alcohol > 20%:", exceeds_twenty)
# nanmean ignores any entries that failed to parse and became NaN.
mean_alcohol = np.nanmean(alcohol_col)
print("Average alcohol content:", mean_alcohol)

# Summary statistics for pH, read from column index 8. All reducers are the
# NaN-ignoring variants.
ph_col = wine_data[:, 8]

print("\npH Statistics:")
pH_stats = {
    'min': np.nanmin(ph_col),
    'max': np.nanmax(ph_col),
}
# Quartiles are inserted between max and mean so the report order is
# min, max, 25th, 50th, 75th, mean.
quartile_labels = (
    (25, '25th percentile'),
    (50, '50th percentile (median)'),
    (75, '75th percentile'),
)
for quantile, label in quartile_labels:
    pH_stats[label] = np.nanpercentile(ph_col, quantile)
pH_stats['mean'] = np.nanmean(ph_col)

for label, stat in pH_stats.items():
    print(f"  {label}: {stat:.3f}")

# Sulphates are read from column index 9, quality scores from column index 11.
sulphate_col = wine_data[:, 9]
quality_col = wine_data[:, 11]

print("\nSulphate Analysis:")
sulphate_20th = np.nanpercentile(sulphate_col, 20)
print(f"20th percentile sulphate level: {sulphate_20th:.3f}")

# "Low sulphate" means at or below the 20th-percentile threshold (inclusive).
low_sulphate_mask = sulphate_col <= sulphate_20th
avg_quality_low_sulphate = np.nanmean(quality_col[low_sulphate_mask])
print(f"Average quality of wines with low sulphate: {avg_quality_low_sulphate:.3f}")

# Compare the wines at the two extremes of the quality score (column index 11).
quality_col = wine_data[:, 11]

print("\nQuality Comparison:")
best_quality = np.nanmax(quality_col)
worst_quality = np.nanmin(quality_col)
print(f"Best quality score: {best_quality}")
print(f"Worst quality score: {worst_quality}")

# Boolean row selectors for wines sitting exactly at each extreme score.
best_wines_mask = quality_col == best_quality
worst_wines_mask = quality_col == worst_quality

# Column-wise means (axis=0) over the selected rows; the result has one entry
# per column of wine_data, including the quality column itself.
best_mean = np.nanmean(wine_data[best_wines_mask], axis=0)
worst_mean = np.nanmean(wine_data[worst_wines_mask], axis=0)

# Summing a boolean mask counts its True entries.
best_count = best_wines_mask.sum()
worst_count = worst_wines_mask.sum()
print(f"\nNumber of best quality wines: {best_count}")
print(f"Number of worst quality wines: {worst_count}")

print("\nMean characteristics of best quality wines:")
print(best_mean)
print("\nMean characteristics of worst quality wines:")
print(worst_mean)

Data size: 76752 bytes
Data shape: (1599, 12)

Rows 2, 7, 12:
[[7.800e+00 8.800e-01 0.000e+00 2.600e+00 9.800e-02 2.500e+01 6.700e+01
  9.968e-01 3.200e+00 6.800e-01 9.800e+00 5.000e+00]
 [7.900e+00 6.000e-01 6.000e-02 1.600e+00 6.900e-02 1.500e+01 5.900e+01
  9.964e-01 3.300e+00 4.600e-01 9.400e+00 5.000e+00]
 [7.500e+00 5.000e-01 3.600e-01 6.100e+00 7.100e-02 1.700e+01 1.020e+02
  9.978e-01 3.350e+00 8.000e-01 1.050e+01 5.000e+00]]

Alcohol Analysis:
Any wine with alcohol > 20%: False
Average alcohol content: 10.422984

pH Statistics:
  min: 2.740
  max: 4.010
  25th percentile: 3.210
  50th percentile (median): 3.310
  75th percentile: 3.400
  mean: 3.311

Sulphate Analysis:
20th percentile sulphate level: 0.540
Average quality of wines with low sulphate: 5.216

Quality Comparison:
Best quality score: 8.0
Worst quality score: 3.0

Number of best quality wines: 18
Number of worst quality wines: 10

Mean characteristics of best quality wines:
[ 8.566666    0.4233333   0.39111114  2.57