In [2]:
import numpy as np

# Load the data at NumPy's default float64 precision so that a full-precision
# baseline exists. Loading straight into float32 would make the comparison
# below a no-op: the array would be compared with a copy of itself.
data = np.genfromtxt('winequality-red.csv', delimiter=';', skip_header=1)

# Memory footprint of the full-precision baseline, in bytes
original_memory_size = data.nbytes

# Optimize types: down-cast to float32, then measure the precision lost as
# the sum of absolute element-wise differences against the baseline
optimized_data = data.astype(np.float32)
diff_sum = np.sum(np.abs(data - optimized_data))

# Check that the optimized array occupies 76800 bytes and that the total
# rounding error is below 1e-3. The threshold is written as 1e-3 because
# `1.10**-3` parses as 1.1 ** -3 (about 0.75), which is far looser.
# NOTE(review): the output captured with this cell has no success line, so
# the condition was not met in that run -- confirm the expected 76800-byte
# size against the actual number of rows loaded.
if optimized_data.nbytes == 76800 and diff_sum < 1e-3:
    print("Memory optimization successful.")

# Show rows 2, 7 and 12 (zero-based indices 1, 6, 11); indexing with a list
# keeps the selection two-dimensional
print(optimized_data[[1, 6, 11]])

# The second-to-last column is the one this script reads as alcohol percentage
alcohol_column = optimized_data[:, -2]

# Is there at least one wine above 20% alcohol?
alcohol_gt_20 = (alcohol_column > 20).any()
print(f"Any wine with alcohol > 20%: {alcohol_gt_20}")

# Mean alcohol percentage; nanmean ignores NaN entries
average_alcohol = np.nanmean(alcohol_column)
print(f"Average alcohol percentage: {average_alcohol:.2f}")

# Summary statistics for pH, which this script reads from column index 8
ph_data = optimized_data[:, 8]

# Quartiles in one call; the median is additionally computed on its own and
# that separate value is the one reported below
percentiles = np.percentile(ph_data, [25, 50, 75])
median_ph = np.median(ph_data)

# Range of the observed values
min_ph = ph_data.min()
max_ph = ph_data.max()

# Report from smallest to largest, two decimals each
print(f"Minimum pH: {min_ph:.2f}")
print(f"Maximum pH: {max_ph:.2f}")
print(f"25th Percentile pH: {percentiles[0]:.2f}")
print(f"50th Percentile (Median) pH: {median_ph:.2f}")
print(f"75th Percentile pH: {percentiles[2]:.2f}")

# Average quality (column index 11) of the wines whose sulfates value
# (third-to-last column) lies strictly below the 20th percentile
sulfates_data = optimized_data[:, -3]
threshold = np.percentile(sulfates_data, 20)

# Boolean row mask: True where the wine falls under the cut-off
low_sulfates_mask = sulfates_data < threshold
average_quality_low_sulfates = optimized_data[low_sulfates_mask, 11].mean()

print(f"Average quality of wines with 20% least sulfates: {average_quality_low_sulfates:.2f}")

# Per-column means of every variable except quality (the last column),
# computed separately for the highest-rated and the lowest-rated wines
quality_column = optimized_data[:, -1]
best_quality = quality_column.max()
worst_quality = quality_column.min()

# Row masks selecting the wines at each extreme of the quality scale
is_best = quality_column == best_quality
is_worst = quality_column == worst_quality

# `:-1` drops the quality column itself; axis=0 averages down the rows
mean_best_quality = optimized_data[is_best, :-1].mean(axis=0)
mean_worst_quality = optimized_data[is_worst, :-1].mean(axis=0)

print("Mean variables for wines with the best quality:")
print(mean_best_quality)

print("Mean variables for wines with the worst quality:")
print(mean_worst_quality)

[[7.800e+00 8.800e-01 0.000e+00 2.600e+00 9.800e-02 2.500e+01 6.700e+01
  9.968e-01 3.200e+00 6.800e-01 9.800e+00 5.000e+00]
 [7.900e+00 6.000e-01 6.000e-02 1.600e+00 6.900e-02 1.500e+01 5.900e+01
  9.964e-01 3.300e+00 4.600e-01 9.400e+00 5.000e+00]
 [7.500e+00 5.000e-01 3.600e-01 6.100e+00 7.100e-02 1.700e+01 1.020e+02
  9.978e-01 3.350e+00 8.000e-01 1.050e+01 5.000e+00]]
Any wine with alcohol > 20%: False
Average alcohol percentage: 10.42
Minimum pH: 2.74
Maximum pH: 4.01
25th Percentile pH: 3.21
50th Percentile (Median) pH: 3.31
75th Percentile pH: 3.40
Average quality of wines with 20% least sulfates: 5.19
Mean variables for wines with the best quality:
[ 8.566666    0.4233333   0.39111114  2.5777776   0.06844445 13.277778
 33.444443    0.99521226  3.2672222   0.76777774 12.094444  ]
Mean variables for wines with the worst quality:
[ 8.359999    0.8845      0.17099999  2.6350002   0.12249999 11.
 24.9         0.997464    3.398       0.57000005  9.955     ]
