In [5]:
import pandas as pd
import numpy as np

df = pd.read_excel("winequality-red.xlsx")
print("\nDataset Head \n")
print(df.head())
print("\nDataset describe \n")
print(df.describe())
print(" \n Dataset Info \n")
print(df.info())
print(" \n Dataset Columns \n")
print(df.columns)

# Display the number of missing values before replacement
print("\n\n Missing values before imputation:\n")
print(df.isnull().sum())

df.fillna(df.mean(numeric_only=True), inplace=True)


Dataset Head 

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2   

In [6]:
#Extract the following columns as vectors: alcohol,citric acid

alcohol_vector = df["alcohol"].values
citric_acid_vector = df['citric acid'].values

print("\n'alcohol' column extracted as a vector:")
print(alcohol_vector)

print("\n'citric acid' column extracted as a vector:")
print(citric_acid_vector)


'alcohol' column extracted as a vector:
[ 9.4  9.8  9.8 ... 11.  10.2 11. ]

'citric acid' column extracted as a vector:
[0.   0.   0.04 ... 0.13 0.12 0.47]


In [12]:
'''
Select two features (e.g., alcohol and density) from the dataset and calculate 
the covariance matrix using np.cov(X.T), where X is the feature matrix consisting 
of the selected columns.

'''

# feature_matrix = np.array([[df["alcohol"].values] [df["density"].values]], dtype=float)
# Select the features 'alcohol' and 'density'

features = ['alcohol', 'density']
X = df[features].values
# Calculate the covariance matrix (note:
# np.cov expects rows as variables by default, so we transpose X)
cov_matrix = np.cov(X.T)


# 5. Print the results
print("\nSelected Features: Alcohol and Density")
print("\nCovariance Matrix:\n", cov_matrix)


# For interpretation:
print("\nInterpretation:")
print(f"Variance of Alcohol: {cov_matrix[0, 0]:.6f}")
print(f"Covariance (Alcohol, Density): {cov_matrix[0, 1]:.6f}")
print(f"Covariance (Density, Alcohol): {cov_matrix[1, 0]:.6f}")
print(f"Variance of Density: {cov_matrix[1, 1]:.6f}")


Selected Features: Alcohol and Density

Covariance Matrix:
 [[ 1.13564740e+00 -9.97951790e-04]
 [-9.97951790e-04  3.56202945e-06]]
-0.0009979517895258383

Interpretation:
Variance of Alcohol: 1.135647
Covariance (Alcohol, Density): -0.000998
Covariance (Density, Alcohol): -0.000998
Variance of Density: 0.000004


In [13]:
# Perform eigen decomposition
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)

# Sort eigenvalues in descending order and get corresponding eigenvectors
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices]

print("\nEigenvalues:")
print(eigenvalues)
print("\nEigenvectors:")
print(eigenvectors)


Eigenvalues:
[1.13564827e+00 2.68507580e-06]

Eigenvectors:
[[ 9.99999614e-01  8.78753184e-04]
 [-8.78753184e-04  9.99999614e-01]]


In [14]:
# Top 2 (in this case, all) eigenvalues and eigenvectors
top_eigenvalues = eigenvalues[:2]
top_eigenvectors = eigenvectors[:, :2]

quality_counts = df['quality'].value_counts()
most_common_quality = quality_counts.idxmax()

print("The most common wine quality is:", most_common_quality)

The most common wine quality is: 5
