In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from scipy.io import arff

In [None]:
# Load the rice dataset from its ARFF file into a DataFrame.
# The `arff` submodule is imported explicitly (`from scipy.io import arff`)
# because a bare `import scipy` does not guarantee that `scipy.io.arff` is
# reachable as an attribute on every SciPy version.
# The metadata gets a real name instead of `_`, which IPython reserves for the
# most recent cell output.
data, arff_meta = arff.loadarff('data/rice.arff')
df = pd.DataFrame(data)
# NOTE(review): loadarff usually yields nominal attributes (e.g. 'Class') as
# bytes rather than str -- confirm before comparing against string labels.
print("Shape:", df.shape)
df.head()

In [None]:
# Summary statistics for the columns of `df` (pandas' default describe() output).
df.describe()

In [None]:
# Print pandas' concise summary of `df` (columns, dtypes, non-null counts).
df.info()

In [None]:
# Number of missing entries in each column (isna is the same check as isnull).
df.isna().sum()

In [None]:
# Bar chart of the number of samples per class, to check class balance.
# `data` is passed by keyword, like the scatterplot call later in the notebook,
# so the call does not depend on where seaborn places `data` in the signature.
sns.countplot(data=df, x='Class', hue='Class')
plt.title("Class Distribution")
plt.show()

In [None]:
# Per-column mean over every column except the last one.
feature_frame = df.iloc[:, :-1]
print("Mean of Features")
feature_frame.mean()

In [None]:
# Covariance matrix of the feature columns (every column except the last).
# `cov` is kept at full precision because a later cell feeds it into the
# eigen-decomposition; rounding it first would distort (or zero out) the
# entries of small-variance features. Only the displayed copy is rounded.
cov = df.iloc[:, :-1].cov()
print("Covariance Matrix")
cov.round(3)

In [None]:
# A covariance matrix is symmetric, so use eigh: it is the routine intended for
# symmetric input and returns real eigenvalues with orthonormal eigenvectors
# (column i of `eigen_vecs` pairs with `eigen_vals[i]`).
# np.asarray hands NumPy a plain float array whether `cov` is a DataFrame or
# already an ndarray.
eigen_vals, eigen_vecs = np.linalg.eigh(np.asarray(cov, dtype=float))

# Eigenvalue -> eigenvector lookup, stored at full precision. Later cells pick
# the top eigenvalues from the keys and project with the vectors, so rounding
# here could merge distinct eigenvalues into one key and would leave the
# projection axes no longer unit-length / orthogonal.
mapping = {eigen_vals[i]: eigen_vecs[:, i] for i in range(len(eigen_vals))}

print("Eigen Value-Vector Pairs:")
# Round only what is shown, largest eigenvalue first; a list of pairs is used
# so that eigenvalues which round to the same number are all still displayed.
[(round(val, 2), mapping[val].round(2)) for val in sorted(mapping, reverse=True)]

In [None]:
# Number of principal components to keep.
n = 2
# Order the eigenvalues (the keys of `mapping`) from largest to smallest,
# then keep the first `n` of them.
sorted_eigen_vals = list(mapping.keys())
sorted_eigen_vals.sort(reverse=True)
top_eigen_vals = sorted_eigen_vals[0:n]
top_eigen_vals

In [None]:
# Collect the eigenvectors belonging to the selected eigenvalues (one per row),
# then transpose so that each selected eigenvector becomes a column.
top_eigen_vecs = [mapping[eigen_val] for eigen_val in top_eigen_vals]
projection_matrix = np.transpose(np.array(top_eigen_vecs))
projection_matrix

In [None]:
# Raw feature matrix: every column except the last (assumed to be the 'Class'
# label that is re-attached below).
X = df.iloc[:, :-1].values
# PCA scores are defined on mean-centred data: the covariance matrix already
# describes deviations from the mean, so the projection must use the same
# centring. Without it every score is shifted by a constant offset.
X_centered = X - X.mean(axis=0)
reduced_data = X_centered.dot(projection_matrix)
# One column per retained component (PC1..PCn), plus the class label for plotting.
reduced_df = pd.DataFrame(reduced_data, columns=[f'PC{i+1}' for i in range(n)])
reduced_df['Class'] = df['Class'].values
reduced_df.head()

In [None]:
# Scatter of the samples in the plane of the first two principal components,
# coloured by class. Title spelling corrected: the term is "Principal".
sns.scatterplot(data=reduced_df, x='PC1', y='PC2', hue='Class')
plt.title('Principal Component Analysis')
plt.show()