In [96]:
import pandas as pd
import numpy as np

## Load Data

In [97]:
# Read the grade sheet from a path relative to the working directory.
# As the displayed frame shows, rows 0 and 1 are not student records
# (a "Max:" row of per-column maxima and an empty row).
df = pd.read_csv('data/HW2-Fall2025-Data.csv')
df

Unnamed: 0,Student ID,Test1,Homeworks,Quizzes,Term Paper,Presentation,Fianl-Exam
0,,Max:50,Max:400,Max:140,,,Max:100
1,,,,,,,
2,S1,41,244,136,76.0,50.0,90
3,S2,50,400,140,98.0,50.0,100
4,S3,43,327,88,88.0,50.0,79
5,S4,43,336,123,18.0,50.0,35
6,S5,41,232,100,44.0,50.0,79
7,S6,46,400,135,50.0,50.0,46
8,S7,47,310,125,100.0,50.0,87
9,S8,41,400,102,22.0,50.0,51


In [98]:
# Rows labelled 0 and 1 hold the "Max:" row and a blank row, not students.
# Remove them, then relabel the remaining rows 1..N.
df = df.drop(index=[0, 1]).reset_index(drop=True)
df.index = pd.RangeIndex(start=1, stop=len(df) + 1)
df

Unnamed: 0,Student ID,Test1,Homeworks,Quizzes,Term Paper,Presentation,Fianl-Exam
1,S1,41,244,136,76.0,50.0,90
2,S2,50,400,140,98.0,50.0,100
3,S3,43,327,88,88.0,50.0,79
4,S4,43,336,123,18.0,50.0,35
5,S5,41,232,100,44.0,50.0,79
6,S6,46,400,135,50.0,50.0,46
7,S7,47,310,125,100.0,50.0,87
8,S8,41,400,102,22.0,50.0,51
9,S9,42,128,84,49.0,40.0,62
10,S10,50,384,134,66.0,50.0,75


In [99]:
# Student ID is redundant with the row label: student "S<k>" sits at index k.
df = df.drop('Student ID', axis='columns')
df.head()

Unnamed: 0,Test1,Homeworks,Quizzes,Term Paper,Presentation,Fianl-Exam
1,41,244,136,76.0,50.0,90
2,50,400,140,98.0,50.0,100
3,43,327,88,88.0,50.0,79
4,43,336,123,18.0,50.0,35
5,41,232,100,44.0,50.0,79


In [100]:
# Cast every remaining column to float64 in one vectorized call so that all
# score columns share a single numeric dtype (replaces a per-column loop).
df = df.astype(float)
df.dtypes

Test1           float64
Homeworks       float64
Quizzes         float64
Term Paper      float64
Presentation    float64
Fianl-Exam      float64
dtype: object

In [101]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the scores.
df.describe()

Unnamed: 0,Test1,Homeworks,Quizzes,Term Paper,Presentation,Fianl-Exam
count,30.0,30.0,30.0,30.0,30.0,30.0
mean,42.4,291.466667,114.833333,69.333333,48.666667,68.166667
std,6.360438,81.549111,20.93847,26.108967,3.457459,22.891172
min,27.0,128.0,74.0,18.0,40.0,24.0
25%,41.0,224.5,100.0,52.75,50.0,50.25
50%,42.5,293.5,122.5,73.0,50.0,72.0
75%,46.75,369.5,131.75,89.5,50.0,86.0
max,50.0,400.0,140.0,100.0,50.0,100.0


## Covariance Matrix

In [102]:
# Covariance matrix of the score columns as they currently stand in `df`.
# NOTE(review): no normalization step is applied to `df` in the visible cells before
# this point, so these are covariances of the raw scores (columns with larger ranges
# dominate) — confirm whether scaling by each column's maximum was intended.
cov_matrix = df.cov()
cov_matrix

Unnamed: 0,Test1,Homeworks,Quizzes,Term Paper,Presentation,Fianl-Exam
Test1,40.455172,273.97931,70.448276,30.724138,12.965517,45.172414
Homeworks,273.97931,6650.257471,739.666667,-26.402299,138.91954,230.195402
Quizzes,70.448276,739.666667,438.41954,113.091954,42.528736,136.373563
Term Paper,30.724138,-26.402299,113.091954,681.678161,8.390805,362.011494
Presentation,12.965517,138.91954,42.528736,8.390805,11.954023,24.712644
Fianl-Exam,45.172414,230.195402,136.373563,362.011494,24.712644,524.005747


## Eigenvectors and eigenvalues

In [103]:
# Eigen-decomposition of the covariance matrix. `eigh` is used because the matrix
# is symmetric; column k of `eigenvectors` pairs with `eigenvalues[k]`.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# Transposing turns each eigenvector column into a row so it zips with its eigenvalue.
for value, vector in zip(eigenvalues, eigenvectors.T):
    print(f"Eigenvalue:  {value} , Eigenvector:  {vector}")

Eigenvalue:  5.785642558787699 , Eigenvector:  [-0.20195188 -0.00547211 -0.05047443  0.01981154  0.97750136 -0.02713705]
Eigenvalue:  23.923383727355773 , Eigenvector:  [ 9.72527189e-01 -2.96323537e-02 -1.14986447e-01 -7.80318932e-04
  1.93397433e-01 -5.18433300e-02]
Eigenvalue:  224.69166847056266 , Eigenvector:  [ 0.01725959 -0.01449754 -0.15325674 -0.59119225  0.02951157  0.79096466]
Eigenvalue:  319.49230020323716 , Eigenvector:  [-0.0919377   0.11595894 -0.94519996  0.28006685 -0.07191172  0.03300432]
Eigenvalue:  1011.2332951478061 , Eigenvector:  [ 0.0538199  -0.0548015   0.2314155   0.75608263  0.02428572  0.60687424]
Eigenvalue:  6761.64382483478 , Eigenvector:  [-4.19438636e-02 -9.91177139e-01 -1.17416892e-01 -4.76869592e-04
 -2.13658658e-02 -3.95618216e-02]


## Projections

In [104]:
# Eigenvectors corresponding to the two largest eigenvalues. The eigenvalues come
# back in ascending order, so the last two columns are the leading directions.
W1, W2 = eigenvectors[:, -1], eigenvectors[:, -2]
W12 = np.column_stack([W1, W2])  # one eigenvector per column, largest eigenvalue first
W12


array([[-4.19438636e-02,  5.38199031e-02],
       [-9.91177139e-01, -5.48014984e-02],
       [-1.17416892e-01,  2.31415503e-01],
       [-4.76869592e-04,  7.56082631e-01],
       [-2.13658658e-02,  2.42857244e-02],
       [-3.95618216e-02,  6.06874236e-01]])

In [105]:
# Project each student's scores onto the two leading eigenvectors held in W12.
# NOTE(review): `df` is projected as-is, without subtracting the column means, so the
# scores are offset by a constant relative to mean-centered PCA scores — confirm
# whether centered projections are wanted.
pca12 = df @ W12  # DataFrame @ ndarray already yields a DataFrame; no re-wrap needed
pca12

Unnamed: 0,0,1
1,-264.200717,133.602806
2,-420.077622,149.166374
3,-340.486839,120.451352
4,-351.742923,48.429431
5,-247.629143,95.059205
6,-417.163534,78.730842
7,-328.471245,144.088617
8,-413.263514,52.689088
9,-141.826169,90.330427
10,-402.509981,109.288205
