## Principal Component Analysis (PCA) for Dimensionality Reduction

### Imports and Setup

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

### Load 5-Fold Cross-Validation Non-Normalized Data

In [2]:
# Load the non-normalized train/test CSVs for one fold of the 5-fold cross-validation split
def load_kfold_data(fold, data_dir='../dataset_generators/datasets'):
    """Load the non-normalized train and test CSVs for one fold.

    Args:
        fold: Fold identifier used as the file-name prefix
            (``{fold}_train_non-normalized.csv`` / ``{fold}_test_non-normalized.csv``).
        data_dir: Directory containing the fold CSVs. Defaults to the
            location the notebook has always read from, so existing callers
            that pass only ``fold`` behave exactly as before.

    Returns:
        A ``(train, test)`` tuple of DataFrames as read by ``pd.read_csv``.
    """
    train = pd.read_csv(f'{data_dir}/{fold}_train_non-normalized.csv')
    test = pd.read_csv(f'{data_dir}/{fold}_test_non-normalized.csv')
    return train, test

### Extract Each Fold and Apply PCA

In [6]:
# Generator: for each fold, load the data, standardize it, apply PCA, and yield the transformed datasets
def extract_each_fold_with_pca(n_components=0.95):
    """Yield standardized, PCA-projected train/test data for folds 0 through 4.

    Args:
        n_components: Passed straight to ``PCA(n_components=...)``.

    Yields:
        One 5-tuple per fold:
        ``(X_train_pca, y_train, X_test_pca, y_test, pca)`` where the ``X``
        items are the PCA-transformed feature arrays, the ``y`` items are the
        ``fantasy_points`` columns of the loaded frames, and ``pca`` is the
        PCA object fitted on that fold's training features.
    """
    def split_features_target(frame):
        # Features are the columns from position 12 onward, minus the target column.
        features = frame.iloc[:, 12:].drop(columns=['fantasy_points'])
        return features, frame['fantasy_points']

    for fold in range(5):
        train_df, test_df = load_kfold_data(fold)
        train_features, train_target = split_features_target(train_df)
        test_features, test_target = split_features_target(test_df)

        # The scaler is fitted on the training features only and then reused
        # for the test features, so no test statistics influence the scaling.
        standardizer = StandardScaler()
        train_scaled = standardizer.fit_transform(train_features)
        test_scaled = standardizer.transform(test_features)

        # Same rule for PCA: fit on train, project test with the fitted model.
        reducer = PCA(n_components=n_components)
        train_projected = reducer.fit_transform(train_scaled)
        test_projected = reducer.transform(test_scaled)

        yield train_projected, train_target, test_projected, test_target, reducer

In [7]:
import pickle

# Run the per-fold pipeline once and keep every fold's result tuple
pca_list = list(extract_each_fold_with_pca())

# Store each fold's fitted PCA object (the last item of its tuple) in its own
# pickle file for later use
for i, (*_, pca) in enumerate(pca_list):
    with open(f'../dataset_generators/datasets/{i}_pca.pkl', 'wb') as f:
        pickle.dump(pca, f)

In [8]:
# Collect the (X_train_pca, y_train, X_test_pca, y_test, pca) tuple of every fold
pca_data = [*extract_each_fold_with_pca()]
pca_data[3]  # Display the fourth fold's tuple (index 3) as a quick check

(array([[-0.76674383, -2.73706166, -0.81191576, ..., -0.02519278,
         -0.41911951, -0.05732718],
        [-1.01128538, -2.77333913, -0.86428403, ..., -0.04242404,
          0.52887961, -0.04809846],
        [ 3.56780352, -3.87825119,  0.36672879, ..., -0.34553452,
         -0.00616753,  0.26097385],
        ...,
        [-0.29413727,  1.39709778, -2.69357012, ..., -0.04855384,
          1.05162909, -0.05760293],
        [-0.55400144,  0.71214974, -2.52019101, ..., -0.57712991,
         -0.35096398,  0.06366663],
        [-0.76263757,  0.17462775, -2.37871372, ..., -0.34804834,
         -0.79895919,  0.04211365]]),
 0        -0.20
 1         7.04
 2         9.40
 3        15.38
 4        16.22
          ...  
 48770     2.60
 48771     8.70
 48772     1.80
 48773     2.00
 48774     1.90
 Name: fantasy_points, Length: 48775, dtype: float64,
 array([[ 7.07586823e+00, -2.26950811e+00,  2.07720359e+00, ...,
         -1.20596464e-01,  4.45936596e-01,  1.59148143e-01],
        [ 1.46287

In [9]:
def generate_pca_datasets(pca_data):
    """Write each fold's PCA features, leading columns and target to CSV.

    Args:
        pca_data: Iterable of 5-item tuples
            ``(X_train_pca, y_train, X_test_pca, y_test, pca)``. The position
            of each tuple is used as the fold number; the last item is ignored.

    For every fold the original non-normalized files are reloaded to obtain
    their first 12 columns. Those columns come first in the output, followed
    by ``PC1..PCk`` and finally ``fantasy_points``. The results are written to
    ``{fold}_train_pca.csv`` and ``{fold}_test_pca.csv`` without the index.
    """
    for fold, (train_scores, train_target, test_scores, test_target, _) in enumerate(pca_data):
        # Reload the source frames to recover the first 12 columns
        source_train, source_test = load_kfold_data(fold)

        # One column name per principal component, numbered from 1
        n_pcs = train_scores.shape[1]
        component_names = [f'PC{i+1}' for i in range(n_pcs)]

        def assemble(source, scores, target):
            # PCA features take the target's index so the column-wise concat
            # lines up with the rows of the source frame.
            features = pd.DataFrame(scores, columns=component_names, index=target.index)
            combined = pd.concat([source.iloc[:, :12], features], axis=1)
            combined['fantasy_points'] = target
            return combined

        # Build both frames before writing anything to disk
        train_out = assemble(source_train, train_scores, train_target)
        test_out = assemble(source_test, test_scores, test_target)

        train_out.to_csv(f'../dataset_generators/datasets/{fold}_train_pca.csv', index=False)
        test_out.to_csv(f'../dataset_generators/datasets/{fold}_test_pca.csv', index=False)
        
# Write the train/test PCA CSV files for every fold held in pca_data
generate_pca_datasets(pca_data)