#### PCA before modeling

In [1]:
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np

# Read the transformed dataset, then drop the 'ID' column so it is not
# passed to PCA along with the feature columns.
data_set = pd.read_csv(filepath_or_buffer='solar_data_transformed14.csv')
data = data_set.drop(labels='ID', axis='columns')

In [2]:
# Step 1: Configure PCA (the model is fitted in a later cell)
n_components = 100  # Number of principal components to keep

# A fixed seed makes the decomposition repeatable: scikit-learn's default
# svd_solver='auto' can select the randomized solver, whose output would
# otherwise vary from run to run.
RANDOM_STATE = 42

# NOTE(review): PCA is scale-sensitive; this assumes the input features were
# already scaled upstream - confirm.
pca = PCA(n_components=n_components, random_state=RANDOM_STATE)

In [3]:
# Step 2: Fit PCA on the feature table
pca.fit(data)

# Step 3: Tabulate the explained variance ratio of every fitted component,
# labelling the components PC1, PC2, ...
variance_ratios = pca.explained_variance_ratio_
pc_labels = [f'PC{k}' for k in range(1, len(variance_ratios) + 1)]
explained_variance_df = pd.DataFrame({
    'Principal Component': pc_labels,
    'Explained Variance Ratio': variance_ratios,
})

# Print one "PCk: ratio" line per component (ratio rounded to 4 decimals)
print("Principal Components and Explained Variance Ratios:")
for pc_label, ratio in zip(pc_labels, variance_ratios):
    print(f"{pc_label}: {ratio:.4f}")

Principal Components and Explained Variance Ratios:
PC1: 0.1954
PC2: 0.1224
PC3: 0.0824
PC4: 0.0686
PC5: 0.0411
PC6: 0.0385
PC7: 0.0310
PC8: 0.0253
PC9: 0.0227
PC10: 0.0165
PC11: 0.0161
PC12: 0.0137
PC13: 0.0130
PC14: 0.0114
PC15: 0.0100
PC16: 0.0085
PC17: 0.0080
PC18: 0.0078
PC19: 0.0068
PC20: 0.0062
PC21: 0.0060
PC22: 0.0052
PC23: 0.0051
PC24: 0.0048
PC25: 0.0047
PC26: 0.0043
PC27: 0.0041
PC28: 0.0038
PC29: 0.0038
PC30: 0.0035
PC31: 0.0034
PC32: 0.0033
PC33: 0.0032
PC34: 0.0031
PC35: 0.0031
PC36: 0.0029
PC37: 0.0026
PC38: 0.0026
PC39: 0.0025
PC40: 0.0025
PC41: 0.0024
PC42: 0.0024
PC43: 0.0023
PC44: 0.0023
PC45: 0.0022
PC46: 0.0022
PC47: 0.0021
PC48: 0.0021
PC49: 0.0021
PC50: 0.0020
PC51: 0.0019
PC52: 0.0019
PC53: 0.0018
PC54: 0.0018
PC55: 0.0017
PC56: 0.0017
PC57: 0.0016
PC58: 0.0016
PC59: 0.0016
PC60: 0.0016
PC61: 0.0016
PC62: 0.0015
PC63: 0.0015
PC64: 0.0014
PC65: 0.0014
PC66: 0.0013
PC67: 0.0013
PC68: 0.0012
PC69: 0.0012
PC70: 0.0012
PC71: 0.0012
PC72: 0.0012
PC73: 0.0011
PC74: 0.

In [4]:
# Persist the explained-variance table (without the row index) and confirm.
explained_variance_df.to_csv(path_or_buf='explained_variance.csv', index=False)
print("Explained variance ratio saved to 'explained_variance.csv'")

Explained variance ratio saved to 'explained_variance.csv'


In [5]:
# Step 4: Take the component matrix of the fitted PCA
# (one row per principal component, one column per input feature).
loadings = pca.components_

# Step 5: Wrap the loadings in a DataFrame whose columns carry the feature names
loadings_df = pd.DataFrame(data=loadings, columns=data.columns)

In [6]:
# Preview the first five rows of the explained-variance table
explained_variance_df.head(n=5)

Unnamed: 0,Principal Component,Explained Variance Ratio
0,PC1,0.19539
1,PC2,0.122381
2,PC3,0.08241
3,PC4,0.068607
4,PC5,0.04107


In [7]:
# (rows, columns) of the explained-variance table; it has one row per fitted component
explained_variance_df.shape

(100, 2)

In [8]:
# Announce, persist (without the row index) and preview the loadings table.
print("\nPCA Loadings (how each feature contributes to each PC):")
loadings_df.to_csv(path_or_buf='loadings_per_PCA.csv', index=False)
loadings_df.head(n=5)


PCA Loadings (how each feature contributes to each PC):


Unnamed: 0,InstallType,HIFLD_100,HIFLD_200,HIFLD_CAISO,Area,Length,County_Alameda,County_Amador,County_Butte,County_Calaveras,...,DistSub_200_binned_very close,DistSub_200_binned_close,DistSub_200_binned_moderate,DistSub_200_binned_far,DistSub_200_binned_very far,DistSub_CAISO_binned_very close,DistSub_CAISO_binned_close,DistSub_CAISO_binned_moderate,DistSub_CAISO_binned_far,DistSub_CAISO_binned_very far
0,-0.365765,0.035375,0.033765,0.068946,0.550525,0.545683,-0.013567,0.000185,-0.00206,-2.1e-05,...,-0.002946,-0.090049,0.001236,0.053331,0.03767,0.002645,-0.039589,0.012935,0.023758,0.0
1,-0.018827,-0.633163,-0.651103,-0.3778,0.053951,0.055784,-0.004839,-9.5e-05,0.004406,0.000366,...,-0.010357,0.034737,0.002016,-0.006299,-0.019812,-0.00194,0.028366,-0.038161,0.011545,-0.0
2,0.454221,0.170508,0.024526,-0.233763,0.40571,0.418767,0.009745,-0.000398,-0.006359,-0.000618,...,0.019585,0.132073,-0.020513,-0.07199,-0.059433,0.013281,0.131165,-0.062013,-0.082338,-0.0
3,-0.163607,0.243885,0.252454,-0.87049,-0.089916,-0.089905,-0.015969,0.000489,0.004165,0.000353,...,0.001436,-0.072832,0.057172,0.028301,-0.014273,-0.01141,-0.00172,0.021291,-0.007723,0.0
4,0.27447,-0.533666,0.442267,-0.018696,0.041832,0.040992,0.004754,0.000165,-0.004156,0.000247,...,0.047603,-0.244276,0.12557,0.039096,0.031977,0.009934,-0.289425,0.218309,0.060485,0.0


In [9]:
# (rows, columns) of the loadings table: components x features
loadings_df.shape

(100, 1090)

In [10]:
# Magnitude of every loading, ignoring sign
feature_loadings = np.absolute(pca.components_)

# Collapse the component axis: one summed magnitude per feature
total_contribution = feature_loadings.sum(axis=0)

# Table of per-feature totals, indexed by feature name and ordered from the
# largest total to the smallest
contribution_df = pd.DataFrame(
    {'Total Contribution': total_contribution},
    index=data.columns,
).sort_values(by='Total Contribution', ascending=False)

# Show the ranking
print("Total Contribution of Features (Descending Order):")
print(contribution_df)

# Keep a copy on disk (feature names are written as the index column)
contribution_df.to_csv("total_feature_contribution.csv")

Total Contribution of Features (Descending Order):
                               Total Contribution
County_Tulare                            6.363268
DistSub_200_binned_very close            5.768622
Substation_200_McCall                    5.766229
DistSub_200_binned_very far              5.724048
County_Kings                             5.719402
...                                           ...
Substation_100_Bordertown                0.030725
Substation_200_Tiger Creek               0.028601
Substation_100_Tiger Creek               0.028601
DistSub_100_binned_very far              0.000000
DistSub_CAISO_binned_very far            0.000000

[1090 rows x 1 columns]


In [11]:
# X_pca is the original dataset projected onto the already-fitted components.
# transform() is used rather than fit_transform(): refitting here would repeat
# the decomposition and could leave these scores out of step with the loadings
# and explained-variance tables produced from the earlier fit.
X_pca = pca.transform(data)

# Wrap the scores in a DataFrame with one column per component (PC1, PC2, ...)
X_pca_df = pd.DataFrame(X_pca, columns=[f'PC{i+1}' for i in range(X_pca.shape[1])])

# Save to a CSV file
X_pca_df.to_csv('X_pca.csv', index=False)

In [12]:
# (For our reference) Rank features by their largest single-component
# variance contribution, defined for component i and feature j as
#     contribution[i, j] = loading[i, j]**2 * explained_variance_ratio[i]
TOP_N_FEATURES = 10  # number of features to report

explained_variance = pca.explained_variance_ratio_

# Compute every (component, feature) contribution in one broadcasted
# expression instead of nested Python loops; the ratio vector is reshaped to a
# column so it scales each component's row of squared loadings.
variance_contribution_matrix = (pca.components_ ** 2) * explained_variance[:, np.newaxis]

# Flat (feature, contribution) pairs in component-major order, kept under the
# original name in case other cells read it.
feature_variance_contributions = [
    (feature, contribution)
    for component_row in variance_contribution_matrix
    for feature, contribution in zip(data.columns, component_row)
]

# For each feature keep only its highest contribution across all components
unique_features = dict(zip(data.columns, variance_contribution_matrix.max(axis=0)))

# Sort by contribution, largest first, and keep the top N.
# sorted() is stable, so tied features stay in column order.
unique_features_sorted = sorted(
    unique_features.items(), key=lambda item: item[1], reverse=True
)[:TOP_N_FEATURES]

# Report the top N features with their contribution rounded to 4 decimals
print(f"Top {TOP_N_FEATURES} Unique Features with Variance:")
for feature, variance in unique_features_sorted:
    print(f"{feature} - variance: {variance:.4f}")

Top 10 Unique Features with Variance:
Area - variance: 0.0592
Length - variance: 0.0582
HIFLD_CAISO - variance: 0.0520
HIFLD_200 - variance: 0.0519
HIFLD_100 - variance: 0.0491
InstallType - variance: 0.0261
DistSub_CAISO_binned_close - variance: 0.0086
SolarTech_Outside - variance: 0.0079
SolarTech_Within - variance: 0.0079
Class_Ground - Rural - variance: 0.0076
