#### PCA before modeling

In [1]:
import pandas as pd
from sklearn.decomposition import PCA

# Load dataset
data_set = pd.read_csv('solar_data_transformed14.csv')
data = data_set.drop(columns=['ID'])

In [2]:
import pandas as pd
from sklearn.decomposition import PCA

# Step 1: Perform PCA
pca = PCA()
pca.fit(data)

# Step 2: Get explained variance ratio and loadings
explained_variance = pca.explained_variance_ratio_
loadings = pca.components_

# Step 3: Identify top features based on loadings and variance
top_features = []

# Loop through each component and sort the features by their loading values
for i in range(len(explained_variance)):
    feature_loading = loadings[i]
    sorted_loading_indices = feature_loading.argsort()[::-1]
    
    # Collect feature names for the current component
    feature_names = data.columns[sorted_loading_indices]
    
    # Append all features with their corresponding explained variance
    for feature in feature_names:
        top_features.append((feature, explained_variance[i]))

# Step 4: Retain only the unique features with the highest variance
unique_features = {}
for feature, variance in top_features:
    if feature not in unique_features or variance > unique_features[feature]:
        unique_features[feature] = variance

# Step 5: Sort the unique features by variance in descending order and limit to top 100
unique_features_sorted = sorted(unique_features.items(), key=lambda x: x[1], reverse=True)[:100]

# Step 6: Output the top 100 unique features with variance
print("Top 80 Unique Features with Variance:")
for feature, variance in unique_features_sorted:
    print(f"{feature} - variance: {variance:.4f}")

# Optional: Save the result to a CSV file
#output_df = pd.DataFrame(unique_features_sorted, columns=["Feature", "Variance"])
#output_df.to_csv("top_80_features_with_variance.csv", index=False)


Top 80 Unique Features with Variance:
Area - variance: 0.1954
Length - variance: 0.1954
SolarTech_Outside - variance: 0.1954
Class_Ground - Rural - variance: 0.1954
UrbanRural_Rural - variance: 0.1954
HIFLD_CAISO - variance: 0.1954
DistSub_200_binned_far - variance: 0.1954
County_Kern - variance: 0.1954
DistSub_100_binned_moderate - variance: 0.1954
DistSub_100_binned_far - variance: 0.1954
Substation_100_Unknown - variance: 0.1954
DistSub_200_binned_very far - variance: 0.1954
HIFLD_100 - variance: 0.1954
Substation_200_Whirlwind - variance: 0.1954
HIFLD_200 - variance: 0.1954
Substation_200_Unknown - variance: 0.1954
Substation_100_Whirlwind - variance: 0.1954
Class_Ground - Urban - variance: 0.1954
Substation_CAISO_Whirlwind - variance: 0.1954
DistSub_CAISO_binned_far - variance: 0.1954
County_Tulare - variance: 0.1954
Substation_200_Arco - variance: 0.1954
Substation_CAISO_Antelope - variance: 0.1954
DistSub_CAISO_binned_moderate - variance: 0.1954
County_Riverside - variance: 0.19

In [3]:
# Extract top feature names from the sorted list of unique features
top_feature_names = [feature for feature, _ in unique_features_sorted]

# Subset the original dataset using the top 100 features
top_features_dataset = data[top_feature_names]

In [4]:
top_features_dataset.head()

Unnamed: 0,Area,Length,SolarTech_Outside,Class_Ground - Rural,UrbanRural_Rural,HIFLD_CAISO,DistSub_200_binned_far,County_Kern,DistSub_100_binned_moderate,DistSub_100_binned_far,...,Substation_100_Goose Lake,Substation_CAISO_Mc Mullin,Substation_100_Waukena,Substation_100_Cressey,Substation_CAISO_Randsburg,Substation_CAISO_Shafter,Substation_200_San Luis 3,Substation_100_Schindler,Substation_CAISO_Tranquility,Substation_100_Tranquility
0,-0.534259,-0.455304,0,0,0,1.589278,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-0.469035,-0.451893,0,0,0,1.589278,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-0.64959,-0.686802,0,0,0,1.589278,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,-0.606433,-0.520224,0,0,0,1.589278,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-0.734261,-0.707137,0,0,0,1.589278,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
top_features_dataset.shape

(5397, 100)

In [6]:
top_features_dataset.to_csv("top_features.csv", index=False)