# Feature Reduction

In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [4]:
directory = 'theFeaturesByBreed'

# Collect the per-breed CSV files. os.listdir() returns names in arbitrary
# order, so sort them to make the row order of the combined DataFrame (and of
# anything written from it) reproducible across machines and runs.
csv_names = sorted(name for name in os.listdir(directory) if name.endswith(".csv"))
if not csv_names:
    # pd.concat([]) would raise a ValueError anyway; raise the same exception
    # type with a message that points at the actual cause.
    raise ValueError(f"No .csv files found in directory {directory!r}")

# Read each CSV file into its own DataFrame
dfs = [pd.read_csv(os.path.join(directory, name)) for name in csv_names]

# Concatenate all DataFrames into one, renumbering the rows 0..n-1
df = pd.concat(dfs, ignore_index=True)

Unnamed: 0,Breed,Features
0,brittany_spaniel,[-1.88358617e+00 9.88821387e-01 -2.32127357e+...
1,brittany_spaniel,[-2.58102751e+00 -3.11713427e-01 -2.21587586e+...
2,brittany_spaniel,[-1.73865843e+00 8.22232246e-01 -3.19162250e+...
3,brittany_spaniel,[-5.35894823e+00 -5.29233932e-01 -1.35641754e+...
4,brittany_spaniel,[-1.19886541e+00 1.30478740e+00 -3.39355111e+...


In [13]:
# Convert all the features to numpy arrays
def _parse_feature_vector(raw):
    """Turn one "Features" cell into a 1-D float64 numpy array.

    The CSV stores each vector as text of the form "[v0 v1 v2 ...]": the
    surrounding brackets are stripped and the whitespace-separated tokens are
    converted to doubles. Values that are already numpy arrays are returned
    unchanged so that re-running this cell does not fail.
    """
    if isinstance(raw, np.ndarray):
        return raw
    return np.array(raw[1:-1].split(), dtype=np.double)


df["Features"] = df["Features"].map(_parse_feature_vector)

In [14]:
# Preview the first five rows to check the current contents of the DataFrame
df.head(n=5)

Unnamed: 0,Breed,Features
0,brittany_spaniel,"[-1.88358617, 0.988821387, -2.32127357, -3.849..."
1,brittany_spaniel,"[-2.58102751, -0.311713427, -2.21587586, -2.93..."
2,brittany_spaniel,"[-1.73865843, 0.822232246, -3.1916225, -3.8995..."
3,brittany_spaniel,"[-5.35894823, -0.529233932, -1.35641754, -2.33..."
4,brittany_spaniel,"[-1.19886541, 1.3047874, -3.39355111, -3.00793..."


In [15]:
# Step 1: Standardize the data
# Stack the per-row feature vectors into one 2-D matrix (one row per sample)
X = np.vstack(df["Features"].tolist())

# Fit the scaler on the full matrix, then apply it to that same matrix
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [16]:
# Step 2: Apply PCA
n_components = 100 # Number of principal components to retain
# Pin random_state: with the default svd_solver='auto', scikit-learn may select
# a randomized SVD solver depending on the input size, in which case the
# projection would otherwise differ from run to run.
pca = PCA(n_components=n_components, random_state=0)
X_pca = pca.fit_transform(X_scaled)

In [17]:
# Replace each row's feature vector with its PCA projection (a plain list of floats).
# Column assignment aligns a Series by index label, so reuse df's own index to
# make this a positional assignment regardless of how df is indexed.
df["Features"] = pd.Series(X_pca.tolist(), index=df.index)

In [18]:
# Preview the first five rows of the DataFrame
df.head(n=5)

Unnamed: 0,Breed,Features
0,brittany_spaniel,"[-5.2798704059921375, -2.2084259860538777, 5.2..."
1,brittany_spaniel,"[0.5240202369218108, -4.321692280259293, -5.80..."
2,brittany_spaniel,"[4.484348842924805, -11.409184465121882, -2.06..."
3,brittany_spaniel,"[8.630310757252287, -9.028895885313128, -4.177..."
4,brittany_spaniel,"[-6.459162673194447, -5.1783442764556655, 3.18..."


In [19]:
# Write the DataFrame to disk; index=False leaves the row index out of the file
output_path = "features.csv"
df.to_csv(output_path, index=False)