# Feature Reduction

In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [41]:
directory = 'theFeaturesByBreed'

# Collect the CSV files in sorted order: os.listdir() returns entries in
# arbitrary order, so sorting keeps the row order of the combined DataFrame
# (and of anything written out from it) reproducible between runs.
csv_paths = sorted(
    os.path.join(directory, filename)
    for filename in os.listdir(directory)
    if filename.endswith(".csv")
)

# Fail early with a clear message instead of pd.concat's generic
# "No objects to concatenate" error when the directory holds no CSV files.
if not csv_paths:
    raise ValueError(f"No .csv files found in directory {directory!r}")

# Read every CSV file into its own DataFrame
dfs = [pd.read_csv(filepath) for filepath in csv_paths]

# Concatenate all DataFrames into one, renumbering the rows from 0
df = pd.concat(dfs, ignore_index=True)

In [42]:
# Convert all the features to numpy array
def _parse_feature_vector(raw):
    """Turn one stored feature value into a 1-D float64 numpy array.

    `raw` is expected to be a bracketed, whitespace-separated string of
    numbers, e.g. "[-1.88 0.98 -2.32]". The first and last characters
    (the brackets) are dropped and the remaining tokens are converted to
    doubles.

    Values that are already numpy arrays are returned unchanged, so this
    cell can be re-run safely even though it overwrites the column in place.
    """
    if isinstance(raw, np.ndarray):
        return raw
    # strip() first so stray surrounding whitespace does not defeat the
    # [1:-1] bracket removal.
    return np.array(raw.strip()[1:-1].split(), dtype=np.double)


df["Features"] = df["Features"].map(_parse_feature_vector)

In [43]:
# Preview the first five rows of the combined DataFrame
# (the "Features" column now holds numpy arrays)
df.head(n=5)

Unnamed: 0,Breed,Features
0,brittany_spaniel,"[-1.88358617, 0.988821387, -2.32127357, -3.849..."
1,brittany_spaniel,"[-2.58102751, -0.311713427, -2.21587586, -2.93..."
2,brittany_spaniel,"[-1.73865843, 0.822232246, -3.1916225, -3.8995..."
3,brittany_spaniel,"[-5.35894823, -0.529233932, -1.35641754, -2.33..."
4,brittany_spaniel,"[-1.19886541, 1.3047874, -3.39355111, -3.00793..."


In [44]:
# Step 1: Standardize the data
# Stack the per-row feature vectors into a single 2-D matrix (one row per sample)
X = np.vstack(df["Features"].tolist())

# Fit the scaler on the matrix, then apply it to the same matrix
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [45]:
# Step 2: Apply PCA
n_components = 100 # Number of principal components to retain

# Pin random_state: with svd_solver left at its "auto" default, scikit-learn
# may choose the randomized SVD solver for large inputs, in which case the
# projected values would otherwise vary from run to run.
pca = PCA(n_components=n_components, random_state=42)

# Fit the projection on the standardized matrix and project it in one step
X_pca = pca.fit_transform(X_scaled)

In [46]:
# Wrap the PCA output in a DataFrame with one named column per component
pca_column_names = ["Feature " + str(idx) for idx in range(n_components)]
features = pd.DataFrame(data=X_pca, columns=pca_column_names)
features.head(n=5)

Unnamed: 0,Feature 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,...,Feature 90,Feature 91,Feature 92,Feature 93,Feature 94,Feature 95,Feature 96,Feature 97,Feature 98,Feature 99
0,-5.27987,-2.208426,5.211814,-8.099501,-13.18522,-0.753768,-3.015471,0.82537,-4.269372,10.004107,...,-0.465695,-0.674558,0.123558,-1.240681,-1.069499,-0.714136,1.011624,-0.225715,-0.22033,1.140998
1,0.52402,-4.321692,-5.808849,-0.097443,-13.157566,0.439865,-4.521323,-1.382092,-1.304059,-3.890085,...,-0.410429,0.396908,0.344853,0.243527,-0.416206,0.310021,0.025371,-0.221647,-0.306602,-0.855617
2,4.484349,-11.409184,-2.061785,-8.795961,-10.736951,-0.650287,0.697348,8.464226,-2.961214,6.29046,...,-1.486026,0.636264,-0.315544,1.400241,-0.116869,-0.253275,0.918089,-0.645903,0.545067,-1.27421
3,8.630311,-9.028896,-4.177602,-3.575223,-7.698362,-5.857273,-3.473359,6.915802,1.972978,6.266619,...,-1.718268,0.056995,0.152345,0.562226,0.279578,-0.277104,-0.956852,-0.531012,0.249064,-1.877944
4,-6.459163,-5.178344,3.182314,-6.884826,-2.66327,-0.779802,-0.788943,4.521932,-3.636744,-0.785807,...,-0.964177,0.368517,-0.274313,-0.534803,0.797063,-0.156896,1.027882,1.130965,0.696718,0.098893


In [49]:
# Swap the raw "Features" column for the reduced PCA columns, side by side
without_raw_features = df.drop(columns=["Features"])
df = pd.concat([without_raw_features, features], axis="columns")

In [50]:
# Preview the first five rows of the DataFrame with the PCA columns attached
df.head(n=5)

Unnamed: 0,Breed,Feature 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,...,Feature 90,Feature 91,Feature 92,Feature 93,Feature 94,Feature 95,Feature 96,Feature 97,Feature 98,Feature 99
0,brittany_spaniel,-5.27987,-2.208426,5.211814,-8.099501,-13.18522,-0.753768,-3.015471,0.82537,-4.269372,...,-0.465695,-0.674558,0.123558,-1.240681,-1.069499,-0.714136,1.011624,-0.225715,-0.22033,1.140998
1,brittany_spaniel,0.52402,-4.321692,-5.808849,-0.097443,-13.157566,0.439865,-4.521323,-1.382092,-1.304059,...,-0.410429,0.396908,0.344853,0.243527,-0.416206,0.310021,0.025371,-0.221647,-0.306602,-0.855617
2,brittany_spaniel,4.484349,-11.409184,-2.061785,-8.795961,-10.736951,-0.650287,0.697348,8.464226,-2.961214,...,-1.486026,0.636264,-0.315544,1.400241,-0.116869,-0.253275,0.918089,-0.645903,0.545067,-1.27421
3,brittany_spaniel,8.630311,-9.028896,-4.177602,-3.575223,-7.698362,-5.857273,-3.473359,6.915802,1.972978,...,-1.718268,0.056995,0.152345,0.562226,0.279578,-0.277104,-0.956852,-0.531012,0.249064,-1.877944
4,brittany_spaniel,-6.459163,-5.178344,3.182314,-6.884826,-2.66327,-0.779802,-0.788943,4.521932,-3.636744,...,-0.964177,0.368517,-0.274313,-0.534803,0.797063,-0.156896,1.027882,1.130965,0.696718,0.098893


In [51]:
# Write the reduced feature table to disk, leaving out the row index
output_csv = "features.csv"
df.to_csv(output_csv, index=False)