# Feature Reduction

In [1]:
import pandas as pd
import numpy as np
import os
import pickle

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
directory = 'theFeaturesByBreed'

# os.listdir returns entries in arbitrary order, so sort the names: this keeps
# the row order of the combined frame (and every artifact derived from it)
# reproducible across runs and machines.
csv_names = sorted(name for name in os.listdir(directory) if name.endswith(".csv"))
if not csv_names:
    # Fail early with a readable message instead of pd.concat's generic
    # "No objects to concatenate" error.
    raise ValueError(f"No .csv files found in directory {directory!r}")

# Read every CSV file in the directory into its own DataFrame
dfs = [pd.read_csv(os.path.join(directory, name)) for name in csv_names]

# Concatenate all DataFrames into one, renumbering the rows 0..n-1
df = pd.concat(dfs, ignore_index=True)

In [3]:
# Convert all the features to numpy arrays
def _parse_feature_vector(raw):
    """Turn one stored feature value into a 1-D float64 numpy array.

    A string value has its first and last characters dropped (presumably the
    enclosing brackets -- confirm against the CSV files) and the remainder is
    split on whitespace and converted to doubles.  A value that is already a
    numpy array is returned unchanged, so this cell can safely be re-run
    without raising on the already-converted column.
    """
    if isinstance(raw, np.ndarray):
        return raw
    return np.array(raw[1:-1].split(), dtype=np.double)


df["Features"] = df["Features"].map(_parse_feature_vector)

In [4]:
# Preview the first five rows of the combined DataFrame
df.head(n=5)

Unnamed: 0,Breed,Features
0,scottish_deerhound,"[-3.21758747, -2.06748104, -0.873703837, -1.29..."
1,scottish_deerhound,"[-2.13198233, -3.66261029, 0.455072135, -0.441..."
2,scottish_deerhound,"[-5.51163054, -4.45282364, -0.953505039, 0.499..."
3,scottish_deerhound,"[-1.66767263, -4.22921705, -1.13879931, -1.844..."
4,scottish_deerhound,"[-4.4798193, -4.92072344, -1.58877051, 0.64211..."


In [5]:
# Step 1: Standardize the data
# Stack the per-row feature vectors into one 2-D matrix (one row per sample).
X = np.vstack(df["Features"].tolist())

# Fit the scaler on the full matrix, then apply it to that same matrix.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [6]:
# Step 2: Apply PCA
n_components = 100 # Number of principal components to retain

# Depending on the data shape, PCA's default solver selection may pick the
# randomized SVD solver, whose output depends on the random state.  Fixing the
# seed keeps the projected features and the pickled model reproducible
# between runs.
pca = PCA(n_components=n_components, random_state=42)
X_pca = pca.fit_transform(X_scaled)

In [7]:
# Wrap the projected matrix in a DataFrame with one named column per component.
component_names = [f"Feature {idx}" for idx in range(n_components)]
features = pd.DataFrame(data=X_pca, columns=component_names)
features.head(n=5)

Unnamed: 0,Feature 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,...,Feature 90,Feature 91,Feature 92,Feature 93,Feature 94,Feature 95,Feature 96,Feature 97,Feature 98,Feature 99
0,2.687539,3.932317,-3.243325,2.071663,6.351608,-13.571045,3.075167,-2.453783,-4.240095,-4.702732,...,1.208537,0.615094,1.057965,-1.796298,0.20002,-2.279731,-0.633079,-0.158532,0.300838,-1.116885
1,12.965803,-0.565486,0.352435,-4.360752,4.687381,-6.993444,0.196068,-6.633534,-1.871662,-7.428077,...,-0.742694,-0.172587,1.26074,-1.88174,0.182333,-0.560648,0.773332,-1.526032,0.450895,-0.176342
2,22.904961,-2.710478,4.040229,1.884548,11.017559,-7.424705,0.592055,8.473251,4.439982,-2.135496,...,0.643105,-0.818507,-1.288865,0.393766,-0.731501,-0.225677,-1.525502,0.125807,-1.557578,0.312391
3,19.225068,4.136729,1.859559,-2.977302,8.724263,-4.673608,1.022031,-3.917459,-1.901546,-6.503607,...,-1.538805,-1.504374,0.099333,0.969051,-0.389717,-1.16778,0.81062,-0.008199,-0.145861,0.090023
4,33.098792,0.000445,10.866465,3.810472,9.132536,-7.464697,2.522165,5.504714,0.360156,2.719886,...,1.228905,0.348156,-2.519471,0.090175,0.430125,-0.00268,-0.488536,0.481276,-0.982391,1.193886


In [8]:
# Replace the raw "Features" column with the PCA component columns, side by side.
# Note: this rebinds `df`, so re-running the cell raises a KeyError because the
# "Features" column is no longer present.
without_raw_features = df.drop(columns="Features")
df = pd.concat([without_raw_features, features], axis="columns")

In [9]:
# Preview the first five rows of the reduced DataFrame
df.head(n=5)

Unnamed: 0,Breed,Feature 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,...,Feature 90,Feature 91,Feature 92,Feature 93,Feature 94,Feature 95,Feature 96,Feature 97,Feature 98,Feature 99
0,scottish_deerhound,2.687539,3.932317,-3.243325,2.071663,6.351608,-13.571045,3.075167,-2.453783,-4.240095,...,1.208537,0.615094,1.057965,-1.796298,0.20002,-2.279731,-0.633079,-0.158532,0.300838,-1.116885
1,scottish_deerhound,12.965803,-0.565486,0.352435,-4.360752,4.687381,-6.993444,0.196068,-6.633534,-1.871662,...,-0.742694,-0.172587,1.26074,-1.88174,0.182333,-0.560648,0.773332,-1.526032,0.450895,-0.176342
2,scottish_deerhound,22.904961,-2.710478,4.040229,1.884548,11.017559,-7.424705,0.592055,8.473251,4.439982,...,0.643105,-0.818507,-1.288865,0.393766,-0.731501,-0.225677,-1.525502,0.125807,-1.557578,0.312391
3,scottish_deerhound,19.225068,4.136729,1.859559,-2.977302,8.724263,-4.673608,1.022031,-3.917459,-1.901546,...,-1.538805,-1.504374,0.099333,0.969051,-0.389717,-1.16778,0.81062,-0.008199,-0.145861,0.090023
4,scottish_deerhound,33.098792,0.000445,10.866465,3.810472,9.132536,-7.464697,2.522165,5.504714,0.360156,...,1.228905,0.348156,-2.519471,0.090175,0.430125,-0.00268,-0.488536,0.481276,-0.982391,1.193886


In [10]:
# Write the reduced table to disk; index=False leaves the row index out of the file.
df.to_csv(path_or_buf="features.csv", index=False)

In [12]:
# Save the fitted PCA as a pickle file
with open('pca.pkl', 'wb') as f:
    pickle.dump(pca, f)

# The PCA was fitted on data standardized by `scaler`, so new samples must go
# through the same fitted scaler before pca.transform.  Persist it alongside
# the PCA so the full preprocessing can be reproduced later.
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)