# PCA

## Usage

Update the constants below to match your file structure and dataset.

`HYPERSPECRAL_VECTOR_CSV_PATH`: the path to a CSV file containing the hyperspectral data to be analysed with PCA.

`PCA_COMPONENTS`: the number of principal components to compute. This figure can be changed; however, any change must be reflected in `COLUMNS`.

`COLUMNS`: the names of the principal-component columns in the PCA dataframe. The number of names must equal the `PCA_COMPONENTS` value.

`CLASS`: the name of the column holding the class to be analysed, in this case 'PFT'.

`PFTS`: the list of plant functional types (PFTs) included in the dataframe.

`COLORS`: the list of colours used when graphing the PCA results.

In [None]:
# Path to the CSV file holding the hyperspectral data to analyse.
HYPERSPECRAL_VECTOR_CSV_PATH = 'preprocessed_data/site_with_hyperspectral_data.csv'

# Number of principal components to compute.
PCA_COMPONENTS = 10

# Column names for the principal components ('pc_1' ... 'pc_N'). Derived from
# PCA_COMPONENTS so the two settings can never drift out of sync.
COLUMNS = [f'pc_{i}' for i in range(1, PCA_COMPONENTS + 1)]

# Name of the column that holds the class label under analysis.
CLASS = 'PFT'

# Plant functional types expected in the CLASS column.
PFTS = [
    'bare', 'brash', 'water', 'rushes', 'pool_bogbean', 'short_grass',
    'long_grass', 'grass_sphagnum', 'dead_grass_mix', 'shrub_sphagnum',
]

# Matplotlib colours used when plotting, one per plant functional type.
COLORS = [
    'r', 'g', 'b', 'c', 'm', 'y', 'k', 'darkorange', 'lime', 'aqua',
    'fuchsia', 'yellowgreen', 'purple',
]

In [None]:
import pandas as pd
import geopandas

# Load the hyperspectral CSV into a DataFrame for the cells below.
data = pd.read_csv(filepath_or_buffer=HYPERSPECRAL_VECTOR_CSV_PATH)

In [None]:
import numpy as np
from sklearn.decomposition import PCA

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# NOTE: the positional column ranges below are specific to this dataset and
# must be changed to match the number of bands in the hyperspectral CSV.
x = data[data.columns[1:359]]  # feature columns fed to the PCA
y = data[data.columns[362]]    # presumably the class-label column; verify against the CSV

# Standardise the features (StandardScaler defaults: zero mean, unit
# variance) so that no single column dominates the components by scale alone.
x = StandardScaler().fit_transform(x)

# Use the configured component count rather than a hard-coded literal so that
# the number of components always matches the COLUMNS names.
pca = PCA(n_components=PCA_COMPONENTS)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data=principalComponents, columns=COLUMNS)

# Re-attach the class label and geometry columns alongside the components.
pca_df = pd.concat([principalDf, data[[CLASS]], data[['geometry']]], axis=1)

# The geometry column is parsed as WKT text into geometry objects so the
# result can be wrapped in a GeoDataFrame.
pca_df['geometry'] = geopandas.GeoSeries.from_wkt(pca_df['geometry'])
pca_gdf = geopandas.GeoDataFrame(pca_df, geometry='geometry')

# Bare expression: the notebook displays the first two rows.
pca_gdf.head(2)


In [None]:
from matplotlib import pyplot as plt

# Scatter plot of the first two principal components, one colour per class.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('2 component PCA', fontsize=20)

pfts = sorted(PFTS)
colors = COLORS

# zip() stops at the shorter list, so COLORS needs at least as many entries
# as PFTS for every class to be drawn.
for pft, color in zip(pfts, colors):
    # Boolean mask selecting the rows belonging to this class.
    indicesToKeep = pca_df[CLASS] == pft
    ax.scatter(
        pca_df.loc[indicesToKeep, COLUMNS[0]],
        pca_df.loc[indicesToKeep, COLUMNS[1]],
        c=color,
        s=50,
        label=pft,  # label each series directly so the legend cannot mismatch
    )

# The original called ax.legend(PFTs), which raised NameError (the variable is
# `pfts`); the legend is now built from the per-series labels set above.
ax.legend()
ax.grid()

In [None]:
# Read the `explained_variance_ratio_` attribute of the fitted PCA object.
explained_variance = pca.explained_variance_ratio_

# Bare expression so the notebook displays the array.
explained_variance

In [None]:
# Bar chart of the proportion of total variance explained by each component.
labels = COLUMNS

# Use the *ratio* attribute: `explained_variance_` holds absolute variances,
# which does not match the "Proportion of Variance Explained" axis label.
# Slice by the number of labels instead of a hard-coded 10 so the bar heights
# always line up with the column names.
var = pca.explained_variance_ratio_[:len(labels)]

plt.figure(figsize=(15, 7))
plt.bar(labels, var)
plt.xlabel('Principal Component')
plt.ylabel('Proportion of Variance Explained')