In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

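References:

- PCA clearly explained (how, when, and why to use it; feature importance): https://towardsdatascience.com/pca-clearly-explained-how-when-why-to-use-it-and-feature-importance-a-guide-in-python-7c274582c37e
- PCA with scikit-learn: https://shankarmsy.github.io/posts/pca-sklearn.html
- What `pca.components_` means in scikit-learn: https://stats.stackexchange.com/questions/311908/what-is-pca-components-in-sk-learn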

In [2]:
# Load the relabelled videos table from the working directory.
# index_col=None keeps a plain positional index instead of using a CSV column.
X_df = pd.read_csv(
    './videos_relabelled.csv',
    index_col=None,
)

## North America

In [3]:
# Restrict the table to 'contempt' rows from the 'North America' culture.
df = X_df[(X_df.emotion == 'contempt') & (X_df.culture == 'North America')]

# Drop the metadata columns; what remains are the AU*_r feature columns
# (their names are printed at the end of this cell).
au_values_df = df.drop(columns=['success', 'confidence', 'face_id', 'frame', 'culture',
                                'filename', 'emotion', 'gender', 'talking'])

# Min-max scaling to [0, 1]; pandas applies min()/max() column-wise.
# (Alternative considered: z-score standardisation, (x - mean) / std.)
col_min = au_values_df.min()
col_range = au_values_df.max() - col_min
# A column that is constant within this subset has zero range. Dividing by it
# would yield NaN (0/0), and PCA.fit_transform rejects NaN input. Using 1 as the
# divisor maps such a column to all zeros and leaves other columns unchanged.
col_range = col_range.replace(0, 1)
normalized_df = (au_values_df - col_min) / col_range

# Fit a 3-component PCA on the scaled features.
pca = PCA(n_components=3)
# NOTE: the name is historical -- these are the projected scores of the
# 'contempt' rows selected above, not of an 'anger' subset.
pca_anger = pca.fit_transform(normalized_df.values)

# Column labels, in the same order as the columns of pca.components_.
feature_names = au_values_df.columns
print(feature_names)

Index(['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r',
       'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r', 'AU23_r', 'AU25_r',
       'AU26_r', 'AU45_r'],
      dtype='object')


## Print PCs for North America

In [4]:
# Print arrays with three decimals and no scientific notation.
np.set_printoptions(suppress=True, precision=3)
print(pca.components_)
print("Explained Variance:")
print(pca.explained_variance_ratio_)

# components_ holds one row per fitted principal component.
n_pcs = len(pca.components_)
# For every component, the column position of its largest absolute loading.
most_important = list(np.abs(pca.components_).argmax(axis=1))
most_important_names = [feature_names[col] for col in most_important]
# Map 'PC0', 'PC1', ... to the dominant feature, then tabulate as (label, feature) rows.
dic = dict(zip(('PC{}'.format(i) for i in range(n_pcs)), most_important_names))
feat_df = pd.DataFrame(dic.items())


[[-0.223 -0.502  0.249 -0.169  0.298  0.376  0.344  0.217  0.032  0.379
   0.05  -0.044  0.005 -0.073 -0.172 -0.168]
 [ 0.035  0.224 -0.038 -0.321  0.199  0.036  0.074  0.314  0.391  0.084
  -0.141 -0.19  -0.058  0.365  0.063  0.585]
 [ 0.271  0.229 -0.211  0.221  0.125 -0.062 -0.2    0.368  0.384  0.323
   0.04   0.322  0.081 -0.08  -0.355 -0.3  ]]
Explained Variance:
[0.292 0.155 0.126]


In [5]:
# Display the table pairing each principal-component label with the feature
# that has the largest absolute loading on it.
feat_df

Unnamed: 0,0,1
0,PC0,AU02_r
1,PC1,AU45_r
2,PC2,AU12_r


## Persian

In [6]:
# Restrict the table to 'contempt' rows from the 'Persian' culture.
df = X_df[(X_df.emotion == 'contempt') & (X_df.culture == 'Persian')]

# Drop the metadata columns; what remains are the AU*_r feature columns
# (their names are printed at the end of this cell).
au_values_df = df.drop(columns=['success', 'confidence', 'face_id', 'frame', 'culture',
                                'filename', 'emotion', 'gender', 'talking'])

# Min-max scaling to [0, 1]; pandas applies min()/max() column-wise.
# (Alternative considered: z-score standardisation, (x - mean) / std.)
col_min = au_values_df.min()
col_range = au_values_df.max() - col_min
# A column that is constant within this subset has zero range. Dividing by it
# would yield NaN (0/0), and PCA.fit_transform rejects NaN input. Using 1 as the
# divisor maps such a column to all zeros and leaves other columns unchanged.
col_range = col_range.replace(0, 1)
normalized_df = (au_values_df - col_min) / col_range

# Fit a 3-component PCA on the scaled features.
pca = PCA(n_components=3)
# NOTE: the name is historical -- these are the projected scores of the
# 'contempt' rows selected above, not of an 'anger' subset.
pca_anger = pca.fit_transform(normalized_df.values)

# Column labels, in the same order as the columns of pca.components_.
feature_names = au_values_df.columns
print(feature_names)

Index(['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r',
       'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r', 'AU23_r', 'AU25_r',
       'AU26_r', 'AU45_r'],
      dtype='object')


In [7]:
# Print arrays with three decimals and no scientific notation.
np.set_printoptions(suppress=True, precision=3)
print(pca.components_)
print("Explained Variance:")
print(pca.explained_variance_ratio_)

# components_ holds one row per fitted principal component.
n_pcs = len(pca.components_)
# For every component, the column position of its largest absolute loading.
most_important = list(np.abs(pca.components_).argmax(axis=1))
most_important_names = [feature_names[col] for col in most_important]
# Map 'PC0', 'PC1', ... to the dominant feature, then tabulate as (label, feature) rows.
dic = dict(zip(('PC{}'.format(i) for i in range(n_pcs)), most_important_names))
feat_df = pd.DataFrame(dic.items())

[[ 0.031 -0.039 -0.007 -0.493  0.289  0.16   0.155  0.338  0.078  0.275
   0.415  0.386  0.095 -0.022  0.06   0.301]
 [ 0.205 -0.168 -0.401 -0.277 -0.028  0.486  0.305 -0.134  0.022  0.172
  -0.216 -0.42  -0.019  0.235  0.171 -0.08 ]
 [-0.317 -0.007  0.417  0.348  0.296  0.309  0.488 -0.058  0.008  0.276
  -0.089  0.052  0.007  0.056 -0.247 -0.17 ]]
Explained Variance:
[0.236 0.135 0.115]


In [8]:
# Display the table pairing each principal-component label with the feature
# that has the largest absolute loading on it.
feat_df

Unnamed: 0,0,1
0,PC0,AU05_r
1,PC1,AU07_r
2,PC2,AU09_r


## Philippines

In [9]:
# Restrict the table to 'contempt' rows from the 'Philippines' culture.
df = X_df[(X_df.emotion == 'contempt') & (X_df.culture == 'Philippines')]

# Drop the metadata columns; what remains are the AU*_r feature columns
# (their names are printed at the end of this cell).
au_values_df = df.drop(columns=['success', 'confidence', 'face_id', 'frame', 'culture',
                                'filename', 'emotion', 'gender', 'talking'])

# Min-max scaling to [0, 1]; pandas applies min()/max() column-wise.
# (Alternative considered: z-score standardisation, (x - mean) / std.)
col_min = au_values_df.min()
col_range = au_values_df.max() - col_min
# A column that is constant within this subset has zero range. Dividing by it
# would yield NaN (0/0), and PCA.fit_transform rejects NaN input. Using 1 as the
# divisor maps such a column to all zeros and leaves other columns unchanged.
col_range = col_range.replace(0, 1)
normalized_df = (au_values_df - col_min) / col_range

# Fit a 3-component PCA on the scaled features.
pca = PCA(n_components=3)
# NOTE: the name is historical -- these are the projected scores of the
# 'contempt' rows selected above, not of an 'anger' subset.
pca_anger = pca.fit_transform(normalized_df.values)

# Column labels, in the same order as the columns of pca.components_.
feature_names = au_values_df.columns
print(feature_names)


Index(['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r',
       'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r', 'AU23_r', 'AU25_r',
       'AU26_r', 'AU45_r'],
      dtype='object')


In [10]:
# Print arrays with three decimals and no scientific notation.
np.set_printoptions(suppress=True, precision=3)
print(pca.components_)
print("Explained Variance:")
print(pca.explained_variance_ratio_)

# components_ holds one row per fitted principal component.
n_pcs = len(pca.components_)
# For every component, the column position of its largest absolute loading.
most_important = list(np.abs(pca.components_).argmax(axis=1))
most_important_names = [feature_names[col] for col in most_important]
# Map 'PC0', 'PC1', ... to the dominant feature, then tabulate as (label, feature) rows.
dic = dict(zip(('PC{}'.format(i) for i in range(n_pcs)), most_important_names))
feat_df = pd.DataFrame(dic.items())

[[ 0.477  0.359  0.043  0.247  0.271 -0.195 -0.061  0.406  0.353  0.133
   0.012  0.153  0.005  0.336  0.117 -0.09 ]
 [-0.29  -0.5   -0.069 -0.189  0.386  0.299  0.307  0.406  0.226  0.099
   0.07   0.001  0.016  0.247  0.009  0.038]
 [-0.138  0.235 -0.242 -0.252 -0.185 -0.038  0.11  -0.068 -0.08  -0.062
  -0.05  -0.138 -0.019  0.404  0.686  0.293]]
Explained Variance:
[0.229 0.18  0.127]


In [11]:
# Display the table pairing each principal-component label with the feature
# that has the largest absolute loading on it.
feat_df

Unnamed: 0,0,1
0,PC0,AU01_r
1,PC1,AU02_r
2,PC2,AU26_r
