In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

References on PCA and on interpreting `pca.components_`:

- https://towardsdatascience.com/pca-clearly-explained-how-when-why-to-use-it-and-feature-importance-a-guide-in-python-7c274582c37e
- https://shankarmsy.github.io/posts/pca-sklearn.html
- https://stats.stackexchange.com/questions/311908/what-is-pca-components-in-sk-learn

In [2]:
# Load the relabelled video table that every culture-specific cell filters from.
VIDEOS_CSV = './videos_relabelled.csv'
X_df = pd.read_csv(VIDEOS_CSV, index_col=None)

## North America

In [3]:
# Keep only the rows labelled 'disgust' for the 'North America' culture group.
df = X_df[(X_df.emotion == 'disgust') & (X_df.culture == 'North America')]
# Drop the metadata/label columns so that only the AU*_r feature columns remain.
au_values_df = df.drop(columns = ['success','confidence', 'face_id','frame', 'culture','filename', 'emotion', 'gender', 'talking'])
# NOTE(review): this encoder is not used by any visible cell; kept in case a later cell relies on `le`.
le = LabelEncoder()
# Min-max scaling: pandas broadcasts the per-column min/max, so each column is rescaled to [0, 1].
# Alternative (z-score standardisation), kept for reference:
# normalized_df=(au_values_df-au_values_df.mean())/au_values_df.std()
au_min = au_values_df.min()
# A column that is constant within this subset has max == min. Dividing by that zero range
# would yield NaN, which PCA rejects, so use a divisor of 1 there (the column scales to all zeros).
au_range = (au_values_df.max() - au_min).replace(0, 1)
normalized_df = (au_values_df - au_min) / au_range
pca = PCA(n_components=3)
# NOTE(review): the name says "anger" but the rows selected above are 'disgust';
# the name is kept so any existing reference to `pca_anger` keeps working.
pca_anger = pca.fit_transform(normalized_df.values)
# The component loadings live on the fitted estimator (pca.components_), not on the projected array.
feature_names = au_values_df.columns
print(feature_names)

Index(['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r',
       'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r', 'AU23_r', 'AU25_r',
       'AU26_r', 'AU45_r'],
      dtype='object')


## Principal components for North America

In [4]:
# Show the loadings and explained-variance ratios of the most recently fitted `pca`.
np.set_printoptions(precision=3, suppress=True)
print(pca.components_)
print("Explained Variance:")
print(pca.explained_variance_ratio_)

# For every principal component (one row of components_), pick the feature whose
# loading has the largest absolute value.
n_pcs = len(pca.components_)
most_important = list(np.abs(pca.components_).argmax(axis=1))
most_important_names = [feature_names[idx] for idx in most_important]

# Map 'PC<i>' labels (zero-based) to the chosen feature name and tabulate the pairs.
dic = {'PC{}'.format(i): name for i, name in enumerate(most_important_names)}
feat_df = pd.DataFrame(list(dic.items()))


[[-0.148 -0.056 -0.013 -0.141  0.526  0.427  0.277  0.384  0.375  0.204
  -0.057 -0.012  0.015  0.273  0.093 -0.054]
 [ 0.25  -0.093  0.834 -0.029 -0.043 -0.002 -0.101  0.336 -0.067 -0.066
  -0.064  0.134 -0.006 -0.022 -0.111 -0.253]
 [ 0.066  0.007  0.018 -0.203  0.168 -0.216 -0.095  0.081  0.339  0.26
  -0.124  0.21   0.045 -0.541 -0.353  0.45 ]]
Explained Variance:
[0.249 0.145 0.118]


In [5]:
# Display the table pairing each 'PC<i>' label with the feature that has the largest absolute loading.
feat_df

Unnamed: 0,0,1
0,PC0,AU06_r
1,PC1,AU04_r
2,PC2,AU25_r


## Persian

In [6]:
# Keep only the rows labelled 'disgust' for the 'Persian' culture group.
df = X_df[(X_df.emotion == 'disgust') & (X_df.culture == 'Persian')]
# Drop the metadata/label columns so that only the AU*_r feature columns remain.
au_values_df = df.drop(columns = ['success','confidence', 'face_id','frame', 'culture','filename', 'emotion', 'gender', 'talking'])
# NOTE(review): this encoder is not used by any visible cell; kept in case a later cell relies on `le`.
le = LabelEncoder()
# Min-max scaling: pandas broadcasts the per-column min/max, so each column is rescaled to [0, 1].
# Alternative (z-score standardisation), kept for reference:
# normalized_df=(au_values_df-au_values_df.mean())/au_values_df.std()
au_min = au_values_df.min()
# A column that is constant within this subset has max == min. Dividing by that zero range
# would yield NaN, which PCA rejects, so use a divisor of 1 there (the column scales to all zeros).
au_range = (au_values_df.max() - au_min).replace(0, 1)
normalized_df = (au_values_df - au_min) / au_range
pca = PCA(n_components=3)
# NOTE(review): the name says "anger" but the rows selected above are 'disgust';
# the name is kept so any existing reference to `pca_anger` keeps working.
pca_anger = pca.fit_transform(normalized_df.values)
# The component loadings live on the fitted estimator (pca.components_), not on the projected array.
feature_names = au_values_df.columns
print(feature_names)

Index(['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r',
       'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r', 'AU23_r', 'AU25_r',
       'AU26_r', 'AU45_r'],
      dtype='object')


In [7]:
# Show the loadings and explained-variance ratios of the most recently fitted `pca`.
np.set_printoptions(precision=3, suppress=True)
print(pca.components_)
print("Explained Variance:")
print(pca.explained_variance_ratio_)

# For every principal component (one row of components_), pick the feature whose
# loading has the largest absolute value.
n_pcs = len(pca.components_)
most_important = list(np.abs(pca.components_).argmax(axis=1))
most_important_names = [feature_names[idx] for idx in most_important]

# Map 'PC<i>' labels (zero-based) to the chosen feature name and tabulate the pairs.
dic = {'PC{}'.format(i): name for i, name in enumerate(most_important_names)}
feat_df = pd.DataFrame(list(dic.items()))

[[ 0.084  0.103  0.395  0.272  0.389 -0.086  0.105  0.414  0.44   0.176
   0.216  0.341 -0.008  0.066 -0.132  0.043]
 [-0.162 -0.18  -0.155 -0.432  0.106  0.366  0.356  0.154 -0.236  0.064
   0.416  0.26   0.041 -0.214 -0.124  0.268]
 [-0.249 -0.181 -0.076 -0.075  0.346  0.341  0.229  0.14   0.154  0.203
  -0.237 -0.453 -0.065  0.468  0.177 -0.063]]
Explained Variance:
[0.302 0.197 0.128]


In [8]:
# Display the table pairing each 'PC<i>' label with the feature that has the largest absolute loading.
feat_df

Unnamed: 0,0,1
0,PC0,AU12_r
1,PC1,AU05_r
2,PC2,AU25_r


## Philippines

In [9]:
# Keep only the rows labelled 'disgust' for the 'Philippines' culture group.
df = X_df[(X_df.emotion == 'disgust') & (X_df.culture == 'Philippines')]
# Drop the metadata/label columns so that only the AU*_r feature columns remain.
au_values_df = df.drop(columns = ['success','confidence', 'face_id','frame', 'culture','filename', 'emotion', 'gender', 'talking'])
# NOTE(review): this encoder is not used by any visible cell; kept in case a later cell relies on `le`.
le = LabelEncoder()
# Min-max scaling: pandas broadcasts the per-column min/max, so each column is rescaled to [0, 1].
# Alternative (z-score standardisation), kept for reference:
# normalized_df=(au_values_df-au_values_df.mean())/au_values_df.std()
au_min = au_values_df.min()
# A column that is constant within this subset has max == min. Dividing by that zero range
# would yield NaN, which PCA rejects, so use a divisor of 1 there (the column scales to all zeros).
au_range = (au_values_df.max() - au_min).replace(0, 1)
normalized_df = (au_values_df - au_min) / au_range
pca = PCA(n_components=3)
# NOTE(review): the name says "anger" but the rows selected above are 'disgust';
# the name is kept so any existing reference to `pca_anger` keeps working.
pca_anger = pca.fit_transform(normalized_df.values)
# The component loadings live on the fitted estimator (pca.components_), not on the projected array.
feature_names = au_values_df.columns
print(feature_names)


Index(['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r',
       'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r', 'AU23_r', 'AU25_r',
       'AU26_r', 'AU45_r'],
      dtype='object')


In [10]:
# Show the loadings and explained-variance ratios of the most recently fitted `pca`.
np.set_printoptions(precision=3, suppress=True)
print(pca.components_)
print("Explained Variance:")
print(pca.explained_variance_ratio_)

# For every principal component (one row of components_), pick the feature whose
# loading has the largest absolute value.
n_pcs = len(pca.components_)
most_important = list(np.abs(pca.components_).argmax(axis=1))
most_important_names = [feature_names[idx] for idx in most_important]

# Map 'PC<i>' labels (zero-based) to the chosen feature name and tabulate the pairs.
dic = {'PC{}'.format(i): name for i, name in enumerate(most_important_names)}
feat_df = pd.DataFrame(list(dic.items()))

[[ 0.16  -0.201  0.491 -0.146  0.418  0.309  0.234  0.441  0.145  0.17
   0.073  0.009  0.001  0.264  0.162  0.033]
 [-0.168 -0.411  0.176 -0.036 -0.094 -0.043 -0.081 -0.02  -0.098  0.163
   0.534  0.485  0.141 -0.314 -0.268  0.08 ]
 [ 0.76   0.166  0.311  0.171 -0.187 -0.116 -0.428  0.044  0.013  0.084
  -0.032  0.072  0.022 -0.091 -0.087  0.074]]
Explained Variance:
[0.254 0.143 0.116]


In [11]:
# Display the table pairing each 'PC<i>' label with the feature that has the largest absolute loading.
feat_df

Unnamed: 0,0,1
0,PC0,AU04_r
1,PC1,AU15_r
2,PC2,AU01_r
