In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

References:

- https://towardsdatascience.com/pca-clearly-explained-how-when-why-to-use-it-and-feature-importance-a-guide-in-python-7c274582c37e
- https://shankarmsy.github.io/posts/pca-sklearn.html
- https://stats.stackexchange.com/questions/311908/what-is-pca-components-in-sk-learn

In [3]:
# Load the relabelled video data from CSV (no column is used as the index).
X_df = pd.read_csv(
    './videos_relabelled.csv',
    index_col=None,
)

## North America

In [4]:
# Select the rows labelled 'anger' for the 'North America' culture group.
df = X_df[(X_df.emotion == 'anger') & (X_df.culture == 'North America')]
if df.empty:
    # Fail with a clear message instead of an opaque error from inside PCA.
    raise ValueError("No rows with emotion == 'anger' and culture == 'North America'")
# Drop the metadata/label columns so that only the AU intensity columns remain.
au_values_df = df.drop(columns = ['success','confidence', 'face_id','frame', 'culture','filename', 'emotion', 'gender', 'talking'])
# NOTE(review): `le` is not used in the cells visible here; kept so any later
# cell that relies on it still runs.
le = LabelEncoder()
# Min-max normalization to [0, 1]; pandas applies the arithmetic column-wise.
# (z-score alternative: (au_values_df - au_values_df.mean()) / au_values_df.std())
col_min = au_values_df.min()
col_range = au_values_df.max() - col_min
# A constant column has a zero range; dividing by it would produce NaN and make
# PCA raise. Dividing by 1 instead maps such a column to all zeros.
col_range = col_range.replace(0, 1)
normalized_df = (au_values_df - col_min) / col_range
# Fit a 3-component PCA; `pca_anger` holds the projected samples, while the
# loadings are available afterwards as `pca.components_`.
pca = PCA(n_components=3)
pca_anger = pca.fit_transform(normalized_df.values)
feature_names = au_values_df.columns
print(feature_names)

Index(['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r',
       'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r', 'AU23_r', 'AU25_r',
       'AU26_r', 'AU45_r'],
      dtype='object')


## Print principal components (PCs) for North America

In [5]:
# Print arrays with three decimals and without scientific notation.
np.set_printoptions(suppress=True, precision=3)
loadings = pca.components_
print(loadings)
print("Explained Variance:")
print(pca.explained_variance_ratio_)
# For each principal component, pick the feature with the largest absolute loading.
n_pcs = loadings.shape[0]
most_important = list(np.abs(loadings).argmax(axis=1))
most_important_names = [feature_names[idx] for idx in most_important]
# Map 'PC<k>' (zero-based) to that feature name and tabulate the pairs.
dic = {'PC{}'.format(pc): name for pc, name in enumerate(most_important_names)}
feat_df = pd.DataFrame(dic.items())


[[-0.341 -0.232  0.03  -0.28   0.454  0.409  0.424  0.322  0.128  0.108
   0.085  0.027 -0.011  0.239 -0.02  -0.016]
 [ 0.39   0.462  0.014  0.083  0.116  0.028 -0.123  0.27   0.196  0.083
   0.089 -0.005 -0.009  0.552  0.374  0.164]
 [ 0.171  0.017 -0.099  0.764  0.278  0.221  0.036  0.108  0.153  0.04
  -0.034  0.035  0.026 -0.084 -0.407 -0.192]]
Explained Variance:
[0.203 0.149 0.116]


In [5]:
# Display the table pairing each PC label with its highest-|loading| feature.
feat_df

Unnamed: 0,0,1
0,PC0,AU06_r
1,PC1,AU25_r
2,PC2,AU05_r


## Persian

In [6]:
# Select the rows labelled 'anger' for the 'Persian' culture group.
df = X_df[(X_df.emotion == 'anger') & (X_df.culture == 'Persian')]
if df.empty:
    # Fail with a clear message instead of an opaque error from inside PCA.
    raise ValueError("No rows with emotion == 'anger' and culture == 'Persian'")
# Drop the metadata/label columns so that only the AU intensity columns remain.
au_values_df = df.drop(columns = ['success','confidence', 'face_id','frame', 'culture','filename', 'emotion', 'gender', 'talking'])
# NOTE(review): `le` is not used in the cells visible here; kept so any later
# cell that relies on it still runs.
le = LabelEncoder()
# Min-max normalization to [0, 1]; pandas applies the arithmetic column-wise.
# (z-score alternative: (au_values_df - au_values_df.mean()) / au_values_df.std())
col_min = au_values_df.min()
col_range = au_values_df.max() - col_min
# A constant column has a zero range; dividing by it would produce NaN and make
# PCA raise. Dividing by 1 instead maps such a column to all zeros.
col_range = col_range.replace(0, 1)
normalized_df = (au_values_df - col_min) / col_range
# Fit a 3-component PCA; `pca_anger` holds the projected samples, while the
# loadings are available afterwards as `pca.components_`.
pca = PCA(n_components=3)
pca_anger = pca.fit_transform(normalized_df.values)
feature_names = au_values_df.columns
print(feature_names)

Index(['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r',
       'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r', 'AU23_r', 'AU25_r',
       'AU26_r', 'AU45_r'],
      dtype='object')


In [7]:
# Print arrays with three decimals and without scientific notation.
np.set_printoptions(suppress=True, precision=3)
loadings = pca.components_
print(loadings)
print("Explained Variance:")
print(pca.explained_variance_ratio_)
# For each principal component, pick the feature with the largest absolute loading.
n_pcs = loadings.shape[0]
most_important = list(np.abs(loadings).argmax(axis=1))
most_important_names = [feature_names[idx] for idx in most_important]
# Map 'PC<k>' (zero-based) to that feature name and tabulate the pairs.
dic = {'PC{}'.format(pc): name for pc, name in enumerate(most_important_names)}
feat_df = pd.DataFrame(dic.items())

[[-0.147 -0.097  0.432 -0.301  0.304  0.519  0.345  0.268 -0.004  0.122
   0.158  0.067  0.013  0.293  0.054  0.07 ]
 [ 0.495  0.173 -0.002  0.224  0.094 -0.321 -0.016  0.571  0.198  0.265
   0.139  0.067  0.008  0.303 -0.068  0.081]
 [-0.248  0.181 -0.669  0.256  0.253  0.329 -0.029 -0.009  0.127 -0.099
  -0.055  0.037 -0.013  0.428  0.075  0.021]]
Explained Variance:
[0.282 0.128 0.099]


In [8]:
# Display the table pairing each PC label with its highest-|loading| feature.
feat_df

Unnamed: 0,0,1
0,PC0,AU07_r
1,PC1,AU10_r
2,PC2,AU04_r


## Philippines

In [9]:
# Select the rows labelled 'anger' for the 'Philippines' culture group.
df = X_df[(X_df.emotion == 'anger') & (X_df.culture == 'Philippines')]
if df.empty:
    # Fail with a clear message instead of an opaque error from inside PCA.
    raise ValueError("No rows with emotion == 'anger' and culture == 'Philippines'")
# Drop the metadata/label columns so that only the AU intensity columns remain.
au_values_df = df.drop(columns = ['success','confidence', 'face_id','frame', 'culture','filename', 'emotion', 'gender', 'talking'])
# NOTE(review): `le` is not used in the cells visible here; kept so any later
# cell that relies on it still runs.
le = LabelEncoder()
# Min-max normalization to [0, 1]; pandas applies the arithmetic column-wise.
# (z-score alternative: (au_values_df - au_values_df.mean()) / au_values_df.std())
col_min = au_values_df.min()
col_range = au_values_df.max() - col_min
# A constant column has a zero range; dividing by it would produce NaN and make
# PCA raise. Dividing by 1 instead maps such a column to all zeros.
col_range = col_range.replace(0, 1)
normalized_df = (au_values_df - col_min) / col_range
# Fit a 3-component PCA; `pca_anger` holds the projected samples, while the
# loadings are available afterwards as `pca.components_`.
pca = PCA(n_components=3)
pca_anger = pca.fit_transform(normalized_df.values)
feature_names = au_values_df.columns
print(feature_names)


Index(['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r',
       'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r', 'AU23_r', 'AU25_r',
       'AU26_r', 'AU45_r'],
      dtype='object')


In [10]:
# Print arrays with three decimals and without scientific notation.
np.set_printoptions(suppress=True, precision=3)
loadings = pca.components_
print(loadings)
print("Explained Variance:")
print(pca.explained_variance_ratio_)
# For each principal component, pick the feature with the largest absolute loading.
n_pcs = loadings.shape[0]
most_important = list(np.abs(loadings).argmax(axis=1))
most_important_names = [feature_names[idx] for idx in most_important]
# Map 'PC<k>' (zero-based) to that feature name and tabulate the pairs.
dic = {'PC{}'.format(pc): name for pc, name in enumerate(most_important_names)}
feat_df = pd.DataFrame(dic.items())

[[ 0.429  0.503 -0.007  0.401  0.122  0.096 -0.     0.282  0.374  0.204
  -0.049  0.097  0.013  0.296  0.074 -0.11 ]
 [-0.271 -0.319  0.032 -0.247  0.396  0.47   0.284  0.362  0.274  0.214
   0.07  -0.059  0.001  0.19  -0.064  0.073]
 [-0.077  0.148  0.124  0.034 -0.049  0.506  0.105 -0.183 -0.301 -0.452
  -0.031 -0.07  -0.004  0.367  0.388 -0.257]]
Explained Variance:
[0.207 0.172 0.115]


In [11]:
# Display the table pairing each PC label with its highest-|loading| feature.
feat_df

Unnamed: 0,0,1
0,PC0,AU02_r
1,PC1,AU07_r
2,PC2,AU07_r
