In [1]:
import pandas as pd
import numpy as np
import os.path
root_path = os.path.dirname(os.getcwd())

# Import video data (thanks Cowen & Keltner!)
video_data = pd.read_csv(os.path.join(root_path, 'DATA/video_data.csv'))

In [2]:
# Separate out 34 emotion category scores, downcase column names
# Fraction of respondents who chose from multiple-select
video_category_data = video_data.iloc[:, 1:35]
video_category_data.index = video_data['Filename']

# Separate out 14 affective dimension scores
# 1-9 ratings
video_dimension_data = video_data.iloc[:, 35:49]
video_dimension_data.index = video_data['Filename']

# Separate 600 free response term scores
# Fraction of respondants who chose from multiple-select dropdown
video_term_data = video_data.iloc[:, 49:]
video_term_data.index = video_data['Filename']

# Select distinct category scores
# Subset of category scores
video_distinct_category_data = video_data[[
    'Admiration',
    'Adoration',
    'Aesthetic Appreciation',
    'Amusement',
    'Anger',
    'Anxiety',
    'Awe',
    'Awkwardness',
    'Boredom',
    'Calmness',
    'Confusion',
    'Craving',
    'Disgust',
    'Empathic Pain',
    'Entrancement',
    'Excitement',
    'Fear',
    'Horror',
    'Interest',
    'Joy',
    'Nostalgia',
    'Relief',
    'Romance',
    'Sadness',
    'Satisfaction',
    'sexual desire',
    'Surprise'
]]
video_distinct_category_data.index = video_data['Filename']

In [3]:
# Data supported matrices (v_to_c.dot(video_vector) = other_vector)
v_to_c = video_category_data.transpose()
v_to_d = video_dimension_data.transpose()
v_to_t = video_term_data.transpose()
v_to_dc = video_distinct_category_data.transpose()

In [4]:
# Inverse matrices
c_to_v = pd.DataFrame(np.linalg.pinv(v_to_c.values), v_to_c.columns, v_to_c.index)
d_to_v = pd.DataFrame(np.linalg.pinv(v_to_d.values), v_to_d.columns, v_to_d.index)
t_to_v = pd.DataFrame(np.linalg.pinv(v_to_t.values), v_to_t.columns, v_to_t.index)
dc_to_v = pd.DataFrame(np.linalg.pinv(v_to_dc.values), v_to_dc.columns, v_to_dc.index)

In [5]:
# Category inference matrices
c_to_d = v_to_d.dot(c_to_v)
c_to_t = v_to_t.dot(c_to_v)
c_to_dc = v_to_dc.dot(c_to_v)

In [6]:
# Dimension inference matrices
d_to_c = v_to_c.dot(d_to_v)
d_to_t = v_to_t.dot(d_to_v)
d_to_dc = v_to_dc.dot(d_to_v)

In [7]:
# Term inference matrices
t_to_c = v_to_c.dot(t_to_v)
t_to_d = v_to_d.dot(t_to_v)
t_to_dc = v_to_dc.dot(t_to_v)

In [8]:
# Distinct category inference matrices
dc_to_c = v_to_c.dot(dc_to_v)
dc_to_d = v_to_d.dot(dc_to_v)
dc_to_t = v_to_t.dot(dc_to_v)

In [9]:
# SLOW IF FIRST TIME
# Install and import Spacy natural language utility
!pip install spacy
import spacy

# Load large english language model
!python -m spacy download en_core_web_md
nlp = spacy.load('en_core_web_lg')

[31mtensorflow 1.11.0 has requirement setuptools<=39.1.0, but you'll have setuptools 40.4.3 which is incompatible.[0m
[33mYou are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Traceback (most recent call last):
  File "/home/russell/anaconda3/lib/python3.6/site-packages/urllib3/connection.py", line 141, in _new_conn
    (self.host, self.port), self.timeout, **extra_kw)
  File "/home/russell/anaconda3/lib/python3.6/site-packages/urllib3/util/connection.py", line 60, in create_connection
    for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
  File "/home/russell/anaconda3/lib/python3.6/socket.py", line 745, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno -2] Name or service not known

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/russe

In [10]:
# Install and import spelling corrector
!pip install autocorrect
from autocorrect import spell

# Define function to normalize phrase
def normalize(text):
    return ' '.join([spell(word) for word in text.split()])

# Define function to lemmatize phrase
def lemmatize(text):
    return ' '.join([token.lemma_ for token in nlp(text)])

[31mtensorflow 1.11.0 has requirement setuptools<=39.1.0, but you'll have setuptools 40.4.3 which is incompatible.[0m
[33mYou are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [11]:
# Create dataframe for terms, lemmas, docs
term_data = pd.DataFrame([])
term_data['term'] = video_term_data.columns
term_data['term_lemma'] = term_data['term'].apply(lemmatize)
term_data['term_doc'] = term_data['term_lemma'].apply(nlp)

In [12]:
# NOT USED
import math

def normalize_vector(vector):
    magnitude = math.sqrt((vector * vector).sum())
    print(magnitude)
    return vector / magnitude

In [13]:
# NOT USED
# Define function to select top n vector components
def skim(vector, n):
    top = vector.sort_values(ascending=False).head(n).index
    vector[~vector.index.isin(top)] = 0
    return vector

In [14]:
# Define function to get term vector from phrase
def get_term_vector(phrase, n_terms):
    phrase_normal = normalize(phrase)
    phrase_lemma = lemmatize(phrase_normal)
    phrase_doc = nlp(phrase_lemma)
    
    term_vector = term_data['term_doc'].apply(phrase_doc.similarity)
    term_vector.index = term_data['term']
    
    if term_vector.max() == 1:
        n_terms = 1
        
    top_terms = term_vector.sort_values(ascending=False).head(n_terms).index
    term_vector[~term_vector.index.isin(top_terms)] = 0
    
    return normalize_vector(term_vector)

In [15]:
# QUALITY CHECK
term_vector = get_term_vector("I'm feeling pissed", 3)
print(term_vector.sort_values(ascending=False).head())
video_vector = t_to_v.dot(term_vector)
print(video_vector.sort_values(ascending=False).head())
print(v_to_t.dot(video_vector).sort_values(ascending=False).head())

1.3282042488044308
term
feeling mad           0.578188
feeling pissed off    0.577968
feeling scared        0.575892
zeal                  0.000000
feeling active        0.000000
Name: term_doc, dtype: float64
Filename
2067.mp4    1.308859
0070.mp4    1.187956
2116.mp4    1.065901
2115.mp4    0.959063
0226.mp4    0.836406
dtype: float64
feeling mad           5.781877e-01
feeling pissed off    5.779679e-01
feeling scared        5.758924e-01
disgust               1.387779e-15
shock                 8.881784e-16
dtype: float64


In [16]:
def interpret_phrase(phrase, n_terms):
    term_vector = get_term_vector(phrase, n_terms)
    print('FREE RESPONSE TERMS:\n', term_vector.sort_values(ascending=False).head(n_terms), '\n')
    
    video_vector = t_to_v.dot(term_vector)
    print('VIDEOS:\n', video_vector.sort_values(ascending=False).head(), '\n')
    
    category_vector = v_to_c.dot(video_vector)
    print('CATEGORIES:\n', category_vector.sort_values(ascending=False), '\n')
    
    dimension_vector = v_to_d.dot(video_vector) 
    print('AFFECTIVE DEMENSIONS:\n', dimension_vector, '\n')
    
    distinct_category_vector = v_to_dc.dot(video_vector)
    print('DISTINCT CATEGORIES:\n', distinct_category_vector.sort_values(ascending=False), '\n')


In [17]:
interpret_phrase("I'm stressed", 2)

0.9769273031838307
FREE RESPONSE TERMS:
 term
stress     0.783201
anxiety    0.621768
Name: term_doc, dtype: float64 

VIDEOS:
 Filename
1430.mp4    0.431187
1838.mp4    0.427992
0249.mp4    0.422800
1720.mp4    0.403262
1079.mp4    0.401742
dtype: float64 

CATEGORIES:
 Fear                      0.729677
Anxiety                   0.482862
Sadness                   0.424617
Relief                    0.312148
Surprise                  0.217009
Excitement                0.180738
Joy                       0.111558
Awe                       0.102870
Triumph                   0.088460
Disgust                   0.075353
Nostalgia                 0.063591
Contempt                  0.054838
Disappointment            0.052450
Interest                  0.024801
Sympathy                  0.007826
Entrancement              0.005038
Calmness                  0.002734
Aesthetic Appreciation   -0.000630
Sexual Desire            -0.011239
Adoration                -0.013040
Satisfaction             -0.

In [18]:
#TAKEWAYS
# Not great
# doesn't reliably find best match
# term dataset covers a very broad range but what I need is more nuance on common feelings
# could try universal sentence encoder, but I think that'll be slow and not much better

# THINGS TO TRY NEXT
# Training a learning model to extract terms effectively
# Assuming this is a+ and moving on to how I'd use it (to give me more info on what I'll need)

# LATER NOTE
# wasn't inverting matricies, still not perfect

# THEORY
# Could be that it's giving the most likely analysis given base rate likelihood of various things?
# How could I account for that - subtract the average vector?

In [19]:
mean_term_vector = v_to_t.mean(axis=1)

In [20]:
# PRETTY SWEET QUALITY CHECK
# The mean term vector is the result of watching each video 1/2000 times, sorta
t_to_v.dot(mean_term_vector).sum()

0.9799621013450808

In [21]:
def interpret_phrases(phrases):
    n_terms = 2
    
    term_vectors = pd.DataFrame([])
    video_vectors = pd.DataFrame([])
    dimension_vectors = pd.DataFrame([])
    distinct_category_vectors = pd.DataFrame([])
    
    n = 1
    for phrase in phrases:
        phrase_n = 'phrase_' + str(n)
        
        term_vectors[phrase_n] = get_term_vector(phrase, n_terms)
        video_vectors[phrase_n] = t_to_v.dot(term_vector)
        dimension_vectors[phrase_n] = v_to_d.dot(video_vector)
        distinct_category_vectors[phrase_n] = v_to_dc.dot(video_vector)
        n += 1
        
    term_vectors['mean'] = term_vectors.mean(axis=1)
    video_vectors['mean'] = video_vectors.mean(axis=1)
    dimension_vectors['mean'] = dimension_vectors.mean(axis=1)
    distinct_category_vectors['mean'] = distinct_category_vectors.mean(axis=1)
    
    term_vectors = term_vectors.round(2)
    video_vectors = video_vectors.round(2)
    dimension_vectors = dimension_vectors.round(2)
    distinct_category_vectors = distinct_category_vectors.round(2)

    print('FREE RESPONSE TERMS:\n', term_vectors.sort_values(by='mean', ascending=False).head(len(phrases) * n_terms), '\n')
    print('VIDEOS:\n', video_vectors.sort_values(by='mean', ascending=False).head(), '\n')
    print('AFFECTIVE DEMENSIONS:\n', dimension_vectors, '\n')
    print('DISTINCT CATEGORIES:\n', distinct_category_vectors.sort_values(by='mean', ascending=False), '\n')


In [22]:
interpret_phrases(["I feel stressed", "I'm feeling better", "I'm feeling hopeful"])

1.1915820976840932
1.0913203285671425
1.0441569909036765
FREE RESPONSE TERMS:
                       phrase_1  phrase_2  phrase_3  mean
term                                                    
feeling scared            0.71      0.00      0.00  0.24
feeling overwhelmed       0.71      0.00      0.00  0.24
feeling overjoyed         0.00      0.00      0.71  0.24
feeling apprehensive      0.00      0.00      0.71  0.24
feeling alone             0.00      0.72      0.00  0.24
feeling important         0.00      0.70      0.00  0.23 

VIDEOS:
           phrase_1  phrase_2  phrase_3  mean
Filename                                    
2067.mp4      1.31      1.31      1.31  1.31
0070.mp4      1.19      1.19      1.19  1.19
2116.mp4      1.07      1.07      1.07  1.07
2115.mp4      0.96      0.96      0.96  0.96
0226.mp4      0.84      0.84      0.84  0.84 

AFFECTIVE DEMENSIONS:
              phrase_1  phrase_2  phrase_3  mean
approach         0.98      0.98      0.98  0.98
arousal          3

Thank you for using SenticNet 5!

Please acknowledge the authors by citing the following publication in any research work or presentation containing results obtained in whole or in part through the use of SenticNet 5:

E Cambria, S Poria, D Hazarika, K Kwok. SenticNet 5: Discovering conceptual primitives for sentiment analysis by means of context embeddings. In: AAAI, pp. 1795-1802 (2018)

In [27]:
#senticnet['concept_name'] = ['pleasantness_value', 'attention_value', 'sensitivity_value', 'aptitude_value', 'primary_mood', 'secondary_mood', 'polarity_label', 'polarity_value', 'semantics1', 'semantics2', 'semantics3', 'semantics4', 'semantics5']
%run '../DATA/senticnet5.py'

# rename loaded dictionary
senticnet_dict = senticnet

In [31]:
# From file
senticnet_columns = ['concept_name', 'pleasantness_value', 'attention_value', 'sensitivity_value', 'aptitude_value', 'primary_mood', 'secondary_mood', 'polarity_label', 'polarity_value', 'semantics1', 'semantics2', 'semantics3', 'semantics4', 'semantics5']

# Dict to 2d array
senticnet_data = [[concept] + senticnet_dict[concept] for concept in senticnet_dict]

# Create senticnet dataframe
senticnet = pd.DataFrame(data=senticnet_data, columns=senticnet_columns)

# Display
print(senticnet.shape)
senticnet.head()

(100000, 14)


Unnamed: 0,concept_name,pleasantness_value,attention_value,sensitivity_value,aptitude_value,primary_mood,secondary_mood,polarity_label,polarity_value,semantics1,semantics2,semantics3,semantics4,semantics5
0,a_little,-0.99,0.0,0.0,-0.7,#sadness,#disgust,negative,-0.84,least,little,small_amount,shortage,scarce
1,a_little_hungry,0.757,0.0,0.0,0.0,#joy,#joy,positive,0.757,get_full,hunger_go_away,feel_full,hunger,full
2,a_little_specific,0.089,0.133,-0.1,0.126,#interest,#admiration,positive,0.06,clan,happy_together,many_people,aunt_uncle,human_group
3,a_lot,0.277,0.161,0.0,0.337,#joy,#admiration,positive,0.258,many,plenty,big_amount,abundant,good_number
4,a_lot_of_books,0.0,0.076,0.0,0.066,#interest,#admiration,positive,0.071,library,old_testament,cook_book,magazine,librarian
