In [52]:
import pandas as pd
import numpy as np
import os.path
# The project root is the parent of the directory the notebook runs in
notebook_dir = os.getcwd()
root_path = os.path.dirname(notebook_dir)

# Import video data (thanks Cowen & Keltner!)
video_csv_path = os.path.join(root_path, 'DATA/video_data.csv')
video_data = pd.read_csv(video_csv_path)

Unnamed: 0,Filename,Admiration,Adoration,Aesthetic Appreciation,Amusement,Anger,Anxiety,Awe,Awkwardness,Boredom,...,vitality,vulnerability,warmth,weakness,weariness,wonder,worry,wrath,yearning,zeal
0,0001.mp4,0.0,0.0,0.083333,0.0,0.083333,0.083333,0.083333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0002.mp4,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.083333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0003.mp4,0.083333,0.0,0.083333,0.66667,0.0,0.0,0.0,0.083333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0004.mp4,0.0,0.0,0.0,0.75,0.0,0.0,0.0,0.083333,0.0,...,0.0,0.0,0.0,0.0,0.11111,0.0,0.0,0.0,0.0,0.0
4,0005.mp4,0.0,0.0,0.0,0.0,0.0,0.33333,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.11111,0.0,0.0,0.0


In [55]:
# Every derived table below is re-indexed by the video's file name
filenames = video_data['Filename']

# 34 emotion category scores (columns 1-34):
# fraction of respondents who chose each category from a multiple-select
video_category_data = video_data.iloc[:, 1:35].set_index(filenames)

# 14 affective dimension scores (columns 35-48): 1-9 ratings
video_dimension_data = video_data.iloc[:, 35:49].set_index(filenames)

# 600 free response term scores (column 49 onwards):
# fraction of respondents who chose each term from a multiple-select dropdown
video_term_data = video_data.iloc[:, 49:].set_index(filenames)

# Distinct category scores: a hand-picked 27-label subset of the category
# columns, selected by name.
# NOTE(review): 'sexual desire' is lowercase unlike the other labels -
# presumably it matches the CSV header exactly; confirm against the data.
distinct_categories = [
    'Admiration', 'Adoration', 'Aesthetic Appreciation', 'Amusement',
    'Anger', 'Anxiety', 'Awe', 'Awkwardness', 'Boredom', 'Calmness',
    'Confusion', 'Craving', 'Disgust', 'Empathic Pain', 'Entrancement',
    'Excitement', 'Fear', 'Horror', 'Interest', 'Joy', 'Nostalgia',
    'Relief', 'Romance', 'Sadness', 'Satisfaction', 'sexual desire',
    'Surprise',
]
video_distinct_category_data = video_data[distinct_categories].set_index(filenames)

In [56]:
# Data supported matrices: one row per score, one column per video, so that
# v_to_c.dot(video_vector) = other_vector
v_to_c = video_category_data.T
v_to_d = video_dimension_data.T
v_to_t = video_term_data.T
v_to_dc = video_distinct_category_data.T

In [57]:
# Inverse matrices
def _pseudo_inverse(matrix):
    """Return the Moore-Penrose pseudo-inverse of ``matrix`` as a DataFrame.

    The row and column labels are swapped relative to ``matrix`` so the
    result can be chained with ``.dot`` in the opposite direction.
    """
    return pd.DataFrame(
        np.linalg.pinv(matrix.values),
        index=matrix.columns,
        columns=matrix.index,
    )


c_to_v = _pseudo_inverse(v_to_c)
d_to_v = _pseudo_inverse(v_to_d)
t_to_v = _pseudo_inverse(v_to_t)
dc_to_v = _pseudo_inverse(v_to_dc)

In [58]:
# Category inference matrices: category space -> video space -> target space
c_to_d, c_to_t, c_to_dc = (
    to_target.dot(c_to_v) for to_target in (v_to_d, v_to_t, v_to_dc)
)

In [59]:
# Dimension inference matrices: dimension space -> video space -> target space
d_to_c, d_to_t, d_to_dc = (
    to_target.dot(d_to_v) for to_target in (v_to_c, v_to_t, v_to_dc)
)

In [60]:
# Term inference matrices: term space -> video space -> target space
t_to_c, t_to_d, t_to_dc = (
    to_target.dot(t_to_v) for to_target in (v_to_c, v_to_d, v_to_dc)
)

In [61]:
# Distinct category inference matrices:
# distinct category space -> video space -> target space
dc_to_c, dc_to_d, dc_to_t = (
    to_target.dot(dc_to_v) for to_target in (v_to_c, v_to_d, v_to_t)
)

In [62]:
# SLOW IF FIRST TIME
# Install and import Spacy natural language utility
!pip install spacy
import spacy

# Load large english language model
!python -m spacy download en_core_web_md
nlp = spacy.load('en_core_web_lg')

[31mtensorflow 1.11.0 has requirement setuptools<=39.1.0, but you'll have setuptools 40.4.3 which is incompatible.[0m
[33mYou are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m

[93m    Linking successful[0m
    /home/russell/anaconda3/lib/python3.6/site-packages/en_core_web_md -->
    /home/russell/anaconda3/lib/python3.6/site-packages/spacy/data/en_core_web_md

    You can now load the model via spacy.load('en_core_web_md')



In [63]:
# Install and import spelling corrector
!pip install autocorrect
from autocorrect import spell

# Spell-correct a phrase word by word
def normalize(text):
    """Return ``text`` with every whitespace-separated word passed through
    ``spell`` and the results re-joined with single spaces."""
    corrected_words = map(spell, text.split())
    return ' '.join(corrected_words)

# Reduce every token of a phrase to its lemma
def lemmatize(text):
    """Run ``text`` through the ``nlp`` pipeline and return the tokens'
    lemmas joined with single spaces."""
    lemmas = (token.lemma_ for token in nlp(text))
    return ' '.join(lemmas)

[31mtensorflow 1.11.0 has requirement setuptools<=39.1.0, but you'll have setuptools 40.4.3 which is incompatible.[0m
[33mYou are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [64]:
# One row per free response term: the raw term, its lemmatized form, and the
# parsed doc of that lemmatized form (used later for similarity lookups)
term_data = pd.DataFrame({'term': video_term_data.columns})
term_data['term_lemma'] = term_data['term'].apply(lemmatize)
term_data['term_doc'] = term_data['term_lemma'].apply(nlp)
term_data.head()

Unnamed: 0,term,term_lemma,term_doc
0,a surge of pride,a surge of pride,"(a, surge, of, pride)"
1,abhorrence,abhorrence,(abhorrence)
2,admiration,admiration,(admiration)
3,adoration,adoration,(adoration)
4,adrenaline rush,adrenaline rush,"(adrenaline, rush)"


In [65]:
# Used by interpret_phrase to scale the video vector to unit length
import math

def normalize_vector(vector):
    """Scale ``vector`` to unit Euclidean length.

    Parameters
    ----------
    vector : pandas.Series or numpy.ndarray
        Any object supporting elementwise ``*`` and ``/``, ``.sum()`` and
        ``.copy()``.

    Returns
    -------
    Same type as ``vector``
        ``vector`` divided by its Euclidean norm. A zero vector has no
        direction, so an unchanged copy is returned instead of a vector
        full of NaN.
    """
    magnitude = math.sqrt((vector * vector).sum())
    if magnitude == 0:
        # Dividing by a zero norm would turn every component into NaN.
        return vector.copy()
    return vector / magnitude

In [96]:
# NOT USED
# Define function to select top n vector components
def skim(vector, n):
    """Keep the ``n`` largest components of ``vector`` and zero out the rest.

    Parameters
    ----------
    vector : pandas.Series
        Scores to filter.
    n : int
        Number of highest-valued entries to keep.

    Returns
    -------
    pandas.Series
        A new Series with the same index as ``vector``. The input is left
        untouched (it used to be overwritten in place, silently changing the
        caller's data).
    """
    top = vector.sort_values(ascending=False).head(n).index
    skimmed = vector.copy()
    # Zero every entry whose label is not among the top-n labels.
    skimmed[~skimmed.index.isin(top)] = 0
    return skimmed

In [110]:
# Define function to get term vector from phrase
def get_term_vector(phrase, n_terms):
    """Score every known term by its similarity to ``phrase``.

    The phrase is spell-corrected, lemmatized and parsed, then compared with
    each entry of ``term_data['term_doc']``. Only the ``n_terms`` highest
    similarities are kept; all other entries are set to 0.

    Returns a Series indexed by ``term_data['term']``.
    """
    cleaned_phrase = lemmatize(normalize(phrase))
    phrase_doc = nlp(cleaned_phrase)

    # Similarity of the phrase to every term, labelled by the term itself
    similarities = term_data['term_doc'].apply(phrase_doc.similarity)
    similarities.index = term_data['term']

    # Zero out everything except the best-matching terms
    best_terms = similarities.sort_values(ascending=False).head(n_terms).index
    is_best = similarities.index.isin(best_terms)
    similarities[~is_best] = 0
    return similarities

In [111]:
# QUALITY CHECK: phrase -> terms -> videos -> back to terms
term_vector = get_term_vector("I'm feeling pissed", 3)
video_vector = t_to_v.dot(term_vector)

# Show the strongest entries of each stage, ending with the round trip
for scores in (term_vector, video_vector, v_to_t.dot(video_vector)):
    print(scores.sort_values(ascending=False).head())

term
feeling mad           0.767951
feeling pissed off    0.767659
feeling scared        0.764903
zeal                  0.000000
feeling active        0.000000
Name: term_doc, dtype: float64
Filename
2067.mp4    1.738433
0070.mp4    1.577848
2116.mp4    1.415734
2115.mp4    1.273831
0226.mp4    1.110918
dtype: float64
feeling mad           7.679514e-01
feeling pissed off    7.676594e-01
feeling scared        7.649027e-01
disgust               1.776357e-15
shock                 1.165734e-15
dtype: float64


In [112]:
def interpret_phrase(phrase, n_terms):
    """Print how ``phrase`` maps onto terms, videos, categories and dimensions.

    Parameters
    ----------
    phrase : str
        Free-text description of a feeling.
    n_terms : int
        Number of best-matching free response terms to keep.

    The phrase is converted to a term vector, projected into video space with
    ``t_to_v``, scaled to unit length, and then projected into the category,
    affective dimension and distinct category spaces. Each stage is printed;
    nothing is returned.
    """
    term_vector = get_term_vector(phrase, n_terms)
    print('FREE RESPONSE TERMS:\n', term_vector.sort_values(ascending=False).head(n_terms), '\n')

    video_vector = normalize_vector(t_to_v.dot(term_vector))
    print('VIDEOS:\n', video_vector.sort_values(ascending=False).head(), '\n')

    # Full category set; this used to be printed under the same
    # 'DISTINCT CATEGORIES' heading as the distinct subset below.
    category_vector = v_to_c.dot(video_vector)
    print('CATEGORIES:\n', category_vector.sort_values(ascending=False), '\n')

    dimension_vector = v_to_d.dot(video_vector)
    print('AFFECTIVE DIMENSIONS:\n', dimension_vector, '\n')

    distinct_category_vector = v_to_dc.dot(video_vector)
    print('DISTINCT CATEGORIES:\n', distinct_category_vector.sort_values(ascending=False), '\n')


In [115]:
# Run the full pipeline on a simple phrase, keeping the 3 most similar terms
interpret_phrase("I feel sad", 3)

FREE RESPONSE TERMS:
 term
feeling sorry     0.907578
feeling scared    0.887173
feeling silly     0.872709
Name: term_doc, dtype: float64 

4.120614229648354
VIDEOS:
 Filename
0860.mp4    0.406645
0736.mp4    0.383534
1135.mp4    0.381702
2047.mp4    0.343230
1304.mp4    0.283619
dtype: float64 

DISTINCT CATEGORIES:
 Amusement                 0.350629
Horror                    0.209573
Anxiety                   0.179562
Sadness                   0.173975
Fear                      0.158420
Surprise                  0.139886
Interest                  0.133462
Awe                       0.081662
Sympathy                  0.055863
Aesthetic Appreciation    0.036633
Nostalgia                 0.029278
Admiration                0.015445
Awkwardness               0.015311
Boredom                   0.014502
Craving                   0.013381
Romance                   0.013105
Satisfaction              0.009736
Guilt                     0.006239
Pride                    -0.003866
Triumph       

In [81]:
# TAKEAWAYS
# Not great
# doesn't reliably find best match
# term dataset covers a very broad range but what I need is more nuance on common feelings
# could try universal sentence encoder, but I think that'll be slow and not much better

# THINGS TO TRY NEXT
# Training a learning model to extract terms effectively
# Assuming this is a+ and moving on to how I'd use it (to give me more info on what I'll need)

# LATER NOTE
# wasn't inverting matrices before; still not perfect