# Find the Top Ten Words of Recall words for each regressor
### For each story there will be 3 'Top Ten Words', One for each type of regressor
#### 1) Concatenate unique words across all participants
#### 2) For each regressor, regress out of the words the other two vectors. 
#### 3) Correlate the residual wvs with the target regressor. 
#### 4) Index the 10 highest values, and find the 10 highest words
#### 5) Put in dictionary


In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
import plotly.plotly as py
import plotly
import os
import glob
import math
import statistics
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import copy
from random import randrange
from sklearn.metrics import jaccard_score
import random
from sklearn.preprocessing import normalize
from sklearn.linear_model import LinearRegression
from scipy import stats


from sklearn.decomposition import PCA #for cluster analysis
from gensim.models import KeyedVectors #for word embeddings
import matplotlib.pyplot as plt #for plotting
import os #for importing
import pickle #for loading transcripts
from scipy.stats import pearsonr 

# from _DRAFT_20200604_functions import * #includes constants and score function
from tqdm import tqdm_notebook #for progress bar

%autosave 5

Autosaving every 5 seconds


# 0. Import recalls, and uncentered story and template vectors and sums 

In [2]:
# import recalls and sums
recalls = pickle.load( open( 'fr_recalls', "rb" ) )
sums = pickle.load( open( "fr_sums", "rb" ) )

#import non-centered story, template
templates = pickle.load( open( 'template_vectors', "rb" ) )
stories = pickle.load( open( 'actual_story_vectors', "rb" ) )

In [3]:
top_ten = {}

In [4]:
wikipath = 'rolando/wiki-news-300d-1M.vec'
wv_model = KeyedVectors.load_word2vec_format(wikipath)

In [5]:
wv_dim = 300

# FastText preprocessing, based on bittlingmayer/ft_wiki_preproc.py
# Remove special characters, put spaces between all tokens
SUB = ["s/’/'/g", "s/′/'/g", "s/''/ /g", "s/'/ ' /g", 's/“/"/g', 's/”/"/g', 's/"/ /g', "s/\\./ \\. /g", "s/<br \\/>/ /g", "s/, / , /g", "s/(/ ( /g", "s/)/ ) /g", "s/\\!/ \\! /g", "s/\\?/ \\? /g", "s/\\;/ /g", "s/\\:/ /g", "s/-/ - /g", "s/=/ /g", "s/=/ /g", "s/*/ /g", "s/|/ /g", "s/«/ /g", 
       "s/…/ /g", "s/‘/ /g", "s/í/ /g", "s/ñ/ /g", "s/\x84/ /g", "s/î/ /g", "s/ó/ /g", "s/\x83/ /g", "s/ï/ /g", "s/õ/ /g",
       "s/ò/ /g", "s/,/ /g", "s/ô/ /g", "s/\x92/ /g", "s/é/ /g", "s/\x8e/ /g", "s/â\x80¦/ /g", "s/\x91/ /g", "s/\x93/ /g",
       "s/\x94/ /g", "s/ã®/ /g", "s/ã¨/ /g", "s/ã©/ /g",
       "s/\â\x80\x99/ /g", "s/â\x80\x9c/ /g", "s/â\x80\x9d/ /g", "s/â\x80\x99/ /g", "s/â\x80\x9c/ /g", "s/â\x80\x98/ /g",
       "s/â/ /g"]

def __normalize_text(s):
    for sg in SUB:
        rep = sg.replace('\\','').split('/')
        s = s.replace(rep[1], rep[2])
    s = s.replace('/',' ')
    return s

def __spaces(s):
    return ' '.join(s.split())

def __digits(s):
    return ''.join(filter(lambda c: not c.isdigit(), s))

# def preproc(s):
#     return __punctuation(__spaces(__digits(__normalize_text(s.lower()))))

def preproc(s):
    return (__spaces(__digits(__normalize_text(s.lower()))))

def __punctuation(s):
    return s.translate(str.maketrans('', '', string.punctuation))

def word2vecSent(sentence, model = 'fasttext'):
    wv_dim = 300 #for glove and fasttext
    
    if model == 'glove':
        wvmodel = glove_model
    elif model == 'fasttext':
        wvmodel = wv_model
        
    words = preproc(sentence).split(' ')
    wv = np.zeros((len(words), wv_dim))
    for i in range(len(words)):
        if words[i] in wvmodel.vocab:
            wv[i,:] = wvmodel.word_vec(words[i])
    
    return words, wv

# I. 3 sets of top ten 
## A. Find unique words in recalls of each story
### 1. Concatenate words and word vectors in parallel across all participants

In [6]:
all_words = {}
all_wvs = {}

#iterate through all recalls in a story and concatente words and wvs of all stories
for key in recalls:
    words = np.zeros((0,1))
    wvs = np.zeros((0,300))
    for i in range(0,3):
        for j in range(0,len(recalls[key][i])):
            p_words, p_wvs = word2vecSent(recalls[key][i][j][0])
            # reshape p_words
            p_words = np.array(p_words)
            p_words = p_words.reshape(p_words.shape[0],-1)
            # stack p_words and p_wvs
            words = np.vstack((words, p_words))
            wvs = np.vstack((wvs, p_wvs))
    all_words[key] = words
    all_wvs[key] = wvs

### 2. Centering all words + story wvs + template wvs

In [8]:
#Concatenate all words from recall

concat = np.zeros((0,300))

for key in all_wvs:
    for i in range(0, len(all_wvs[key])):
        concat = np.vstack((concat, all_wvs[key][i]))
    


In [9]:
print(concat.shape)
for key in stories:
    concat = np.vstack((concat, stories[key]))

(37545, 300)


In [10]:
print(concat.shape)
for key in templates:
    concat = np.vstack((concat, templates[key]))
print(concat.shape)

(37561, 300)
(37569, 300)


In [11]:
centering_vec = np.mean(concat, axis = 0)

# template vectors

for key in templates:
    templates[key] = templates[key] - centering_vec
    
# recall vectors

for key in recalls:
    for i in range(0, len(all_wvs[key])):
        all_wvs[key][i] = all_wvs[key][i] - centering_vec
            
#story vectors

#make new dict with int key instead of string
int_stories = {}
for key in stories:
    int_stories[int(key)] = stories[key] - centering_vec

### 3. Find unique words and wvs

In [12]:
au_words = {}
au_wvs = {}

for key in all_words:
    # find unique wvs and their indices:
    unique_wvs, index = np.unique(all_wvs[key], axis=0, return_index=True)
    unique_words =  all_words[key][index]
    # put in dicts
    au_wvs[key] = unique_wvs
    au_words[key] = unique_words

## B. *Story* Top Ten
### A. Regress out of all the unique_wvs the loc and soc template wvs

In [30]:

resid_wvs = {}

for key in au_wvs:
    these_residuals = np.zeros((0, 300))
    loc = key%10
    soc = round(key/10)*10
    # Make the inputs of the regression
    # location template
    l_temp = templates[loc].reshape(templates[loc].shape[0],-1)
    # social template
    s_temp = templates[soc].reshape(templates[soc].shape[0],-1)
    # Concatenating inputes
    inputs = np.concatenate((l_temp, s_temp), axis = 1)
    for i in range(0,len(au_wvs[key])):
        output = np.copy(au_wvs[key][i])
        #output = output.reshape(output.shape[0],-1)
        model = LinearRegression().fit(inputs, output)
        ## Is subtracting the intercept correct?? 
        new_vec = output - np.dot(model.coef_, inputs.T)- model.intercept_
        these_residuals = np.vstack((these_residuals, new_vec))
    resid_wvs[key] = these_residuals

### C. Correlate each wv in the recall with the story wv

In [31]:
all_correls = {}

for key in resid_wvs:
    corr = np.zeros((0,1))
    this_story = stories[str(key)].reshape(stories[str(key)].shape[0], -1)
    for i in range(0, len(resid_wvs[key])):
        this_wv = resid_wvs[key][i].reshape(resid_wvs[key][i].shape[0], -1)
        this_correl = np.corrcoef(this_story.T, this_wv.T)
        # add to corr
        corr = np.vstack((corr, this_correl[0,1]))
    all_correls[key] = corr

### D. Top Ten values' indices and words

In [32]:
for key in all_correls:
    print(key)
    this_corr = all_correls[key]
    this_corr = np.ravel(this_corr)
    story_index = this_corr.argsort()[-10:][::-1]
    #print(story_index)
    # index the top words
    top_words = np.zeros((0,1))
    for i in range(0, 10):
        top_words = np.vstack((top_words, au_words[key][story_index[i]]))
    top_words = top_words.ravel()
    print(top_words)
    top_ten[key] = {'story': top_words}

11
['hadn' 'could' 'might' 'wasn' 'don' 'can' 'doesn' 'hear' 'got' 'make']
33
['could' 'doesn' 'wasn' 'can' 'isn' 'should' 'would' 'make' 'extremely'
 'knows']
44
['couldn' 'could' 'wasn' 'can' 'doesn' 'hasn' 'don' 'seeing' 'looked'
 'nodded']
23
['could' 'didn' 'wasn' 'don' 'got' 'seeing' 'looked' 'cannot' 'knows'
 'would']
12
['couldn' 'could' 'might' 'didn' 'can' 'don' 'doesn' 'slightly' 'ago'
 'should']
21
['could' 'nodded' 'wasn' 'ago' 'can' 'make' 'always' 'gladly' 'looked'
 'got']
13
['wouldn' 'could' 'might' 'wasn' 'didn' 'don' 'can' 'doesn' 'always'
 'nodded']
42
['could' 'wouldn' 'might' 'hadn' 'slightly' 'can' 'doesn' 'got' 'shouldn'
 'make']
43
['could' 'didn' 'sounded' 'got' 'feels' 'looked' 'looks' 'seeing' 'hear'
 'would']
32
['could' 'didn' 'wasn' 'seeing' 'make' 'would' 'isn' 'got' 'amount' 'want']
34
['could' 'don' 'can' 'cannot' 'should' 'would' 'during' 'wants' 'amount'
 'have']
22
['couldn' 'didn' 'hadn' 'wasn' 'onto' 'got' 'don' 'looked' 'make' 'hear']
41
['could'

## B. *Location* Top Ten
### 1. Regress out of all the unique_wvs the story and soc template wvs

In [34]:

resid_wvs = {}

for key in au_wvs:
    these_residuals = np.zeros((0, 300))
    soc = round(key/10)*10
    # Make the inputs of the regression
    # story vector
    story_vec = stories[str(key)].reshape(stories[str(key)].shape[0],-1)
    # social template
    s_temp = templates[soc].reshape(templates[soc].shape[0],-1)
    # Concatenating inputes
    inputs = np.concatenate((story_vec, s_temp), axis = 1)
    for i in range(0,len(au_wvs[key])):
        output = np.copy(au_wvs[key][i])
        #output = output.reshape(output.shape[0],-1)
        model = LinearRegression().fit(inputs, output)
        ## Is subtracting the intercept correct?? 
        new_vec = output #- np.dot(model.coef_, inputs.T)- model.intercept_
        these_residuals = np.vstack((these_residuals, new_vec))
    resid_wvs[key] = these_residuals


### 2. Correlate each wv in the recall with the story wv

In [35]:
all_correls = {}

for key in resid_wvs:
    corr = np.zeros((0,1))
    loc = key%10
    this_loc = templates[loc].reshape(templates[loc].shape[0], -1)
    for i in range(0, len(resid_wvs[key])):
        this_wv = resid_wvs[key][i].reshape(resid_wvs[key][i].shape[0], -1)
        this_correl = np.corrcoef(this_loc.T, this_wv.T)
        # add to corr
        corr = np.vstack((corr, this_correl[0,1]))
    all_correls[key] = corr


### 3. Top Ten values' indices and words

In [36]:
for key in all_correls:
    print(key)
    this_corr = all_correls[key]
    this_corr = np.ravel(this_corr)
    story_index = this_corr.argsort()[-10:][::-1]
    #print(story_index)
    # index the top words
    top_words = np.zeros((0,1))
    for i in range(0, 10):
        top_words = np.vstack((top_words, au_words[key][story_index[i]]))
    top_words = top_words.ravel()
    print(top_words)
    top_ten[key]['loc'] =  top_words

11
['restaurant' 'food' 'menus' 'menu' 'diner' 'restuarant' 'foods' 'meal'
 'waiter' 'dessert']
33
['grocery' 'store' 'groceries' 'stores' 'supermarket' 'checkout' 'shop'
 'checkouts' 'purchase' 'cashier']
44
['lecture' 'class' 'lectures' 'lecturing' 'classroom' 'assignment' 'hall'
 'lesson' 'professor' 'lecturer']
23
['grocery' 'store' 'groceries' 'supermarket' 'checkout' 'shop' 'shopping'
 'checkouts' 'grocer' 'shoppers']
12
['airport' 'airports' 'plane' 'flight' 'boarding' 'gate' 'gates'
 'security' 'departure' 'terminal']
21
['restaurant' 'food' 'menus' 'menu' 'chef' 'dishes' 'meal' 'waiter'
 'dessert' 'eating']
13
['grocery' 'store' 'groceries' 'checkout' 'shopping' 'cashier' 'items'
 'cart' 'food' 'products']
42
['airport' 'plane' 'flight' 'boarding' 'gate' 'airplane' 'security'
 'departure' 'planes' 'destination']
43
['grocery' 'store' 'groceries' 'supermarket' 'checkout' 'shop' 'shopping'
 'warehouse' 'cashier' 'items']
32
['airport' 'plane' 'flight' 'gate' 'gates' 'flights' 'a

## C. *Social* Top Ten
### 1. Regress out of all the unique_wvs the loc and soc template wvs

In [37]:
resid_wvs = {}

for key in au_wvs:
    these_residuals = np.zeros((0, 300))
    loc = key%10
    # Make the inputs of the regression
    # story vector
    story_vec = stories[str(key)].reshape(stories[str(key)].shape[0],-1)
    # location template
    l_temp = templates[loc].reshape(templates[loc].shape[0],-1)
    # Concatenating inputes
    inputs = np.concatenate((story_vec, l_temp), axis = 1)
    for i in range(0,len(au_wvs[key])):
        output = np.copy(au_wvs[key][i])
        #output = output.reshape(output.shape[0],-1)
        model = LinearRegression().fit(inputs, output)
        ## Is subtracting the intercept correct?? 
        new_vec = output - np.dot(model.coef_, inputs.T)- model.intercept_
        these_residuals = np.vstack((these_residuals, new_vec))
    resid_wvs[key] = these_residuals

### 2. Correlate each wv in the recall with the social wv

In [38]:
all_correls = {}

for key in resid_wvs:
    corr = np.zeros((0,1))
    soc = round(key/10)*10
    this_soc = templates[soc].reshape(templates[soc].shape[0], -1)
    for i in range(0, len(resid_wvs[key])):
        this_wv = resid_wvs[key][i].reshape(resid_wvs[key][i].shape[0], -1)
        this_correl = np.corrcoef(this_soc.T, this_wv.T)
        # add to corr
        corr = np.vstack((corr, this_correl[0,1]))
    all_correls[key] = corr

### D. Top Ten values' indices and words

In [39]:
for key in all_correls:
    this_corr = all_correls[key]
    this_corr = np.ravel(this_corr)
    story_index = this_corr.argsort()[-10:][::-1]
    print(story_index)
    # index the top words
    top_words = np.zeros((0,1))
    for i in range(0, 10):
        top_words = np.vstack((top_words, au_words[key][story_index[i]]))
    top_words = top_words.ravel()
    print(top_words)
    top_ten[key]['soc'] = top_words

[471 162 233 264 445 211 462 109 297 307]
['breakup' 'breaking' 'for' 'of' 'the' 'break' 'relationship' 'what'
 'considering' 'ending']
[256 310 144 273 442 108 416 472 457 471]
['deal' 'partnership' 'business' 'agreement' 'investment' 'success' 'the'
 'handshake' 'deals' 'competitor']
[376 258 248 185  94 241 219 375 280 220]
['the' 'two' 'first' 'other' 'what' 'couple' 'of' 'time' 'when' 'next']
[395 364 320 361 381 536 489 462 393 497]
['ring' 'diamond' 'proposed' 'proposing' 'proposal' 'marriage' 'diamonds'
 'the' 'planning' 'engagement']
[401 106 187 172 196  70 372 150 392 452]
['breakup' 'breaking' 'split' 'for' 'of' 'what' 'the' 'break'
 'relationship' 'departure']
[362 474 339 294 355 487 447 423 360 452]
['ring' 'wedding' 'diamond' 'proposed' 'proposal' 'marriage' 'diamonds'
 'the' 'planning' 'engagement']
[487 152 240 455 266 478 215 106 334 345]
['breakup' 'breaking' 'for' 'the' 'of' 'relationship' 'break' 'what'
 'with' 'by']
[385 268 224 302 301 187 226 290 340 200]
['the

# II. Presenting Top Ten Words

In [43]:
story_order = [11,12,13,14,21,22,23,24,31,32,33,34,41,42,43,44]

for story in story_order:
    print('\n\n',story)
    for keys in top_ten[story]:
        print(keys)
        print(top_ten[story][keys])



 11
story
['hadn' 'could' 'might' 'wasn' 'don' 'can' 'doesn' 'hear' 'got' 'make']
loc
['restaurant' 'food' 'menus' 'menu' 'diner' 'restuarant' 'foods' 'meal'
 'waiter' 'dessert']
soc
['breakup' 'breaking' 'for' 'of' 'the' 'break' 'relationship' 'what'
 'considering' 'ending']


 12
story
['couldn' 'could' 'might' 'didn' 'can' 'don' 'doesn' 'slightly' 'ago'
 'should']
loc
['airport' 'airports' 'plane' 'flight' 'boarding' 'gate' 'gates'
 'security' 'departure' 'terminal']
soc
['breakup' 'breaking' 'split' 'for' 'of' 'what' 'the' 'break'
 'relationship' 'departure']


 13
story
['wouldn' 'could' 'might' 'wasn' 'didn' 'don' 'can' 'doesn' 'always'
 'nodded']
loc
['grocery' 'store' 'groceries' 'checkout' 'shopping' 'cashier' 'items'
 'cart' 'food' 'products']
soc
['breakup' 'breaking' 'for' 'the' 'of' 'relationship' 'break' 'what'
 'with' 'by']


 14
story
['couldn' 'could' 'don' 'can' 'doesn' 'didn' 'shouldn' 'make' 'sorry'
 'hear']
loc
['lecture' 'class' 'lectures' 'teaching' 'classroom'

In [41]:
top_ten

{11: {'story': array(['hadn', 'could', 'might', 'wasn', 'don', 'can', 'doesn', 'hear',
         'got', 'make'], dtype='<U32'),
  'loc': array(['restaurant', 'food', 'menus', 'menu', 'diner', 'restuarant',
         'foods', 'meal', 'waiter', 'dessert'], dtype='<U32'),
  'soc': array(['breakup', 'breaking', 'for', 'of', 'the', 'break', 'relationship',
         'what', 'considering', 'ending'], dtype='<U32')},
 33: {'story': array(['could', 'doesn', 'wasn', 'can', 'isn', 'should', 'would', 'make',
         'extremely', 'knows'], dtype='<U32'),
  'loc': array(['grocery', 'store', 'groceries', 'stores', 'supermarket',
         'checkout', 'shop', 'checkouts', 'purchase', 'cashier'],
        dtype='<U32'),
  'soc': array(['deal', 'partnership', 'business', 'agreement', 'investment',
         'success', 'the', 'handshake', 'deals', 'competitor'], dtype='<U32')},
 44: {'story': array(['couldn', 'could', 'wasn', 'can', 'doesn', 'hasn', 'don', 'seeing',
         'looked', 'nodded'], dtype='<U32'