In [None]:
# Build five similarity features (one per emotion) for each memory,
# attach the expert-assigned label to each memory's feature row,
# and save the resulting dataframe to a .csv file.
from pathlib import Path

import numpy as np
import pandas as pd
from google.colab import drive
# Mount Google Drive; every data file read below lives under /content/drive/.
drive.mount('/content/drive/')

# Load one TF-IDF table per emotion code into a dictionary keyed by that code.
emotion_keywords = ["A", "D", "F", "H", "Sa"]
tfidf_dict = {
    code: pd.read_csv('/content/drive/MyDrive/tf-idf-files/' + code + "-tf-idf.csv")
    for code in emotion_keywords
}
tfidf_dict

In [None]:
# Load the cleaned NRC word list from Drive and display it.
nrc_list_path = "/content/drive/MyDrive/cleaned-data/clean_nrc_list.csv"
total_nrc_words = pd.read_csv(nrc_list_path)
total_nrc_words

In [None]:
# Split the NRC list into one sub-frame per emotion, keyed by the same short
# codes used for the TF-IDF tables.
emotion_by_code = {
    "A": "anger",
    "D": "disgust",
    "F": "fear",
    "H": "joy",
    "Sa": "sadness",
}
nrc_dict = {
    code: total_nrc_words[total_nrc_words["Emotion"] == emotion]
    for code, emotion in emotion_by_code.items()
}
nrc_dict

In [None]:
def filter_to_tfidf(nrc, tf, top_n=1024):
    """Keep the NRC words that also appear among the first rows of a TF-IDF table.

    Args:
        nrc: Table with a 'Word' column holding the NRC words for one emotion.
        tf: Table whose "Unnamed: 0" column lists the TF-IDF vocabulary. Only
            its first `top_n` entries are considered, so the table is
            presumably sorted by descending TF-IDF score -- TODO confirm.
        top_n: Number of leading TF-IDF entries to match against. Defaults to
            1024, the cut-off that used to be hard-coded here.

    Returns:
        list: The entries of nrc['Word'], in their original order and with
        duplicates kept, that occur among the first `top_n` TF-IDF entries.
    """
    # Build the lookup once. Previously the list was re-sliced and scanned
    # linearly for every single NRC word.
    top_words = set(list(tf["Unnamed: 0"])[:top_n])
    return [w for w in nrc['Word'] if w in top_words]

In [None]:
# A dictionary of the chosen words for each emotion: the NRC words that
# survive the TF-IDF filter, keyed by emotion code.
chosen_words_dict = {
    code: filter_to_tfidf(nrc_dict[code], tfidf_dict[code])
    for code in emotion_keywords
}
chosen_words_dict

In [None]:
import torch
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-mpnet-base-v2')

query_embedding = model.encode("London's population")
passage_embedding = model.encode(['London has 9,787,426 inhabitants at the 2011 census',
                                  'London is known for its finacial district',
                                 'Hello, Tehran is the capital of Iran!',
                                  'Britain is a very large kingdom.',
                                  'London is a big metropolice'])

x = util.cos_sim(query_embedding, passage_embedding)
x.shape
print("Similarity:", x)

In [None]:
# Embed each emotion's chosen words and wrap the result as a torch tensor for
# faster comparison. One entry per emotion, in emotion_keywords order.
embedded_words_list = [
    torch.from_numpy(model.encode(chosen_words_dict[code]))
    for code in emotion_keywords
]
embedded_words_list

In [None]:
# Load the cleaned ISEAR databank; its 'SIT' column is iterated as the memories.
isear_databank = pd.read_csv("/content/drive/MyDrive/cleaned-data/clean_isear_databank.csv")
isear_databank_size = len(isear_databank)

# For every memory, append one score per emotion: the mean cosine similarity
# between the memory's embedding and that emotion's word embeddings.
# The list stays flat, ordered memory by memory.
matrix_of_similarity = []
for situation in isear_databank['SIT']:
    situation_embedding = torch.from_numpy(model.encode(situation))
    matrix_of_similarity.extend(
        util.cos_sim(situation_embedding, word_embeddings).mean()
        for word_embeddings in embedded_words_list
    )
matrix_of_similarity

In [None]:
# Convert every similarity tensor to numpy, collect them in one array,
# and flatten that array to 1-D.
matrix_numpy = np.array([score.numpy() for score in matrix_of_similarity])
matrix_numpy = matrix_numpy.ravel()

# Reshape so that each row is one memory and each of the 5 columns one emotion.
matrix_alpha = matrix_numpy.reshape((isear_databank_size, 5))
matrix_alpha

array([[0.13547008, 0.13519649, 0.10703606, 0.09071749, 0.11186253],
       [0.1617297 , 0.1092312 , 0.10573095, 0.085523  , 0.11530074],
       [0.10793681, 0.08494516, 0.07652212, 0.04234904, 0.06555107],
       ...,
       [0.1192046 , 0.10722173, 0.100852  , 0.06380803, 0.1316656 ],
       [0.11271481, 0.09845766, 0.08893345, 0.06438081, 0.11722527],
       [0.13751973, 0.1456797 , 0.16216445, 0.13381398, 0.22810999]],
      dtype=float32)

In [None]:
# Name the five similarity columns after their emotions, then attach the label
# stored in the databank's "Field1" column.
emotion_columns = ['anger', 'disgust', 'fear', 'joy', 'sadness']
featured_dataframe = pd.DataFrame(matrix_alpha, columns=emotion_columns).assign(
    Label=isear_databank["Field1"]
)
featured_dataframe

Unnamed: 0,anger,disgust,fear,joy,sadness,Label
0,0.135470,0.135196,0.107036,0.090717,0.111863,anger
1,0.161730,0.109231,0.105731,0.085523,0.115301,anger
2,0.107937,0.084945,0.076522,0.042349,0.065551,anger
3,0.120658,0.100243,0.115948,0.073372,0.118159,anger
4,0.140638,0.097993,0.100632,0.059320,0.092775,anger
...,...,...,...,...,...,...
5465,0.098426,0.097545,0.107664,0.094018,0.119966,sadness
5466,0.160211,0.168394,0.184216,0.110078,0.241065,sadness
5467,0.119205,0.107222,0.100852,0.063808,0.131666,sadness
5468,0.112715,0.098458,0.088933,0.064381,0.117225,sadness


In [None]:
# Save the feature table to Drive. The target folder is created here if it is
# missing, so it no longer has to be created by hand before running this cell.
output_path = Path("/content/drive/MyDrive/featured_dataframe/featured_dataframe.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
featured_dataframe.to_csv(output_path, index = False)