# 1. Establish memory sentiment using sentiment analysis

In [None]:
import os, pandas as pd, numpy as np, matplotlib.pyplot as plt
import nltk
from scipy import stats


# set as: 
# 'baseline' for baseline rest
# 'soc' for video encoding, and
# 'rest' for consolidation
stim = 'soc'
exclude_subs = [1,6,14,18,21,31]


# CHANGE TO DIRECTORY CONTAINING PREPROCESSED FILES
src = '/Users/f0064z8/Dropbox (Dartmouth College)/DSNL Team Folder/EmpOrient_fMRI/Data/'

os.chdir('.')

# get participant recall data
txt_df = pd.read_csv(src+'processed/post_task/social_memory_freewrites.csv')
txt_df = txt_df[~txt_df['subject'].isin(exclude_subs)].reset_index(drop=True)
txt_df = txt_df.sort_values(by=['subject']).reset_index(drop=True)

# COLLATING TEXT ACROSS THE 4 PATIENT VIDEOS
subs_txt=[]
for sub in range(len(txt_df)):    

    sub_txt=''
    for patient in range (2,6): # looping through cols of the 4 patient videos
        sub_txt = sub_txt + txt_df[txt_df.columns[patient]].iloc[sub] + '\n'
    subs_txt.append(sub_txt)

txt_df['soc_txt']=subs_txt
txt_df = txt_df[['subject','soc_txt']]

print(txt_df)

# Combine text data across patients in a new df

In [3]:
stim = 'soc'
exclude_subs = [1,6,14,18,21,31]
src = '/Users/f0064z8/Dropbox (Dartmouth College)/DSNL Team Folder/EmpOrient_fMRI/Data/'
os.chdir('/Users/f0064z8/Dropbox (Dartmouth College)/DSNL Team Folder/Sid/behavioral/')

# get participant recall data
txt_df = pd.read_csv(src+'processed/post_task/social_memory_freewrites.csv')
txt_df = txt_df[~txt_df['subject'].isin(exclude_subs)].reset_index(drop=True)
txt_df = txt_df.sort_values(by=['subject']).reset_index(drop=True)

# COLLATING TEXT ACROSS THE 4 PATIENT VIDEOS
subs_txt=[]
for sub in range(len(txt_df)):    

    sub_txt=''
    for patient in range (2,6): # looping through cols of the 4 patient videos
        sub_txt = sub_txt + txt_df[txt_df.columns[patient]].iloc[sub] + '\n'
    subs_txt.append(sub_txt)

txt_df['soc_txt']=subs_txt
txt_df = txt_df[['subject','soc_txt']]

print(txt_df)

    subject                                            soc_txt
0         2  When you are young, there are some signs that ...
1         3  The diagnosis of CF, with its symptoms. \nCF g...
2         4  CF is a genetic disease, and babies are tested...
3         5  If there is a genetic trend of CF in your fami...
4         7  The sweat test checks Cl in the sweat to see i...
5         8  I remember that it has a lot to do with genes....
6        10  CF is a genetic disease that can only be contr...
7        11  This video was about the clues that show signs...
8        12  I remember the speaker talking about what test...
9        13  You can diagnose a baby through a sweat test w...
10       15  Babies are given a newborn screen (if they are...
11       16  This video was about Cystic Fibrosis Diagnosis...
12       19  I did not recall much from this video, at this...
13       20  THis video talked about how doctors test for C...
14       22  Oh man, not very much. I know in this vide

# Clean the text

In [4]:
clean_df = txt_df.copy()
# removing everything except alphabets
clean_df['soc_txt'] = clean_df['soc_txt'].str.replace("[^a-zA-Z#]", " ")
# removing short words
clean_df['soc_txt'] = clean_df['soc_txt'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))
# make all text lowercase
clean_df['soc_txt'] = clean_df['soc_txt'].apply(lambda x: x.lower())

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# tokenization
tokens = clean_df['soc_txt'].apply(lambda x: x.split())
# remove stop-words
tokens = tokens.apply(lambda x: [item for item in x if item not in stop_words])

# de-tokenization
detokenized = []
for sub in range(len(clean_df)):
    token = ' '.join(tokens[sub])
    detokenized.append(token)
    
clean_df['soc_txt'] = detokenized

# Conduct sentiment analysis

In [6]:
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

def sentiment(txt):
    tokens = nltk.word_tokenize(txt)
    score = 0
    abs_score = 0
    length = len(tokens)
    n_val = 0
    
    pos_words=[]
    neg_words=[]
    
    for token in tokens:
        
        token_score = sia.polarity_scores(token)['compound']        
        score = score + token_score
        abs_score = abs_score + np.abs(token_score)
        
        if token_score > 0:
            pos_words.append(f'{token}:{token_score}')
            n_val = n_val + 1
            
        elif token_score < 0:
            neg_words.append(f'{token}:{token_score}')
            n_val = n_val + 1
            
    return pos_words, neg_words, score, abs_score, np.float64(n_val), np.float64(length) 

sub_sent = pd.DataFrame(columns = ['pos_words','neg_words','sentiment_score','abs_score','n_val','length'])

for sub in range(len(clean_df)):
    pos_words, neg_words, score, abs_score, n_val, length = sentiment(clean_df['soc_txt'].iloc[sub])
    sub_sent = sub_sent.append(pd.DataFrame({"pos_words":[pos_words],\
                                             "neg_words":[neg_words],\
                                             "sentiment_score": [score],\
                                             "abs_score": [abs_score],\
                                             "n_val": [n_val],\
                                             "length": [length]},\
                                              index=[sub]))

soc_sent_df = txt_df.copy()
soc_sent_df['pos_words'] = sub_sent['pos_words']
soc_sent_df['neg_words'] = sub_sent['neg_words']
soc_sent_df['sentiment_score'] = sub_sent['sentiment_score']
soc_sent_df['abs_score'] = sub_sent['abs_score']
soc_sent_df['n_val'] = sub_sent['n_val']
soc_sent_df['length'] = sub_sent['length']
soc_sent_df.to_csv(f'./{stim}_sent_all_txt.csv', index=False)

soc_sent_df = pd.read_csv(f'./{stim}_sent_all_txt.csv')
soc_sent_df = soc_sent_df[~soc_sent_df['subject'].isin(exclude_subs)].reset_index(drop=True)

def top_words(words):
    
    words = [w for w in words if w.isalnum() or w in [',', '.', '-', ':', '(', ')']]
    words = "".join(words)
    words = list(words.split(","))

    dict_words = {}
    
    print(words)
    
    for word in words:
        dict_words[np.abs(float(word.split(':')[1]))] = word.split(':')[0].strip()
    
    dict_words = dict(sorted(dict_words.items()))
    
    return list(dict_words.values())

for sub in range(len(soc_sent_df)):
    soc_sent_df.at[sub,'pos_words'] = list(reversed(top_words(soc_sent_df['pos_words'].loc[sub])))
    soc_sent_df.at[sub,'neg_words'] = top_words(soc_sent_df['neg_words'].loc[sub])

    
soc_sent_df[['pos_words','neg_words','sentiment_score']].head()
soc_sent_df.to_csv(f'./{stim}_sent_all_txt.csv', index=False)


['like:0.3612', 'well:0.2732', 'like:0.3612']
['failure:-0.5106', 'trouble:-0.4019', 'hard:-0.1027']
['']


IndexError: list index out of range

# Merge all behavioral data

In [9]:
# def filter_subs(df):
#     df = df[~df['subject'].isin(exclude_subs)].reset_index(drop=True)
#     df = df.sort_values(by=['subject']).reset_index(drop=True)
#     return df

# state = pd.read_csv(src+'processed/task/empathy_data.csv')
# state = state.rename(columns={"socialVid_emp":"state_emp", "group":"order"})
# state = filter_subs(state)
# state = state[['subject','state_emp','order']]

# loneliness = pd.read_csv(src+'processed/trait/loneliness_data.csv')
# loneliness = loneliness.rename(columns={"loneliness_score":"loneliness"})
# loneliness = filter_subs(loneliness)
# loneliness = loneliness[['subject',"loneliness"]]

# sent = pd.read_csv('./soc_sent.csv')
# sent = filter_subs(sent)

# facts = pd.read_csv(src+'raw/post_task/facts.csv')
# facts = filter_subs(facts)
# facts = facts[['subject','facts']]

# merged_df = pd.concat([state,loneliness,sent,facts], axis=1)
# merged_df.to_csv('./behavioral_data_tmp.csv', index=False)
# df_corr = merged_df.corr().round(2)
# df_corr.insert(0, 'cols', df_corr.columns)
# df_corr.to_csv('./behavioral_data_corr_tmp.csv', index=False)