In [2]:
import re
import pandas as pd
from string import punctuation
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

def main():
    """Label every post in ``rleaves.csv`` with an emotion and save the result.

    Reads the ``raw`` column of ``rleaves.csv``, normalizes each post with
    ``cleanup``, runs ``get_emotion`` on each cleaned post, and writes a
    two-column CSV (``post``, ``emotion``) to ``rleaves_emotion.csv``.
    """
    # get_emotion() looks up `tokenizer` and `model` as module-level names.
    # Bound as plain locals here they were invisible to it, so every call
    # raised NameError; declare them global so the assignment below is shared.
    global tokenizer, model

    rleaves = pd.read_csv('rleaves.csv', encoding='utf-8')
    rleaves = rleaves['raw']
    rleaves = rleaves.apply(cleanup)

    model_name = "mrm8488/t5-base-finetuned-emotion"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    emotions = rleaves.apply(get_emotion)
    # '<pad> |</s>' is a regex alternation. pandas >= 2.0 treats the pattern
    # as a literal string unless regex=True is passed, which would leave the
    # special tokens in the labels.
    emotions = emotions.str.replace('<pad> |</s>', '', regex=True)

    rleaves.name = 'post'
    emotions.name = 'emotion'
    rleaves_emotion = pd.concat([rleaves, emotions], axis=1)
    rleaves_emotion.to_csv('rleaves_emotion.csv', index=False)

def cleanup(text):
    """Normalize one post into lowercase, lightly punctuated plain text.

    Steps, in order: lowercase; drop URLs; expand a trailing "n't" (when
    followed by whitespace) to " not"; turn hyphens into spaces; strip all
    ASCII punctuation except the period; strip curly quotes and runs of two
    or more periods; remove zero-width-space residue; collapse whitespace.

    Args:
        text: The raw post. Non-string values are coerced with ``str``;
            ``None`` and NaN (a missing CSV cell) are treated as empty.

    Returns:
        The cleaned text as a single-space-separated string ('' if missing).
    """
    # A missing cell arrives as None or float NaN (NaN != NaN); treat as empty
    # instead of crashing on .lower() or producing the literal string 'nan'.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()  # coerce first, then lowercase
    text = re.sub(r'http\S+', ' ', text)  # remove any URL
    text = re.sub(r"n't\s", ' not ', text)  # "don't go" -> "do not go"
    # The original pattern '-(?<!\d)' had a lookbehind that always succeeded
    # (the character before that position is the hyphen itself), so it
    # replaced every hyphen. Do the same thing explicitly.
    text = text.replace('-', ' ')
    my_punctuation = punctuation.replace('.', '')  # all punctuation except period
    text = text.translate(str.maketrans('', '', my_punctuation))
    text = re.sub(r'’|“|”|\.{2,}', '', text)  # curly quotes and ellipses
    # 'x200b' is what remains of the HTML entity '&#x200B;' once punctuation is
    # stripped; also catch a literal zero-width space (U+200B), which
    # str.split() does not treat as whitespace.
    text = re.sub('x200b|\u200b', ' ', text)
    return ' '.join(text.split())  # collapse runs of whitespace and trim

def get_emotion(text):
    """Return the decoded model output for a single piece of text.

    Uses the names ``tokenizer`` and ``model`` from module scope; both must be
    bound before this function is called. The text is encoded to PyTorch
    tensors, passed to ``model.generate``, and the first generated sequence is
    decoded back to a string. Decoding is done with the tokenizer's defaults,
    so the caller is responsible for stripping any special-token markers.
    """
    encoded = tokenizer.encode(text, return_tensors='pt')
    generated = model.generate(input_ids=encoded)
    decoded_sequences = [tokenizer.decode(sequence) for sequence in generated]
    return decoded_sequences[0]

# Run the pipeline only when this code is executed as the main module
# (not when it is imported from elsewhere).
if __name__ == "__main__":
    main()

In [14]:
# Creating time-period features.
# For every post that mentions both a number and a time unit, record which
# unit the number refers to (day/week/month/year) and the smallest number in
# the post. Expects a DataFrame `rleaves` with a 'raw' text column to already
# exist in the kernel.
actions = pd.DataFrame(rleaves['raw']).copy()  # independent frame; new columns never touch rleaves
# str.contains already yields booleans; na=False makes a missing post count as
# "no match" (wrapping a NaN result in np.where turned it into True).
actions['num'] = actions['raw'].str.contains(r'\d+', na=False)  # looks for all numbers
actions['period'] = actions['raw'].str.contains(r'\s*days* |\s*months* |\s*weeks* |\s*ye?a?rs* ', na=False)  # looks for time
# .copy() after filtering so the column assignments below act on a real frame
# instead of a slice (avoids SettingWithCopyWarning / silently lost writes).
actions = actions.loc[actions['num'] & actions['period']].copy()  # filter only those rows
actions['day'] = actions['raw'].str.contains(r'\d+\s*day[s\s]|\s*day\s*\d+', na=False)  # indicates whether number is day or not
actions['week'] = actions['raw'].str.contains(r'\d+\s*week[s\s]|\s*week\s*\d+', na=False)  # indicates whether number is week or not
actions['month'] = actions['raw'].str.contains(r'\d+\s*month[s\s]|\s*month\s*\d+', na=False)  # indicates whether number is month or not
actions['year'] = actions['raw'].str.contains(r'\d+\s*ye?a?r[s\s]*', na=False)  # indicates whether number is year or not
# Pull out every digit run, convert to int, and keep only the smallest value.
# Every remaining row has at least one digit run, so min() never sees an empty sequence.
actions['nums'] = actions['raw'].str.findall(r'\d+').apply(lambda found: min(int(n) for n in found))
actions = actions.loc[(actions['nums'] > 0) & (actions['nums'] < 800)].copy()  # remove outliers
# Keep only the first True per row: after a double cumulative sum across the
# four unit columns, the value 1 appears exactly at the first True.
actions[['day', 'week', 'month', 'year']] = actions[['day', 'week', 'month', 'year']].astype(int).cumsum(axis=1).cumsum(axis=1) == 1
actions = actions.drop(columns=['num', 'period'])  # helper flags are no longer needed

In [15]:
# a function to find two-token phrases: an adjectival modifier followed by a noun
def find_actions(text):
    """Extract every "amod + NOUN" token pair from a text.

    Args:
        text: A string, or an iterable of strings that is concatenated
            (without separator) before parsing.

    Returns:
        A list of the matched span texts, in the order the matcher reports
        them; duplicates are kept.
    """
    matcher = Matcher(sp.vocab)
    # First token: dependency label 'amod'; second token: part of speech NOUN.
    pattern = [{'DEP': 'amod'},
               {'POS': 'NOUN'}]
    # Pass a list of patterns: this call form is accepted by spaCy >= 2.2.2
    # and is the only one spaCy 3 supports (the old `add(key, None, pattern)`
    # signature raises there).
    matcher.add("find_actions", [pattern])
    doc = sp(''.join(text))
    return [doc[start:end].text for _match_id, start, end in matcher(doc)]

# Run find_actions on every entry of the 'raw' column and store the returned
# list of phrases in a new 'actions' column (this mutates `rleaves` in place).
rleaves['actions'] = rleaves['raw'].apply(find_actions)

In [16]:
# Inspect the phrases extracted for the row with index label 0.
rleaves['actions'][0]

['tough sense',
 'clear mind',
 'easy place',
 'pass thinking',
 'crazy tangent',
 'hard mind',
 'crazy tangent',
 'meditate experience',
 'crazy tangent',
 'long time',
 'ambitious starting',
 'comfortable position',
 'black room',
 'mental note',
 'obsessed work',
 'second week',
 'actual sensation',
 'moving nose',
 'repetitive minute',
 'repetitive idea',
 'chatty thought',
 'peaceful repetition',
 'repetitive place',
 'frustrated time',
 'peaceful repetition',
 'peaceful repetition',
 'inner thought',
 'few time',
 'inner dialog',
 'automatic influence']

In [17]:
# Score the phrases extracted for the row labelled 1: one row per phrase,
# with TextBlob's polarity and subjectivity for each.
b = pd.DataFrame(rleaves['actions'][1], columns=['actions'])
b['polarity'] = b['actions'].map(lambda phrase: TextBlob(phrase).polarity)
b['subjective'] = b['actions'].map(lambda phrase: TextBlob(phrase).subjectivity)

In [20]:
# Number of distinct values in the 'author' column (missing values are not counted by default).
rleaves['author'].nunique()

805

In [1]:
from streamlit_disqus import st_disqus

In [2]:
# Call st_disqus with the identifier "streamlit-disqus-demo".
# NOTE(review): presumably this embeds the Disqus thread for that shortname in a
# Streamlit app, in which case it does nothing useful inside a notebook kernel — confirm.
st_disqus("streamlit-disqus-demo")