# Imports

In [1]:
import glob
import re

import emoji
import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import pipeline



# Data Creation

In [2]:
# Glob pattern (not a single folder, despite the name): matches every `.pkl`
# file that sits directly inside any `data/data_*` directory.
FOLDER_NAME = "data/data_*/*.pkl"

In [3]:
# Load every pickled batch matched by FOLDER_NAME and stack them into one frame.
# The frames are collected in a list and concatenated once: calling pd.concat
# inside the loop re-copies all previously loaded rows on each iteration.
# Paths are sorted so the row order does not depend on directory listing order.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling --
# only point FOLDER_NAME at files from a trusted source.
batch_frames = [pd.read_pickle(each_file) for each_file in sorted(glob.glob(FOLDER_NAME))]
# pd.concat raises ValueError on an empty list, so fall back to an empty
# DataFrame (what the original loop produced) when no file matches.
df = pd.concat(batch_frames) if batch_frames else pd.DataFrame()
print(df)

                                                    0  \
0   RT @POTUS: As we celebrate the progress we’ve ...   
1   This #EarthDay, I'm happy to be meeting with P...   
2   RT @Khan__sir_patna: All of people wishes and ...   
3   RT @CapsCoalition: Biden Signs Executive Order...   
4   RT @tamannaahspeaks: Animals source their food...   
..                                                ...   
95  Did you know? The Canadian Coalition on Acid R...   
96  Today, EERE celebrates #EarthDay by making sev...   
97  RT @latestinspace: Happy Earth Day! These imag...   
98  Happy earth day I work between two constructio...   
99  RT @maxinevee: Happy Earth Day 🌳💚 https://t.co...   

                                 1  
0   Fri Apr 22 15:41:33 +0000 2022  
1   Fri Apr 22 15:41:33 +0000 2022  
2   Fri Apr 22 15:41:33 +0000 2022  
3   Fri Apr 22 15:41:33 +0000 2022  
4   Fri Apr 22 15:41:33 +0000 2022  
..                             ...  
95  Fri Apr 22 21:01:03 +0000 2022  
96  Fri Apr 22 21:0

In [4]:
# Give the two positional columns (0 and 1) of the loaded pickles readable names.
df.columns = ['text', 'date']

In [5]:
# Replace the concatenated index (the printed frame shows labels repeating from
# 0) with one continuous RangeIndex and drop the timestamp column.
# Written as a single non-mutating chain:
#   * reset_index(drop=True) discards the old index instead of materialising an
#     'index' column that then has to be dropped again;
#   * errors='ignore' lets the cell be re-run after 'date' is already gone
#     (the in-place version raised KeyError on a second run).
df = df.reset_index(drop=True).drop(columns=['date'], errors='ignore')

In [6]:
# Preview the first rows now that only the 'text' column remains.
df.head()

Unnamed: 0,text
0,RT @POTUS: As we celebrate the progress we’ve ...
1,"This #EarthDay, I'm happy to be meeting with P..."
2,RT @Khan__sir_patna: All of people wishes and ...
3,RT @CapsCoalition: Biden Signs Executive Order...
4,RT @tamannaahspeaks: Animals source their food...


In [7]:
# Inspect the last rows; after the index reset, the final label is the row count minus one.
df.tail()

Unnamed: 0,text
100094,Did you know? The Canadian Coalition on Acid R...
100095,"Today, EERE celebrates #EarthDay by making sev..."
100096,RT @latestinspace: Happy Earth Day! These imag...
100097,Happy earth day I work between two constructio...
100098,RT @maxinevee: Happy Earth Day 🌳💚 https://t.co...


# Data Cleaning and Preprocessing

In [8]:
def clean_text(text):
    ''' Remove URLs / website links and account tags from a piece of text.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The text with every link, every "RT @account:" marker and every
        "@account" mention cut out. Whitespace around a removed match is left
        untouched, so leading or doubled spaces may remain.
    '''
    # The dot after "www" is escaped so that only a literal "www." starts a
    # link. Unescaped, it matched "www" followed by ANY character and deleted
    # ordinary words such as "awwwesome".
    # NOTE: inside the character class, `$-_` is a range (from "$" to "_"); it
    # also covers "/", ":", "?" and "=", which is what lets URL paths match.
    url_pattern = r'(www\.|http[s]?://)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    text = re.sub(url_pattern, '', text)
    # The first alternative removes the whole "RT @account:" prefix (including
    # "RT" and the colon); the second removes any remaining "@account" mention.
    tag_pattern = r'(RT @([A-Za-z0-9_]+):)|(@([A-Za-z0-9_]+))' # Removes the RT @account tag: pattern as well
    text = re.sub(tag_pattern, '', text)
    return text

def get_account_tags(text):
    ''' Return the account names mentioned in the text, in order of appearance.

    Each name is reported without its leading "@"; an empty list means no
    mention was found.
    '''
    mention_regex = re.compile(r'@([A-Za-z0-9_]+)')
    return [mention.group(1) for mention in mention_regex.finditer(text)]

def get_hash_tags(text):
    ''' Extract the hashtags contained in the text.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    list of str
        Hashtag names in order of appearance, without the leading "#".
        Empty when the text contains no hashtag.
    '''
    # `\w` covers letters (including non-ASCII ones), digits and the
    # underscore. The previous ASCII-alphanumeric class cut tags such as
    # "#Save_Soil" at the underscore and "#Día" at the accented letter.
    tag_pattern = r'#(\w+)'
    return re.findall(tag_pattern, text)

def get_emoji_list(text):
    ''' Extract the emojis contained in the text.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    list of str or None
        The emojis in order of appearance, or None when the text has none.
    '''
    # `emoji.emoji_lis` was removed in emoji 2.0 in favour of `emoji.emoji_list`.
    # Prefer the new name and fall back to the old one so the notebook runs on
    # either release; both return dicts carrying the match under the 'emoji' key.
    find_emojis = getattr(emoji, 'emoji_list', None) or emoji.emoji_lis
    dict_loc_emoji = find_emojis(text)
    if dict_loc_emoji:
        return [each['emoji'] for each in dict_loc_emoji]
    return None

In [9]:
# Derive one feature column per extractor. Every extractor reads the raw
# 'text' column, so the cleaned text and the tag/emoji lists are independent.
feature_extractors = {
    'text_cleaned': clean_text,
    'account_tags': get_account_tags,
    'hash_tags': get_hash_tags,
    'emoji_lists': get_emoji_list,
}
for column_name, extractor in feature_extractors.items():
    df[column_name] = df['text'].apply(extractor)

In [10]:
# Spot-check the extracted columns next to the raw text.
df.head()

Unnamed: 0,text,text_cleaned,account_tags,hash_tags,emoji_lists
0,RT @POTUS: As we celebrate the progress we’ve ...,As we celebrate the progress we’ve made this ...,[POTUS],[],
1,"This #EarthDay, I'm happy to be meeting with P...","This #EarthDay, I'm happy to be meeting with P...",[PennEnvironment],"[EarthDay, GetTheLeadOut]",
2,RT @Khan__sir_patna: All of people wishes and ...,All of people wishes and happy earth day #Ear...,[Khan__sir_patna],[EarthDay],
3,RT @CapsCoalition: Biden Signs Executive Order...,Biden Signs Executive Order on Natural Capita...,[CapsCoalition],[EarthDay],
4,RT @tamannaahspeaks: Animals source their food...,Animals source their food from soil but our s...,"[tamannaahspeaks, SadhguruJV, cpsavesoil]",[MyconnectwithSoil],


# Sentiment Analysis

In [11]:
# Pin the checkpoint explicitly. Called without `model=`, pipeline() falls back
# to whatever default the installed transformers release ships for the task
# (and warns about it), so labels could change silently between environments.
# NOTE(review): this is the checkpoint the "sentiment-analysis" task has
# defaulted to -- confirm it matches the one used for the saved results.
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

In [12]:
# Classify all cleaned texts in one pipeline call instead of one Python-level
# call per row. truncation=True clips inputs that exceed the model's maximum
# sequence length, which would otherwise abort the whole run with an error.
# The pipeline returns one {'label', 'score'} dict per input, in input order.
sentiment_results = classifier(df['text_cleaned'].tolist(), truncation=True)
df['sentiment'] = [result['label'] for result in sentiment_results]

# Emotion Analysis

In [13]:
# import gc
# gc.collect()

In [14]:
# Hub id of the emotion classifier (plain string; the f-prefix had no placeholders).
MODEL = "cardiffnlp/twitter-roberta-base-emotion"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# save_pretrained(MODEL) writes a local directory whose relative path equals the
# hub id, and from_pretrained uses an existing local directory of that name.
# Saving only the model therefore left a directory without tokenizer files, so
# the tokenizer load above failed on the next run. Save both so a re-run finds
# a complete local copy.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

In [15]:
def get_emotion(text):
    ''' Predict the dominant emotion of a text with the module-level model.

    Parameters
    ----------
    text : str
        Text to classify (the notebook passes the cleaned tweet text).

    Returns
    -------
    str
        One of 'Anger', 'Joy', 'Optimism', 'Sadness'.
    '''
    # NOTE(review): the list order must follow the model's class indices
    # (0..3) -- confirm against the checkpoint's label mapping.
    labels = ['Anger', 'Joy', 'Optimism', 'Sadness']
    # Clip over-long inputs instead of letting the forward pass fail.
    # NOTE(review): 512 is assumed to be the model's positional limit; the cap
    # is passed explicitly in case the tokenizer config does not define one.
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    # Inference only: skip autograd bookkeeping for every one of the rows.
    with torch.no_grad():
        output = model(**encoded_input)
    # output[0] holds the logits; [0] selects the single sequence in the batch.
    logits = output[0][0].detach().numpy()
    # softmax is monotonic, so the argmax of the raw logits is the same class
    # the softmax scores would select.
    return labels[int(np.argmax(logits))]

In [16]:
# Label every cleaned tweet with its predicted emotion (one model call per row).
df['emotion'] = df['text_cleaned'].apply(get_emotion)

In [17]:
# Distribution of the predicted emotion labels across all tweets.
df['emotion'].value_counts()

Joy         53363
Optimism    38890
Anger        4368
Sadness      3478
Name: emotion, dtype: int64

In [18]:
# Persist the frame with every derived column (cleaned text, tags, emojis,
# sentiment, emotion). The file sits directly under data/, so the
# "data/data_*/*.pkl" input pattern does not pick it up on a later run.
df.to_pickle("data/data_feature_extracted_text_cleaned.pkl")