# Imports

In [None]:
import pandas as pd
import re
import emoji
import glob
from transformers import pipeline
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax

# Data Creation

In [None]:
# Glob pattern selecting every .pkl file inside the data/data_* sub-folders.
FOLDER_NAME = "data/data_*/*.pkl"

In [None]:
# Load every pickled frame matching FOLDER_NAME and stack them into one DataFrame.
# The frames are collected first and concatenated once; concatenating inside the
# loop would re-copy the accumulated rows for every file.
# Paths are sorted because glob returns them in arbitrary order, which would
# otherwise make the row order differ between runs/machines.
# NOTE: pd.read_pickle executes pickle payloads -- only load files from a trusted source.
frames = [pd.read_pickle(each_file) for each_file in sorted(glob.glob(FOLDER_NAME))]
# pd.concat raises on an empty list, so fall back to an empty frame when nothing matched.
df = pd.concat(frames) if frames else pd.DataFrame()
print(df)

In [None]:
# Give the two loaded columns fixed names: tweet text and its date.
# NOTE(review): assumes every pickled frame has exactly two columns in this order -- confirm.
df.columns = ['text', 'date']

In [None]:
# Replace the per-file indices left over from concatenation with a fresh
# positional index and discard the 'date' column, which is not used below.
# drop=True throws the old index away directly instead of first turning it into
# a column and then dropping it by the name 'index' (which only exists when the
# old index is unnamed).
df = df.reset_index(drop=True).drop(columns=['date'])

In [None]:
# Preview the first rows of the combined frame.
df.head()

In [None]:
# Preview the last rows of the combined frame.
df.tail()

# Data Cleaning and Preprocessing

In [None]:
def clean_text(text):
    """Return *text* with URLs, retweet prefixes and account mentions removed.

    Args:
        text: The raw tweet text.

    Returns:
        The text with every ``www.``/``http(s)://`` link, every ``RT @name:``
        prefix and every ``@name`` mention replaced by an empty string.
        Surrounding whitespace is left untouched.
    """
    # The dot after "www" is escaped so only a literal "www." starts a link;
    # unescaped it matches any character and words such as "awwwwww" get deleted.
    url_pattern = r'(www\.|http[s]?://)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    text = re.sub(url_pattern, '', text)
    # The "RT @name:" alternative comes first so the whole retweet prefix is
    # removed, not just the mention inside it.
    tag_pattern = r'(RT @([A-Za-z0-9_]+):)|(@([A-Za-z0-9_]+))'
    text = re.sub(tag_pattern, '', text)
    return text

def get_account_tags(text):
    """Return every account name mentioned in *text*, without the leading '@'.

    An account name is a run of ASCII letters, digits or underscores directly
    after an '@'. The result is a list in order of appearance (empty when the
    text has no mentions).
    """
    mention_re = re.compile(r'@([A-Za-z0-9_]+)')
    return mention_re.findall(text)

def get_hash_tags(text):
    """Return every hashtag in *text*, without the leading '#'.

    Args:
        text: The raw tweet text.

    Returns:
        A list of hashtag bodies in order of appearance; empty when the text
        contains no hashtags.
    """
    # Underscore is part of the character class (as it is for account tags) so
    # that a tag like "#hello_world" is captured whole instead of as "hello".
    tag_pattern = r'#([A-Za-z0-9_]+)'
    return re.findall(tag_pattern, text)

def get_emoji_list(text):
    """Return the emojis found in *text*, in order of appearance.

    Args:
        text: The raw tweet text.

    Returns:
        A list with one entry per emoji occurrence, or ``None`` when the text
        contains no emoji.
    """
    # Newer releases of the `emoji` package expose this lookup as `emoji_list`;
    # older ones only provide `emoji_lis`. Both return a list of dicts that
    # carry the matched character under the 'emoji' key.
    find_emojis = getattr(emoji, 'emoji_list', None) or getattr(emoji, 'emoji_lis')
    found = find_emojis(text)
    if not found:
        return None
    return [each['emoji'] for each in found]

In [None]:
# Derive the feature columns from the raw tweet text: a cleaned copy plus the
# mentions, hashtags and emojis it contains.
raw_text = df['text']
df['text_cleaned'] = raw_text.apply(clean_text)
df['account_tags'] = raw_text.apply(get_account_tags)
df['hash_tags'] = raw_text.apply(get_hash_tags)
df['emoji_lists'] = raw_text.apply(get_emoji_list)

In [None]:
# Preview the first rows, now including the extracted feature columns.
df.head()

# Sentiment Analysis

In [None]:
# Build a Hugging Face sentiment-analysis pipeline. No model is named here, so
# whichever checkpoint the library defaults to for this task is used.
# NOTE(review): consider passing an explicit model so results stay reproducible.
classifier = pipeline("sentiment-analysis")

In [None]:
# Classify each tweet and keep only the label of the first result.
# This runs on the raw 'text' column, not on 'text_cleaned'.
df['sentiment'] = df['text'].apply(lambda tweet: classifier(tweet)[0]['label'])

# Emotion Analysis

In [None]:
# import gc
# gc.collect()

In [None]:
# Hub id of the emotion classification checkpoint (plain string; it has no
# placeholders, so no f-string is needed).
MODEL = "cardiffnlp/twitter-roberta-base-emotion"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# save_pretrained(MODEL) writes to a local directory with the same name as the
# hub id, and later from_pretrained(MODEL) calls resolve to that directory.
# Save the tokenizer next to the model so a re-run finds both there; saving only
# the model leaves the directory without tokenizer files.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

In [None]:
def get_emotion(text):
    """Return the emotion label the model scores highest for *text*.

    The text is tokenized, passed through the module-level ``model`` and the
    index of the largest softmax score is mapped to one of 'anger', 'joy',
    'optimism' or 'sadness'.
    """
    # NOTE(review): this order is presumed to match the model's class indices -- confirm.
    emotion_names = ['anger', 'joy', 'optimism', 'sadness']
    model_inputs = tokenizer(text, return_tensors='pt')
    # First element of the model output, first row: the scores for this one text.
    raw_scores = model(**model_inputs)[0][0]
    probabilities = softmax(raw_scores.detach().numpy())
    return emotion_names[np.argmax(probabilities)]

In [None]:
# Label every tweet with its highest-scoring emotion (computed on the raw 'text' column).
df['emotion'] = df['text'].apply(get_emotion)

In [None]:
# Show how many rows received each emotion label.
df['emotion'].value_counts()

In [None]:
# Write the DataFrame, including the derived columns, to a pickle file.
df.to_pickle("data/data_feature_extracted.pkl")