## Libraries

In [136]:
import re
import string

import numpy as np
import pandas as pd
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import emoji
import unidecode
import spacy


pd.set_option('max_colwidth', 500)
stop_words = spacy.lang.en.stop_words.STOP_WORDS

## Read data

In [2]:
train_path = '../datasets/emotion_detection_semeval2017/combined/train.csv'

In [3]:
train = pd.read_csv(train_path)

In [4]:
train.sample(frac=1).head(10)

Unnamed: 0,id,tweet,emotion,intensity
3650,10894,i live and die for mchanzo honeymoon crashing and burning the second they move in together,anger,0.479
2318,30675,All the fans wanted Man Utd at home in the next round...\nAre you cheering for Northampton or Man Utd right now?\n#lufc,joy,0.292
1808,30165,A joyous first webiversary/web mitzvah to Smithsonian's @WeiPoints!! @brianwolly @jackie_mansky @bethpylieberman @bilbo @mazeltov,joy,0.667
3494,21028,"@PanicAtTheDisco hey, y'all announced it like immediately after I asked. Nice. Thanks y'all",fear,0.25
1533,40676,Do not be discouraged by a slowing sales market. This will test your business model and pinpoint #strengths and #weaknesses.' @Ken_Dunn,sadness,0.271
3675,10919,"Why to have vanity sizes?Now sizes S,XS(evenXXS sometimes) are too big, WTF?! Dear corporate jerks, Lithuania didn't need this. #rant #angry",anger,0.708
2833,20367,@BaileyDemented @hsmitty3 ill kill u if u bully her 😤😤😤,fear,0.583
3512,21046,Not sure that men can handle a woman that's got her crap together. #independent,fear,0.229
2186,30543,lol! no mention of pak PM or even his speech on any international news channel and pakis are rejoicing as if the world stands with them,joy,0.396
1278,40421,"@DxfyingGrxvity - that were rather forlorn, scanning the witches house before resting back on Elphie. 'The Grimmerie is gone.'",sadness,0.458


In [5]:
sample_tweet = "The moment you bring her to meet your best friend and you're nervous af! 😬😆 #nervous #thefriendtest"

## Preprocessing

In [6]:
# Initialize Ekphrasis parsers
text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", # Options are 'twitter' or 'english'
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=False).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

spacy_preprocessor = TextPreProcessor(
    omit=['email', 'phone', 'user', 'time', 'url', 'date', 'hashtag'],
    corrector='twitter',
    segmenter='twitter',
    tokenizer=lambda x: x.split(),
    annotate=[]
)

  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Reading twitter - 1grams ...
Reading twitter - 2grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


Reading twitter - 1grams ...
Reading twitter - 1grams ...
You can't omit/backoff and unpack hashtags!
 unpack_hashtags will be set to False


In [7]:
# Initialize spacy parser
nlp = spacy.load('en_core_web_lg')

In [39]:
def preprocess_raw_tweet(twt, preprocessor=text_processor, spacy_preprocessor=nlp):
    
    # Demojize
    def demojize(parsed):
        if emoji.emoji_count(' '.join(parsed)) > 0:
            emoji_parse = []
            for tok in parsed:
                if emoji.emoji_count(tok) > 0:
                    emoji_list = emoji.demojize(tok).strip(':').split('_')
                    emoji_list.remove('face')
                    emoji_parse.extend(emoji_list)
                else:
                    emoji_parse.append(tok)
        else:
            emoji_parse = parsed
        return emoji_parse
    
    def clean_accents(parsed):
        return [unidecode.unidecode(x) for x in parsed]
    
    def clean_tweet_for_spacy(twt):
        # Remove emoji, hashtags, user handles and diacritics/accents
        # and return the sentence as a string
        cleaned = re.sub(r'#\w+', ' ', twt)
        emoji_cleaned = ''
        emoji_count = 0
        for char in cleaned:
            if char in emoji.UNICODE_EMOJI:
                emoji_count += 1
                continue
            emoji_cleaned += char

        emoji_cleaned = re.sub(r'\s\s+', '', emoji_cleaned)
        return emoji_cleaned, emoji_count    
    
    preprocessed_tweet = clean_accents(demojize(preprocessor.pre_process_doc(twt)))
    spacy_raw_text, emoji_count = clean_tweet_for_spacy(twt)
    spacy_preprocessed_doc = nlp(spacy_raw_text)
    return preprocessed_tweet, spacy_preprocessed_doc, emoji_count

In [40]:
print(sample_tweet)
print(preprocess_raw_tweet(sample_tweet))

The moment you bring her to meet your best friend and you're nervous af! 😬😆 #nervous #thefriendtest
(['The', 'moment', 'you', 'bring', 'her', 'to', 'meet', 'your', 'best', 'friend', 'and', 'you', 'are', 'nervous', 'af', '!', 'grimacing', 'grinning', 'squinting', '<hashtag>', 'nervous', '</hashtag>', '<hashtag>', 'the', 'friend', 'test', '</hashtag>'], The moment you bring her to meet your best friend and you're nervous af!, 2)


## Feature engineering

In [42]:
ekphrasis_parse, spacy_obj, emoji_count = preprocess_raw_tweet(sample_tweet)

In [58]:
spacy_obj

The moment you bring her to meet your best friend and you're nervous af!

In [62]:
# No. of tokens
def get_no_of_tokens(spacy_obj):
    return len(spacy_obj)

# Average token length
def avg_token_length(spacy_obj):
    len_list = [len(x) for x in spacy_obj]
    return float(np.mean(np.array(len_list)))
    
# Upper case token ratio
def upper_case_tokens(spacy_obj):
    return len([x for x in spacy_obj if x.is_upper])/len(spacy_obj)

# Title case token ratio
def title_case_tokens(spacy_obj):
    return len([x for x in spacy_obj if x.is_title])/len(spacy_obj)

# Get exclamation mark counts
def exclamation_mark_count(spacy_obj):
    return len([x for x in spacy_obj if x.text == '!'])

# Get question mark counts
def question_mark_count(spacy_obj):
    return len([x for x in spacy_obj if x.text == '?'])

# Get quote mark counts
def quote_mark_count(spacy_obj):
    return len([x for x in spacy_obj if x.text == '"' or x.text == "'"])

In [167]:
# Extracting 'n' most common tokens for each emotion
def get_n_most_valuable_tokens(train):
    
    n_valuable = (
        train
        [['tweet', 'emotion', 'intensity']]
        .assign(intensity = lambda x: 2*x['intensity'] - 1)
        .assign(tweet = lambda x: x['tweet'].str.split()) # Split on whitespace
        .explode('tweet')
        .assign(tweet = lambda x: x['tweet'].str.replace('[^\w\s]','').str.strip().str.lower()) # Remove punctuation, strip, lower
        .loc[lambda x: ~x['tweet'].isin(stop_words)]  # Remove stop words
        .groupby(by=['tweet', 'emotion'])
        .agg({'intensity': 'mean', 'emotion': 'count'})
        .rename(columns={'emotion': 'word_count', 'intensity': 'avg_intensity'})
        .reset_index()
        .assign(abs_intensity = lambda x: np.abs(x['avg_intensity']))
        .sort_values(by=['abs_intensity'], ascending=False)
        .loc[lambda x: x['word_count'] > 5]
        .loc[lambda x: x['abs_intensity'] > 1e-3]
        .loc[lambda x: x['tweet'] != '']
        
    )
    return n_valuable

def get_n_most_valuable_token_score(spacy_obj, emotion, n_valuable_data):
    score = 0
    for tok in spacy_obj:
        try:
            score += n_valuable_data.loc[(n_valuable_data['tweet'] == tok.text) & (n_valuable_data['emotion'] == emotion), 'avg_intensity'].iloc[0]
        except IndexError:
            pass
    return score
    
n_valuable_data = get_n_most_valuable_tokens(train)

get_n_most_valuable_token_score(nlp(sample_tweet), 'fear', n_valuable_data)

1.0325666666666669

In [161]:
# Swear words
def get_swear_word_list():
    with open('../datasets/swear_words.csv', 'r') as f:
        swear_words = f.read().split(',')

    with open('../datasets/swear_words_2.csv', 'r') as f:
        swear_words_2 = f.read().split('\n')
    swear = list(set(swear_words + swear_words_2))
    swear.remove('')
    return swear

swear = get_swear_word_list()

def get_no_of_swear_words(spacy_obj, swear=swear):
    c = 0
    lowercase_tweet = [x.text.lower() for x in spacy_obj]
    for t in lowercase_tweet:
        if t in swear:
            c += 1
    return c

In [184]:
nlp('ad')[0].is_alpha

True

In [190]:
# Embedding features

def get_embedding_features(spacy_obj):
    joy_vec = nlp('joy')[0]
    sad_vec = nlp('sad')[0]
    anger_vec = nlp('anger')[0]
    fear_vec = nlp('fear')[0]
    
    embed_words = [x for x in spacy_obj if x not in stop_words and x.is_alpha]
    
    avg_joy_similarity = float(np.mean([x.similarity(joy_vec) for x in embed_words]))
    avg_sad_similarity = float(np.mean([x.similarity(sad_vec) for x in embed_words]))
    avg_anger_similarity = float(np.mean([x.similarity(anger_vec) for x in embed_words]))
    avg_fear_similarity = float(np.mean([x.similarity(fear_vec) for x in embed_words]))
    
    max_joy_similarity = float(np.max([x.similarity(joy_vec) for x in embed_words]))
    max_sad_similarity = float(np.max([x.similarity(sad_vec) for x in embed_words]))
    max_anger_similarity = float(np.max([x.similarity(anger_vec) for x in embed_words]))
    max_fear_similarity = float(np.max([x.similarity(fear_vec) for x in embed_words]))
    
    return [avg_joy_similarity, avg_sad_similarity, avg_anger_similarity, avg_fear_similarity,
            max_joy_similarity, max_sad_similarity, max_anger_similarity, max_fear_similarity]

In [191]:
get_embedding_features(spacy_obj)

[0.3634510636329651,
 0.3534475862979889,
 0.3017246723175049,
 0.38022226095199585,
 0.570914626121521,
 0.5941110253334045,
 0.434806227684021,
 0.5261985659599304]