## Libraries

In [1]:
import re

import pandas as pd
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import emoji
import unidecode
import spacy


pd.set_option('max_colwidth', 500)

## Read data

In [2]:
# Location of the combined SemEval-2017 emotion training CSV (relative path,
# resolved against the kernel's working directory).
train_path = '../datasets/emotion_detection_semeval2017/combined/train.csv'

In [3]:
# Load the training set into a DataFrame.
# NOTE(review): the preview output shows an `Unnamed: 0` column, which looks
# like a saved index -- consider `index_col=0` if nothing downstream needs it.
train = pd.read_csv(train_path)

In [4]:
# Preview 10 random rows. Sampling n rows directly avoids shuffling the whole
# frame just to keep the first 10, and the fixed seed keeps the preview stable
# across Restart & Run All. `min` keeps this working on frames with < 10 rows.
train.sample(n=min(10, len(train)), random_state=42)

Unnamed: 0,id,tweet,emotion,intensity
3650,10894,i live and die for mchanzo honeymoon crashing and burning the second they move in together,anger,0.479
2318,30675,All the fans wanted Man Utd at home in the next round...\nAre you cheering for Northampton or Man Utd right now?\n#lufc,joy,0.292
1808,30165,A joyous first webiversary/web mitzvah to Smithsonian's @WeiPoints!! @brianwolly @jackie_mansky @bethpylieberman @bilbo @mazeltov,joy,0.667
3494,21028,"@PanicAtTheDisco hey, y'all announced it like immediately after I asked. Nice. Thanks y'all",fear,0.25
1533,40676,Do not be discouraged by a slowing sales market. This will test your business model and pinpoint #strengths and #weaknesses.' @Ken_Dunn,sadness,0.271
3675,10919,"Why to have vanity sizes?Now sizes S,XS(evenXXS sometimes) are too big, WTF?! Dear corporate jerks, Lithuania didn't need this. #rant #angry",anger,0.708
2833,20367,@BaileyDemented @hsmitty3 ill kill u if u bully her 😤😤😤,fear,0.583
3512,21046,Not sure that men can handle a woman that's got her crap together. #independent,fear,0.229
2186,30543,lol! no mention of pak PM or even his speech on any international news channel and pakis are rejoicing as if the world stands with them,joy,0.396
1278,40421,"@DxfyingGrxvity - that were rather forlorn, scanning the witches house before resting back on Elphie. 'The Grimmerie is gone.'",sadness,0.458


In [5]:
# Hand-picked example containing a contraction, two emoji and two hashtags;
# used below to smoke-test the preprocessing pipeline.
sample_tweet = "The moment you bring her to meet your best friend and you're nervous af! 😬😆 #nervous #thefriendtest"

## Preprocessing

In [6]:
# Initialize Ekphrasis parsers

# Main tweet pre-processor, used as the default `preprocessor` of
# preprocess_raw_tweet.
text_processor = TextPreProcessor(
    # terms that will be normalized
    # ('url' was listed twice in the original; the duplicate is dropped)
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
               'time', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "elongated", "repeated",
              'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens

    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="twitter",  # Options are 'twitter' or 'english'

    # corpus from which the word statistics are going to be used
    # for spell correction
    corrector="twitter",

    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words

    # select a tokenizer. You can use SocialTokenizer, or pass your own.
    # The tokenizer should take a string as input and return a list of tokens.
    tokenizer=SocialTokenizer(lowercase=False).tokenize,

    # list of dictionaries, for replacing tokens extracted from the text
    # with other expressions. You can pass more than one dictionary.
    dicts=[emoticons]
)

# Second pre-processor that omits e-mails, phones, handles, times, URLs, dates
# and hashtags, and tokenizes on whitespace.
# NOTE(review): no cell in view uses this object; preprocess_raw_tweet has a
# parameter of the same name, but that parameter defaults to the spaCy
# pipeline `nlp`, not to this instance.
# Omitting 'hashtag' here is what produces the "You can't omit/backoff and
# unpack hashtags!" message in this cell's output.
spacy_preprocessor = TextPreProcessor(
    omit=['email', 'phone', 'user', 'time', 'url', 'date', 'hashtag'],
    corrector='twitter',
    segmenter='twitter',
    tokenizer=lambda x: x.split(),
    annotate=[]
)

  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Reading twitter - 1grams ...
Reading twitter - 2grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


Reading twitter - 1grams ...
Reading twitter - 1grams ...
You can't omit/backoff and unpack hashtags!
 unpack_hashtags will be set to False


In [7]:
# Initialize the spaCy pipeline (large English model). It is the default
# `spacy_preprocessor` of preprocess_raw_tweet and also builds the one-word
# docs used in the similarity cells further down.
nlp = spacy.load('en_core_web_lg')

In [39]:
def _demojize_tokens(parsed):
    """Replace emoji tokens with the words of their emoji names, minus 'face'.

    Tokens without emoji are passed through untouched. When the token list
    contains no emoji at all, the input list itself is returned.
    """
    if emoji.emoji_count(' '.join(parsed)) == 0:
        return parsed

    emoji_parse = []
    for tok in parsed:
        if emoji.emoji_count(tok) > 0:
            # ':grimacing_face:' -> ['grimacing', 'face'].  Splitting on ':' as
            # well as '_' keeps tokens that hold several emoji (or emoji glued
            # to text) from producing fragments such as 'face::grinning'.
            words = [w for w in re.split(r'[:_]+', emoji.demojize(tok)) if w]
            # Filter rather than list.remove('face'): remove() raises
            # ValueError for emoji whose name has no 'face' part.
            emoji_parse.extend(w for w in words if w != 'face')
        else:
            emoji_parse.append(tok)
    return emoji_parse


def _strip_accents(parsed):
    """Transliterate every token to plain ASCII with unidecode."""
    return [unidecode.unidecode(tok) for tok in parsed]


def _clean_tweet_for_spacy(twt):
    """Strip hashtags and emoji from the raw tweet text.

    Returns ``(text, emoji_count)`` where ``emoji_count`` is the number of
    emoji characters that were removed.

    NOTE(review): user handles and accents are NOT removed here, although the
    original comment said they were -- confirm whether they should be.
    """
    without_hashtags = re.sub(r'#\w+', ' ', twt)

    kept_chars = []
    emoji_count = 0
    for char in without_hashtags:
        # emoji.emoji_count is already relied on by the demojize step; the
        # emoji.UNICODE_EMOJI table used before does not have the same layout
        # in every release of the emoji package.
        if emoji.emoji_count(char) > 0:
            emoji_count += 1
        else:
            kept_chars.append(char)

    # Collapse runs of whitespace to ONE space.  Replacing them with '' (as the
    # original did) glued together the words on either side of a removed
    # hashtag or emoji.  strip() drops the padding left at either end.
    text = re.sub(r'\s\s+', ' ', ''.join(kept_chars)).strip()
    return text, emoji_count


def preprocess_raw_tweet(twt, preprocessor=text_processor, spacy_preprocessor=nlp):
    """Run one raw tweet through the Ekphrasis and spaCy pipelines.

    Parameters
    ----------
    twt : str
        Raw tweet text.
    preprocessor
        Object exposing ``pre_process_doc(text)``; defaults to the
        module-level Ekphrasis ``text_processor``.
    spacy_preprocessor
        Callable applied to the cleaned text; defaults to the spaCy pipeline
        ``nlp``.  (The original accepted this argument but always called the
        global ``nlp``.)

    Returns
    -------
    tuple
        ``(tokens, spacy_doc, emoji_count)``:
        the Ekphrasis tokens with emoji replaced by words and accents
        transliterated, the result of ``spacy_preprocessor`` on the tweet with
        hashtags and emoji removed, and the number of emoji characters removed.
    """
    preprocessed_tweet = _strip_accents(
        _demojize_tokens(preprocessor.pre_process_doc(twt))
    )
    spacy_raw_text, emoji_count = _clean_tweet_for_spacy(twt)
    spacy_preprocessed_doc = spacy_preprocessor(spacy_raw_text)
    return preprocessed_tweet, spacy_preprocessed_doc, emoji_count

In [40]:
# Smoke-test the pipeline on the sample tweet: prints the raw text, then the
# (tokens, spaCy doc, emoji count) tuple returned by preprocess_raw_tweet.
print(sample_tweet)
print(preprocess_raw_tweet(sample_tweet))

The moment you bring her to meet your best friend and you're nervous af! 😬😆 #nervous #thefriendtest
(['The', 'moment', 'you', 'bring', 'her', 'to', 'meet', 'your', 'best', 'friend', 'and', 'you', 'are', 'nervous', 'af', '!', 'grimacing', 'grinning', 'squinting', '<hashtag>', 'nervous', '</hashtag>', '<hashtag>', 'the', 'friend', 'test', '</hashtag>'], The moment you bring her to meet your best friend and you're nervous af!, 2)


## Feature engineering

In [10]:
def get_swear_word_list(comma_separated_path='../datasets/swear_words.csv',
                        line_separated_path='../datasets/swear_words_2.csv'):
    """Load and merge the two swear-word files into one de-duplicated list.

    Parameters
    ----------
    comma_separated_path : str
        File whose entries are separated by commas.
    line_separated_path : str
        File with one entry per line.

    Returns
    -------
    list of str
        Unique entries from both files, stripped of surrounding whitespace,
        with blank entries dropped, in sorted order.
    """
    with open(comma_separated_path, 'r') as f:
        swear_words = f.read().split(',')

    with open(line_separated_path, 'r') as f:
        swear_words_2 = f.read().split('\n')

    # strip() removes the newline / padding that splitting leaves on entries
    # (e.g. the last item of the comma-separated file).
    cleaned = {word.strip() for word in swear_words + swear_words_2}
    # discard() instead of list.remove(''): remove() raises ValueError when
    # neither file happens to produce an empty entry.
    cleaned.discard('')
    # Sorted so the result is deterministic across runs.
    return sorted(cleaned)

# Load the swear-word list once; get_no_of_swear_words binds this object as
# its default `swear` argument when it is defined.
swear = get_swear_word_list()

def get_no_of_swear_words(twt, swear=swear):
    """Count the tokens of a tweet that appear in the swear-word collection.

    Parameters
    ----------
    twt : iterable of str
        Tokens of the tweet.  Each token is lower-cased before lookup.
        (Passing a plain string would iterate over single characters.)
    swear : collection of str
        Words to match against; defaults to the module-level ``swear`` list
        as it exists when this function is defined.  Entries are matched
        as-is, so entries containing upper-case letters never match.

    Returns
    -------
    tuple
        ``(twt, count)`` -- the input passed back unchanged, and the number of
        matching tokens.
    """
    # Hash lookup instead of scanning the whole list once per token.
    swear_lookup = swear if isinstance(swear, (set, frozenset)) else set(swear)
    count = sum(1 for tok in twt if tok.lower() in swear_lookup)
    return twt, count

In [11]:
# preprocess_raw_tweet returns (tokens, spaCy doc, emoji count); unpack all
# three -- unpacking into two names raises ValueError on a fresh kernel run.
ekphrasis_parse, spacy_obj, emoji_count = preprocess_raw_tweet(sample_tweet)

In [12]:
# First token of the parsed sample tweet.
# NOTE(review): `x` is not used by any later cell in view -- candidate for removal.
x = spacy_obj[0]

In [32]:
# Token at index 13 of the sample doc -- presumably 'nervous', assuming spaCy
# splits "you're" into two tokens; TODO confirm.
n = spacy_obj[13]
# One-word docs used as comparison targets in the similarity cells below.
b = nlp('best')
a = nlp('restless')

In [33]:
# spaCy similarity score between the sample-tweet token and the doc for 'best'.
n.similarity(b)

0.20428438968784338

In [34]:
# spaCy similarity score between the sample-tweet token and the doc for 'restless'.
n.similarity(a)

0.5168664023232394