## Load and preview the training data

In [1]:
import pandas as pd

# Show up to 500 characters per cell so that whole tweets stay readable when a
# frame is displayed. The fully-qualified option name is used on purpose: pandas
# resolves shortened names by pattern matching, which can stop working if another
# option with a similar name is ever added.
pd.set_option('display.max_colwidth', 500)

In [2]:
# Location of the training CSV; a relative path, so it is resolved against the
# kernel's working directory.
train_path = (
    '../datasets/emotion_detection_semeval2017/combined/train.csv'
)

In [3]:
# Read the training CSV into a DataFrame.
# NOTE(review): the preview recorded under the next cell shows an `Unnamed: 0`
# column, which usually means the file was written together with its index —
# confirm whether `index_col=0` is wanted here.
train = pd.read_csv(filepath_or_buffer=train_path)

In [4]:
# Preview up to 20 randomly chosen rows. The draw is seeded so the preview is the
# same on every Restart & Run All, and the rows are sampled directly instead of
# shuffling the whole frame and keeping its head. `min` keeps the cell working
# when the frame holds fewer than 20 rows.
train.sample(n=min(20, len(train)), random_state=42)

Unnamed: 0,id,tweet,emotion,intensity
388,10388,Imagine the twitter fume if Corbyn loses the election and then Smith leads Labour to a worse result than suggested under Corbyn.. Imagine??,anger,0.5
1589,40732,I can't mourn Kid Cudi cause we have Travis Scott...,sadness,0.229
3709,40798,@kayleighmcenany @DonaldJTrumpJr Is that really all you can offer for those who sacrifice daily to keep you safe...? @kayleighmcenany #sad,sadness,0.625
3121,20655,if anyone spoils any of my fucking shows I will haunt you in the afterlife so help me god,fear,0.451
859,40002,Feeling worthless as always,sadness,0.958
3346,20880,@Jchawes What is your favorite #horror movie? #ghosthunters,fear,0.342
2231,30588,"The point of living, and being an optimist, is to be foolish enough to believe the best is yet to come' - Peter Ustinov #optimism #quote",joy,0.36
59,10059,@JrDingy96 Ikr people still got a grudge against him for no reason like wtf?!,anger,0.75
782,10782,@shopgreenwich #ldf16 what shall we do this weekend? #spraypainting in #greenwichmarket with @SNUB23 #core246 #lilylou #fret &amp; #benoakley,anger,0.271
3191,20725,"@mdthib This is so lovely! Or I am frightened, not sure which! But, wow!",fear,0.417


## Ekphrasis parsing

In [6]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

# Pre-processing pipeline applied to every tweet: normalisation, annotation,
# hashtag segmentation, contraction unpacking, tokenisation and emoticon mapping.
text_processor = TextPreProcessor(
    # terms that will be normalized, i.e. replaced by a tag such as <url> or <user>;
    # every term is listed exactly once
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
               'time', 'date', 'number'],

    # terms that will be annotated with tags such as <hashtag>, <allcaps>, <elongated>
    annotate={'hashtag', 'allcaps', 'elongated', 'repeated',
              'emphasis', 'censored'},

    fix_html=True,  # fix HTML tokens

    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="twitter",

    # corpus from which the word statistics are going to be used
    # for spell correction
    corrector="twitter",

    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # unpack contractions (can't -> can not)
    spell_correct_elong=False,  # no spell correction for elongated words

    # select a tokenizer. You can use SocialTokenizer, or pass your own;
    # the tokenizer should take a string as input and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,

    # list of dictionaries used to replace tokens extracted from the text
    # with other expressions. More than one dictionary can be passed.
    dicts=[emoticons],
)

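# Smoke test, currently disabled. The output recorded under this cell matches these
# three sentences, so it was produced while the lines below were still uncommented.
# sentences = [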
#     "CANT WAIT for the new season of #TwinPeaks ＼(^o^)／!!! #davidlynch #tvseries :)))",
#     "I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies :/",
#     "@SentimentSymp:  can't wait for the Nov 9 #Sentiment talks!  YAAAAAAY !!! :-D http://sentimentsymposium.com/."
# ]

# for s in sentences:
#     print(" ".join(text_processor.pre_process_doc(s)))


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...
<allcaps> cant wait </allcaps> for the new season of <hashtag> twin peaks </hashtag> ＼(^o^)／ ! <repeated> <hashtag> david lynch </hashtag> <hashtag> tv series </hashtag> <happy>
i saw the new <hashtag> john doe </hashtag> movie and it sucks <elongated> ! <repeated> <allcaps> waisted </allcaps> <money> . <repeated> <hashtag> bad movies </hashtag> <annoyed>
<user> : can not wait for the <date> <hashtag> sentiment </hashtag> talks ! <allcaps> yay <elongated> </allcaps> ! <repeated> <laugh> <url>


In [7]:
# Before/after check: print each of the first five tweets followed by the token
# list that the ekphrasis pipeline produces for it.
preview_tweets = train['tweet'].head().tolist()
for raw_tweet in preview_tweets:
    tokens = text_processor.pre_process_doc(raw_tweet)
    print(raw_tweet, '\n', tokens, '\n\n\n')

How the fu*k! Who the heck! moved my fridge!... should I knock the landlord door. #angry #mad ## 
 ['how', 'the', 'fu*k', '<censored>', '!', 'who', 'the', 'heck', '!', 'moved', 'my', 'fridge', '.', '!', '<repeated>', 'should', 'i', 'knock', 'the', 'landlord', 'door', '.', '<hashtag>', 'angry', '</hashtag>', '<hashtag>', 'mad', '</hashtag>', '#', '#'] 



So my Indian Uber driver just called someone the N word. If I wasn't in a moving vehicle I'd have jumped out #disgusted  
 ['so', 'my', 'indian', 'uber', 'driver', 'just', 'called', 'someone', 'the', 'n', 'word', '.', 'if', 'i', 'wasn', "'", 't', 'in', 'a', 'moving', 'vehicle', 'i', "'", 'd', 'have', 'jumped', 'out', '<hashtag>', 'disgusted', '</hashtag>'] 



@DPD_UK I asked for my parcel to be delivered to a pick up store not my address #fuming #poorcustomerservice 
 ['<user>', 'i', 'asked', 'for', 'my', 'parcel', 'to', 'be', 'delivered', 'to', 'a', 'pick', 'up', 'store', 'not', 'my', 'address', '<hashtag>', 'fuming', '</hashtag>', '