In [1]:
%%capture

import pandas as pd
import re
import pickle
import csv

In [2]:
%%capture

!pip install ekphrasis
from ekphrasis.classes.segmenter import Segmenter

# to leverage word statistics from Twitter
seg_tw = Segmenter(corpus = "twitter")

In [3]:
%%capture

!pip install tweet-preprocessor
import preprocessor as tweet_proc

In [4]:
%%capture
# Reference: https://github.com/NeelShah18/emot

!pip install emot --upgrade
import emot

emot_obj = emot.core.emot()

In [5]:
def make_list(proc_obj):
    """Collect the ``match`` attribute of every item in ``proc_obj``.

    Args:
        proc_obj: An iterable of objects exposing a ``match`` attribute,
            or ``None``.

    Returns:
        list: The ``match`` values in iteration order, or an empty list
        when ``proc_obj`` is ``None``.
    """
    # Identity test: `== None` would be routed through a custom __eq__ and
    # could misclassify a real container as "missing".
    if proc_obj is None:
        return []

    return [unit.match for unit in proc_obj]

def emotext(text, emoji_map=None):
    """Replace each emoji found in ``text`` with an underscore-joined label.

    The label is the mapped description with commas and colons removed and
    its whitespace-separated words joined by underscores.

    Args:
        text: The string to rewrite.
        emoji_map: Optional mapping of emoji string -> description string.
            When omitted, the global ``UNICODE_EMO`` is used.
            NOTE(review): ``UNICODE_EMO`` is not defined in any visible cell,
            so calling without ``emoji_map`` raises ``NameError`` unless it
            is defined elsewhere -- confirm.

    Returns:
        str: ``text`` with every occurrence of each mapped emoji replaced.
    """
    if emoji_map is None:
        emoji_map = UNICODE_EMO

    # The loop variable is intentionally not named `emot`, which would shadow
    # the imported `emot` module inside this function.
    for symbol, description in emoji_map.items():
        label = "_".join(description.replace(",", "").replace(":", "").split())
        text = text.replace(symbol, label)
    return text

In [6]:
# Initializing Lists
# These module-level containers are filled by the CSV-reading loop further
# down, which appends one entry to each list per data row.

datapoints_count = 0

# Raw, cleaned and tokenized tweet text
tweets, cleaned_tweet_texts, tokenized_tweets = [], [], []

# Entities parsed out of each tweet
hashtags, smileys, emojis = [], [], []
urls, mentions, numbers, reserveds = [], [], [], []

# Labels and identifiers
task_1_labels, task_2_labels = [], []
tweet_ids, hasoc_ID = [], []

In [7]:
def strip_list(listie):
    """Return a new list with ``.strip()`` applied to every item of ``listie``."""
    return [entry.strip() for entry in listie]

In [8]:
file_name = "./english.csv"

# NOTE: this cell appends to module-level lists, so running it a second time
# without first re-running the list-initialisation cell duplicates every row.

# Read-only mode is sufficient ('r+' needlessly required write permission),
# and the `with` block closes the handle even if an assertion fails mid-file.
with open(file_name, 'r', encoding = 'utf-8') as file:
    file_reader = csv.reader(file, delimiter = ',')

    # The first row is the column header: check its width, then skip it.
    header = next(file_reader, None)
    assert header is None or len(header) == 5

    for line in file_reader:
        # Every row must have exactly five columns, used below as:
        # tweet id, tweet text, task 1 label, task 2 label, HASOC id.
        assert len(line) == 5

        datapoints_count += 1

        # Flatten embedded newlines once and reuse the result everywhere.
        tweet_text = line[1].replace('\n', ' ')

        tweet_ids.append(line[0])
        task_1_labels.append(line[2])
        task_2_labels.append(line[3])
        hasoc_ID.append(line[4])
        tweets.append(tweet_text)

        parse_obj = tweet_proc.parse(tweet_text)

        tokenized_tweets.append(tweet_proc.tokenize(tweet_text))

        cleaned_tweet_texts.append(tweet_proc.clean(tweet_text))

        # Each parsed attribute is either None or a list of match objects;
        # make_list normalises it to a list of strings, strip_list trims them.
        hashtags.append(strip_list(make_list(parse_obj.hashtags)))
        smileys.append(strip_list(make_list(parse_obj.smileys)))
        emojis.append(strip_list(make_list(parse_obj.emojis)))

        urls.append(strip_list(make_list(parse_obj.urls)))

        mentions.append(strip_list(make_list(parse_obj.mentions)))

        numbers.append(strip_list(make_list(parse_obj.numbers)))

        reserveds.append(strip_list(make_list(parse_obj.reserved)))

print("Number of Datapoints: " + str(datapoints_count))

Number of Datapoints: 3708


In [9]:
# Viewing Created Dataset
display_size = 3
start = 100

# Window of rows to preview; the same slice is applied to every list.
preview = slice(start, start + display_size)

# (heading, list) pairs, printed in this order with a blank line after each.
sections = [
    ("Tweets:", tweets),
    ("Cleaned Texts:", cleaned_tweet_texts),
    ("Hashtags:", hashtags),
    ("Smileys:", smileys),
    ("Emojis:", emojis),
    ("Urls:", urls),
    ("Mentions:", mentions),
    ("Numbers:", numbers),
    ("Reserved Words:", reserveds),
]

for heading, rows in sections:
    print(heading)
    print(rows[preview], end = '\n\n')

# Both label lists share one heading and have no trailing blank line.
print("Task Labels:")
print(task_1_labels[preview])
print(task_2_labels[preview])

Tweets:
['RT @laurabranigan: All READY for @StLouisBlues Game 5!!🙂 GO BLUES!!!🙂💙🎶💛🏒 ~ Kathy Golik, Other Half Entertainment #LGB #PlayGloria #LauraBr…', 'RT @shannon49170750: Me staring at my carmex only after applying it 3 minutes ago   me: Don’t do it  Don’t do it  Don’t do it  Don’t do it…', 'someone said I look like max theiriot and now no one can tell me shit about my looks. best compliment ever.']

Cleaned Texts:
[': All READY for Game !! GO BLUES!!! ~ Kathy Golik, Other Half Entertainment', ': Me staring at my carmex only after applying it minutes ago me: Dont do it Dont do it Dont do it Dont do it', 'someone said I look like max theiriot and now no one can tell me shit about my looks. best compliment ever.']

Hashtags:
[['#LGB', '#PlayGloria', '#LauraBr'], [], []]

Smileys:
[[], [], []]

Emojis:
[['🙂', '🙂', '💙', '🎶', '💛', '🏒'], [], []]

Urls:
[[], [], []]

Mentions:
[['@laurabranigan', '@StLouisBlues'], ['@shannon49170750'], []]

Numbers:
[['5'], ['3'], []]

Reserved Words:
[['

### Example

#### Tweet Text
'RT @jeonggukpics: Don’t disturb please, he is enjoying his snacks while making those little dance 😭😂😂😭💜  #BBMAsTopSocial BTS #JUNGKOOK #정국…'

#### Clean Text
': Dont disturb please, he is enjoying his snacks while making those little dance BTS'

#### Emojis
['😭', '😂', '😂', '😭', '💜']

#### Hashtags
['#BBMAsTopSocial', '#JUNGKOOK', '#정국']

In [10]:
# Generating Emoji Texts

def get_meaning(item):
    """Return the first 'mean' entry that ``emot_obj.emoji`` reports for ``item``.

    Leading and trailing colons are stripped from that entry. An empty
    string is returned when the 'mean' list is empty.
    """
    meanings = emot_obj.emoji(item)['mean']
    return meanings[0].strip(':') if len(meanings) != 0 else ''

# Testing: look up one known emoji and print the result
sample_emoji = "😂"
print('get_meaning("😂"): ' + str(get_meaning(sample_emoji)))

get_meaning("😂"): face_with_tears_of_joy


In [11]:
# One list of descriptions per tweet, parallel to `emojis`; underscores in
# each looked-up meaning are turned into spaces.
emoji_texts = [
    [get_meaning(symbol).replace('_', ' ') for symbol in emo_list]
    for emo_list in emojis
]

print("Emoji Descriptions:")
print(emoji_texts[0: 5])

Emoji Descriptions:
[['face with tears of joy', 'face with tears of joy'], [], [], ['face with tears of joy'], []]


In [12]:
# Segmenting Hashtags

# One list of segmented tags per tweet, parallel to `hashtags`.
# tag[1: ] removes the hash symbol before the tag is segmented.
segmented_hashtags = [
    [seg_tw.segment(tag[1: ]) for tag in hashset]
    for hashset in hashtags
]

print("Segmented Hashtags: ")
print(segmented_hashtags[start: start + display_size])

Segmented Hashtags: 
[['lgb', 'play gloria', 'laura br'], [], []]


In [13]:
import json
name = 'en.json'

# Column-oriented export: each key maps to one of the parallel lists built
# above. Key order is kept as-is because it determines the order in the file.
data_dict = {
    'tweet_id': tweet_ids,
    'task_1': task_1_labels,
    'task_2': task_2_labels,
    'hasoc_id': hasoc_ID,
    'full_tweet': tweets,
    'tweet_cleaned_text': cleaned_tweet_texts,
    'hashtags': hashtags,
    'smiley': smileys,
    'emoji': emojis,
    'url': urls,
    'mentions': mentions,
    'numerals': numbers,
    'reserved_word': reserveds,
    'emotext': emoji_texts,
    'segmented_hash': segmented_hashtags,
}

with open(name, 'w+') as f:
    json.dump(data_dict, f)

In [14]:
# That's it