## Import important libraries

In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
import pickle
import copy
import re
import numpy as np

#For creating directories
import os

import emoji
from collections import Counter

## Create train, dev, test dictionaries

In [2]:
def TrainTest(input_directory):
    """Map every profile ID to the partition it belongs to.

    Reads ``train_ids.txt``, ``dev_ids.txt`` and ``test_ids.txt`` from
    ``input_directory``; each line of those files holds one profile ID.

    Parameters
    ----------
    input_directory : str
        Directory prefix that is concatenated directly with the file names,
        so it has to end with a path separator.

    Returns
    -------
    dict
        ``{profile_id: 'train' | 'dev' | 'test'}``.

    Raises
    ------
    OSError
        If one of the three ID files cannot be opened.
    """
    roleDict = {}
    # The files are read in this order, so an ID that is listed in more than
    # one file ends up with the role of the last file that mentions it.
    for role in ('train', 'dev', 'test'):
        with open(input_directory + role + "_ids.txt", "rt") as fid:
            for line in fid:
                # Keep only ASCII letters and digits; this also removes the
                # line terminator and any stray whitespace or punctuation.
                profile_id = re.sub('[^0-9a-zA-Z]+', '', line)
                # A blank (or punctuation-only) line carries no ID and must
                # not register the empty string as a profile.
                if profile_id:
                    roleDict[profile_id] = role
    return roleDict

## Create an empty emoji-count dictionary with an entry for every emoji

In [3]:
# Zero-initialised counter template with one entry per key of
# emoji.UNICODE_EMOJI, plus two code points that are tracked explicitly:
#   chr(127998) = U+1F3FE, EMOJI MODIFIER FITZPATRICK TYPE-5 (a skin-tone modifier)
#   chr(8205)   = U+200D,  ZERO WIDTH JOINER (joins several emojis into one glyph)

empty_emoji_count = dict.fromkeys(emoji.UNICODE_EMOJI, 0)
empty_emoji_count.update({chr(127998): 0, chr(8205): 0})
print(empty_emoji_count)
print(len(empty_emoji_count))

{'🥇': 0, '🥈': 0, '🥉': 0, '🆎': 0, '🏧': 0, '🅰': 0, '🇦🇫': 0, '🇦🇱': 0, '🇩🇿': 0, '🇦🇸': 0, '🇦🇩': 0, '🇦🇴': 0, '🇦🇮': 0, '🇦🇶': 0, '🇦🇬': 0, '♒': 0, '🇦🇷': 0, '♈': 0, '🇦🇲': 0, '🇦🇼': 0, '🇦🇨': 0, '🇦🇺': 0, '🇦🇹': 0, '🇦🇿': 0, '🔙': 0, '🅱': 0, '🇧🇸': 0, '🇧🇭': 0, '🇧🇩': 0, '🇧🇧': 0, '🇧🇾': 0, '🇧🇪': 0, '🇧🇿': 0, '🇧🇯': 0, '🇧🇲': 0, '🇧🇹': 0, '🇧🇴': 0, '🇧🇦': 0, '🇧🇼': 0, '🇧🇻': 0, '🇧🇷': 0, '🇮🇴': 0, '🇻🇬': 0, '🇧🇳': 0, '🇧🇬': 0, '🇧🇫': 0, '🇧🇮': 0, '🆑': 0, '🆒': 0, '🇰🇭': 0, '🇨🇲': 0, '🇨🇦': 0, '🇮🇨': 0, '♋': 0, '🇨🇻': 0, '♑': 0, '🇧🇶': 0, '🇰🇾': 0, '🇨🇫': 0, '🇪🇦': 0, '🇹🇩': 0, '🇨🇱': 0, '🇨🇳': 0, '🇨🇽': 0, '🎄': 0, '🇨🇵': 0, '🇨🇨': 0, '🇨🇴': 0, '🇰🇲': 0, '🇨🇬': 0, '🇨🇩': 0, '🇨🇰': 0, '🇨🇷': 0, '🇭🇷': 0, '🇨🇺': 0, '🇨🇼': 0, '🇨🇾': 0, '🇨🇿': 0, '🇨🇮': 0, '🇩🇰': 0, '🇩🇬': 0, '🇩🇯': 0, '🇩🇲': 0, '🇩🇴': 0, '🔚': 0, '🇪🇨': 0, '🇪🇬': 0, '🇸🇻': 0, '🏴\U000e0067\U000e0062\U000e0065\U000e006e\U000e0067\U000e007f': 0, '🇬🇶': 0, '🇪🇷': 0, '🇪🇪': 0, '🇪🇹': 0, '🇪🇺': 0, '🆓': 0, '🇫🇰': 0, '🇫🇴': 0, '🇫🇯': 0, '🇫🇮': 0, '🇫🇷': 0, '🇬🇫': 0, '🇵🇫': 0, '🇹🇫': 0, '🇬🇦': 0, '🇬🇲': 0, '♊': 0, '🇬🇪'

## Define the function for loading twitter profile (i.e. load all 100 tweets from an xml)

In [4]:
def LoadProfile(input_file):
    """Lazily yield one dictionary per ``document`` element of a profile XML.

    Parameters
    ----------
    input_file : str or file object
        Anything accepted by ``xml.etree.ElementTree.parse``.

    Yields
    ------
    dict
        The attributes of the XML root element, overlaid with the attributes
        of the ``document`` element itself, plus the key ``'data'`` holding
        the element's text (``None`` when the element has no text).
    """
    root = ET.parse(input_file).getroot()
    shared_attrs = root.attrib
    for document in root.iter('document'):
        # Later entries win: document attributes override root attributes,
        # and 'data' overrides an attribute of the same name.
        yield {**shared_attrs, **document.attrib, 'data': document.text}

## Analyze tweets from the perspective of emoji usage

In [5]:
def AnalyzeTweet(tweet):
    """Collect emoji-usage statistics for a single tweet.

    The cleaned tweet is scanned one code point at a time. A code point is
    treated as an emoji when it is a key of ``emoji.UNICODE_EMOJI`` or one of
    two extra code points: U+1F3FE (a skin-tone modifier) and U+200D (zero
    width joiner).

    Parameters
    ----------
    tweet : str or None
        Raw tweet text. ``None`` (for example the text of an empty XML
        element) is handled like an empty string.

    Returns
    -------
    tuple
        ``(tweet, emoji_count, consecutive_same, longest_consecutive_same,
        just_emojis, emo_multi_cons, emo_multi_ncons)`` where

        * ``tweet`` is the cleaned text that was actually scanned,
        * ``emoji_count`` maps each emoji code point found to its count,
        * ``consecutive_same`` is the number of runs in which the same emoji
          is repeated back to back,
        * ``longest_consecutive_same`` is the length of the longest such run
          (0 when there is none),
        * ``just_emojis`` is True when the cleaned text contains no non-emoji
          character (this includes an empty text),
        * ``emo_multi_cons`` is True when some emoji directly follows another
          emoji,
        * ``emo_multi_ncons`` is True when an emoji appears after earlier
          emojis with at least one non-emoji character in between.
    """
    if tweet is None:
        tweet = ''
    # Remove whitespace only, so that emojis separated by blanks count as
    # adjacent. Punctuation such as commas stays in the text and therefore
    # still separates emojis and still counts as non-emoji content.
    tweet = re.sub(r'\s+', '', tweet)
    # Drop invisible/modifying code points that would otherwise sit between
    # two emojis: U+FE0F (variation selector-16) and the skin-tone modifiers
    # U+1F3FD and U+1F3FB.
    # NOTE(review): the remaining skin-tone modifiers U+1F3FC and U+1F3FF are
    # neither stripped here nor added to the emoji test below - confirm that
    # this asymmetry is intended.
    for code_point in (65039, 127997, 127995):
        tweet = tweet.replace(chr(code_point), '')

    extra_emoji = (chr(127998), chr(8205))
    emoji_count = dict()
    consecutive_same = 0
    longest_consecutive_same = 0
    just_emojis = True
    emo_multi_cons = False
    emo_multi_ncons = False
    # State of the scan: whether the previous character was an emoji, which
    # one it was, and how long the current run of that identical emoji is.
    prev_was_emoji = False
    prev_emoji = None
    same_run_length = 0

    for ch in tweet:
        if ch in emoji.UNICODE_EMOJI or ch in extra_emoji:
            if prev_was_emoji:
                emo_multi_cons = True
                if ch == prev_emoji:
                    same_run_length += 1
                    # A run is counted once, at the moment it reaches two.
                    if same_run_length == 2:
                        consecutive_same += 1
                    longest_consecutive_same = max(longest_consecutive_same,
                                                   same_run_length)
                else:
                    same_run_length = 1
            else:
                # Not adjacent to an emoji, but emojis were already seen.
                if emoji_count:
                    emo_multi_ncons = True
                same_run_length = 1
            prev_was_emoji = True
            prev_emoji = ch
            emoji_count[ch] = emoji_count.get(ch, 0) + 1
        else:
            just_emojis = False
            prev_was_emoji = False
    return tweet, emoji_count, consecutive_same, longest_consecutive_same, just_emojis, emo_multi_cons, emo_multi_ncons

## Analyze profile from the perspective of emoji usage

In [6]:
def AnalyzeProfile(TweetGenerator, tweets_per_profile=100):
    """Aggregate the per-tweet emoji statistics of one profile.

    Parameters
    ----------
    TweetGenerator : iterable of dict
        Tweets of the profile; the text of each tweet is read from its
        ``'data'`` entry and passed to ``AnalyzeTweet``.
    tweets_per_profile : int, optional
        Number of tweets a profile is expected to hold (default 100). All
        frequencies are expressed relative to this number, and the per-tweet
        emoji counts are stored in an array of this length, so a profile with
        fewer tweets is padded with zero counts.

    Returns
    -------
    tuple
        ``(emo_count_freq, emo_count_avg, emo_count_med, emo_count_max,
        emo_only_freq, emo_only_one_freq, emo_multi_ncons_freq,
        emo_multi_cons_freq, emo_multi_cons_same_freq, emoji_count)``:
        the share of tweets with at least one emoji; mean, median and maximum
        of the per-tweet emoji count; the shares of tweets flagged as
        emoji-only, holding exactly one emoji, holding non-adjacent emojis,
        holding adjacent emojis, and holding a repeated identical emoji; and
        the profile-wide count per emoji (starting from a copy of
        ``empty_emoji_count``).

    Raises
    ------
    IndexError
        If the profile yields more than ``tweets_per_profile`` tweets.
    """
    # Work on a private copy so the module-level template stays all-zero.
    emoji_count = copy.deepcopy(empty_emoji_count)
    # Builtin int is what the removed alias np.int used to stand for.
    emoji_nos = np.zeros(tweets_per_profile, dtype=int)
    tweet_share = 1/tweets_per_profile
    emo_count_freq = 0
    emo_only_freq = 0
    emo_only_one_freq = 0
    emo_multi_cons_freq = 0
    emo_multi_ncons_freq = 0
    emo_multi_cons_same_freq = 0
    for tweet_no_now, tweet in enumerate(TweetGenerator):
        (_, emoji_count_now, consecutive_same, _, just_emojis,
         emo_multi_cons, emo_multi_ncons) = AnalyzeTweet(tweet['data'])
        emoji_no = sum(emoji_count_now.values())
        emoji_nos[tweet_no_now] = emoji_no
        if emoji_no > 0:
            emo_count_freq += tweet_share
        # NOTE(review): a tweet that is empty after cleaning is also reported
        # as just_emojis and is therefore counted here - confirm this is wanted.
        if just_emojis:
            emo_only_freq += tweet_share
        if emoji_no == 1:
            emo_only_one_freq += tweet_share
        if emo_multi_cons:
            emo_multi_cons_freq += tweet_share
        if emo_multi_ncons:
            emo_multi_ncons_freq += tweet_share
        if consecutive_same > 0:
            emo_multi_cons_same_freq += tweet_share
        # Merge this tweet's counts in place; emojis missing from the
        # template are appended as new keys.
        for symbol, hits in emoji_count_now.items():
            emoji_count[symbol] = emoji_count.get(symbol, 0) + hits
    emo_count_avg = np.average(emoji_nos)
    emo_count_med = np.median(emoji_nos)
    emo_count_max = np.max(emoji_nos)
    return emo_count_freq, emo_count_avg, emo_count_med, emo_count_max, emo_only_freq, emo_only_one_freq, emo_multi_ncons_freq, emo_multi_cons_freq, emo_multi_cons_same_freq, emoji_count

## Main function: read the Twitter data stored in input_directory and analyze the emojis in its tweets

In [7]:
def ProcessFolder(root_directory, input_directory, partition_directory, output_directory):
    """Extract emoji features for every profile and write them per partition.

    Parameters
    ----------
    root_directory : str
        Base directory; the other three arguments are sub-paths of it.
    input_directory : str
        Sub-path holding ``truth.txt`` (fields separated by ``:::``: profile
        ID followed by two label fields) and one ``<profile ID>.xml`` file
        per profile.
    partition_directory : str
        Sub-path holding the ID lists read by ``TrainTest``.
    output_directory : str
        Sub-path that receives ``train_features.txt``, ``dev_features.txt``
        and ``test_features.txt``. It is created when missing, and the three
        files are emptied at the start of every call.

    Returns
    -------
    None

    Notes
    -----
    One line is written per profile: the profile ID, the nine scalar
    features returned by ``AnalyzeProfile``, the normalised emoji counts and
    ``label1:label2``, joined by commas. The normalised counts are written as
    the ``str()`` of a Python list, so that field itself contains commas and
    square brackets.

    Raises
    ------
    KeyError
        If a profile ID from ``truth.txt`` is in none of the partition lists.
    """
    #Create train, dev, test dictionaries
    roleDict = TrainTest(root_directory+'/'+partition_directory+'/')
    input_path = root_directory+'/'+input_directory
    output_path = root_directory+'/'+output_directory
    #Create output directory (if it does not exist yet)
    os.makedirs(output_path, exist_ok=True)
    #Create (or empty) the feature files, because rows are appended below
    for role in ('train', 'dev', 'test'):
        with open(output_path+'/'+role+"_features.txt", "wt"):
            pass
    #Read labels (and file names)
    Truth = pd.read_csv(input_path+'/truth.txt', sep=":::", header=None, engine='python')
    #Iterate over all user names, and process the corresponding file names
    for i in range(Truth.shape[0]):
        profile_id = Truth[0][i]
        TweetGenerator = LoadProfile(input_path+'/'+profile_id+'.xml')
        (emo_count_freq, emo_count_avg, emo_count_med, emo_count_max,
         emo_only_freq, emo_only_one_freq, emo_multi_ncons_freq,
         emo_multi_cons_freq, emo_multi_cons_same_freq,
         emoji_count) = AnalyzeProfile(TweetGenerator)
        #Normalize emoji frequency (if necessary); the total is computed once
        total = sum(emoji_count.values())
        if total > 0:
            dict_values_norm = [x / float(total) for x in emoji_count.values()]
        else:
            #No emoji at all: keep the zeros, only convert them to float
            dict_values_norm = [float(x) for x in emoji_count.values()]
        fields = [
            profile_id,
            str(emo_count_freq),
            str(emo_count_avg),
            str(emo_count_med),
            str(emo_count_max),
            str(emo_only_freq),
            str(emo_only_one_freq),
            str(emo_multi_ncons_freq),
            str(emo_multi_cons_freq),
            str(emo_multi_cons_same_freq),
            str(dict_values_norm),
            Truth[1][i]+':'+Truth[2][i],
        ]
        with open(output_path+'/'+roleDict[profile_id]+"_features.txt","a") as fid:
            fid.write(','.join(fields)+'\n')

In [34]:
# Run the feature extraction for the English training data.
# NOTE(review): root_directory is an absolute, machine-specific path - adjust
# it before running this cell on another machine.
ProcessFolder(
    root_directory='C:/Users/gyokov-local/Documents/Courses/NeuralNets/Project/Twitter/Data',
    input_directory='pan19-author-profiling-training-2019-01-28/en',
    partition_directory='en_features',
    output_directory='en_emoji_features',
)