## Import important libraries

In [2]:
import xml.etree.ElementTree as ET
import pandas as pd
import spacy
spacy_nlp = spacy.load('en_core_web_sm')
from wordsegment import load, segment
load()
from string import punctuation
import re
import codecs
from html.parser import HTMLParser

#For creating directories
import os

#For emojis
import emoji


## Define the function for loading a Twitter profile (i.e. load all 100 tweets from one XML file)

In [3]:
def LoadProfile(input_file):
    """Yield one dictionary per tweet stored in a profile XML file.

    Args:
        input_file: File name or open file object accepted by
            ``xml.etree.ElementTree.parse``.

    Yields:
        dict: The attributes of the XML root element, overlaid with the
        attributes of the current ``<document>`` element, plus a ``'data'``
        key holding the text of that element (``''`` when the element is
        empty).
    """
    profile = ET.parse(input_file)
    profile_attributes = profile.getroot().attrib
    for document in profile.iter('document'):
        # Build a fresh dict per tweet so the root attributes are never mutated.
        tweet_dict = {**profile_attributes, **document.attrib}
        # ElementTree reports the text of an empty element as None; yield an
        # empty string instead so string/regex based cleaning does not raise
        # a TypeError on such tweets.
        tweet_dict['data'] = document.text or ''
        yield tweet_dict

## Remove links from the tweet

In [4]:
def RemoveURL(tweet):
    """Return ``tweet`` with links removed.

    Two kinds of links are stripped, each up to the next whitespace:
    anything starting with ``http://`` or ``https://``, and anything
    starting with ``www.`` at a word boundary.

    Args:
        tweet: Text of a single tweet.

    Returns:
        str: The tweet without links. Surrounding whitespace is left as is.
    """
    # Raw strings avoid invalid escape sequences such as '\/' and '\s'.
    # '\S*' stops at the first whitespace, so a bare "http:// " no longer
    # swallows the word that follows it.
    tweet = re.sub(r'https?://\S*', '', tweet)
    # Some URLs still start with www. The word boundary keeps ordinary words
    # that merely contain "www." (e.g. "awww.cute") intact.
    tweet = re.sub(r'\bwww\.\S*', '', tweet)
    return tweet

## Remove Emojis

In [5]:
#It would not be necessary to have these functions as standalone,
#But I figure that if we do it like so, we can easily amend them with feature extraction
def RemoveEmoji(tweet):
    """Return ``tweet`` with all emojis removed.

    The ``emoji`` package changed its public API between major versions, so
    the function picks whichever interface the installed version offers.

    Args:
        tweet: Text of a single tweet.

    Returns:
        str: The tweet without emoji characters.
    """
    # Newer emoji releases ship replace_emoji(), which also removes
    # multi-code-point emojis (flags, joined sequences) as one unit.
    replace_emoji = getattr(emoji, 'replace_emoji', None)
    if replace_emoji is not None:
        return replace_emoji(tweet, replace='')
    # Older releases expose UNICODE_EMOJI: a flat {emoji: name} dict in 0.x,
    # and a dict keyed by language code in 1.x. Looking up 'en' and falling
    # back to the table itself covers both layouts.
    emoji_table = emoji.UNICODE_EMOJI
    emoji_table = emoji_table.get('en', emoji_table)
    return ''.join(char for char in tweet if char not in emoji_table)

## Repair character errors: decode leftover HTML entities (&amp; and whatever else I find) and normalise typographic quotes and ellipses

In [6]:
def RemoveThrash(tweet):
    """Decode a few HTML entities and normalise typographic characters.

    Args:
        tweet: Text of a single tweet.

    Returns:
        str: The tweet with ``&amp;``, ``&#39;`` and ``&quot;`` decoded, the
        ellipsis character turned into a space, and curly quotes turned into
        their plain ASCII counterparts.
    """
    # Entities are decoded one after another, '&amp;' first, so a
    # double-escaped entity such as '&amp;quot;' ends up fully decoded.
    entity_replacements = (
        ('&amp;', '&'),
        ('&#39;', "'"),
        ('&quot;', '"'),
    )
    for entity, plain in entity_replacements:
        tweet = tweet.replace(entity, plain)
    # The last entry is the typographic apostrophe: it looks like an
    # apostrophe and is used like one, but it is a different character.
    typographic_map = str.maketrans({
        '…': ' ',
        '“': '"',
        '”': '"',
        '’': "'",
    })
    return tweet.translate(typographic_map)

## Remove @ symbols from mentions

In [7]:
#This is handled separately from the punctuation, so we can also try and store the mentions
def RemoveAt(tweet):
    """Return ``tweet`` with every ``@`` character deleted.

    Only the symbol is dropped; the mentioned user name stays in the text.
    """
    return tweet.replace('@', '')

## Remove hashtag symbols, and split the hashtags into words

In [8]:
def RemoveHash(tweet):
    """Replace every hashtag in ``tweet`` by its space-separated words.

    Args:
        tweet: Text of a single tweet.

    Returns:
        str: The tweet without ``#`` characters. Each hashtag is replaced by
        the output of ``segment`` joined with single spaces.
    """
    def _split_hashtag(match):
        # Hand the whole hashtag (including '#') to the word segmenter and
        # glue the pieces it returns back together with spaces.
        return ' '.join(segment(match.group(0)))

    # A '#' directly followed by whitespace is not a real hashtag:
    # collapse it, together with that whitespace, into one space.
    without_stray_hashes = re.sub(r'#[\s]+', ' ', tweet)
    # Every remaining '#...' run (up to the next whitespace) is segmented.
    return re.sub(r'#[^\s]*', _split_hashtag, without_stray_hashes)

## Remove punctuation

In [9]:
def RemovePunctuation(tweet):
    """Expand abbreviated years, then delete all ASCII punctuation.

    Args:
        tweet: Text of a single tweet.

    Returns:
        str: The tweet with every character of ``string.punctuation``
        removed. Before that, an apostrophe followed by 0 or 1 is replaced
        by ``20`` and an apostrophe followed by 2-9 is replaced by ``19``
        (so '09 becomes 2009 and '95 becomes 1995).
    """
    # The year expansion must run first, while the apostrophes still exist.
    century_rules = (
        (r'\'([0-1])', r'20\1'),
        (r'\'([2-9])', r'19\1'),
    )
    for pattern, expansion in century_rules:
        tweet = re.sub(pattern, expansion, tweet)
    # Punctuation is deleted without inserting a space, so words joined by
    # punctuation get fused (Sales/Development becomes SalesDevelopment).
    # Re-segmenting the text afterwards would be one way to split them again.
    return tweet.translate(str.maketrans('', '', punctuation))

## Cleaning data

In [10]:
def CleanData(tweet):
    """Run the complete cleaning pipeline on one tweet.

    Args:
        tweet: Raw text of a single tweet.

    Returns:
        str: The tweet after URL, emoji, character-error, mention, hashtag
        and punctuation handling, with every run of whitespace collapsed
        into a single space.
    """
    # The order matters: for example, URLs have to go before punctuation is
    # stripped, and hashtags are segmented before '#' would be deleted.
    cleaning_steps = (
        RemoveURL,
        RemoveEmoji,
        RemoveThrash,
        RemoveAt,
        RemoveHash,
        RemovePunctuation,
    )
    for step in cleaning_steps:
        tweet = step(tweet)
    # In the end replace all whitespaces with just one space
    return re.sub(r'[\s]+', ' ', tweet)

## Main function: read the Twitter data contained in input_directory, clean every tweet, and write the result to output_directory

In [11]:
def ProcessFolder(root_directory, input_directory, output_directory):
    """Clean every profile listed in ``truth.txt`` and write it to a text file.

    For each row of ``<root>/<input>/truth.txt`` the first ``:::``-separated
    field is used as a file base name: ``<name>.xml`` is read from the input
    directory, every tweet in it is cleaned with ``CleanData``, and the
    result is written, one tweet per line, to ``<name>.txt`` in the output
    directory.

    Args:
        root_directory: Directory that contains both other directories.
        input_directory: Sub-directory holding ``truth.txt`` and the XML files.
        output_directory: Sub-directory for the cleaned text files; it is
            created when it does not exist yet.
    """
    input_path = os.path.join(root_directory, input_directory)
    output_path = os.path.join(root_directory, output_directory)
    # makedirs also creates missing parent directories, and exist_ok avoids
    # the race between checking for the directory and creating it.
    os.makedirs(output_path, exist_ok=True)
    # Read labels (and file names). dtype=str keeps names that consist of
    # digits only as strings, so they can be used to build file names.
    truth = pd.read_csv(
        os.path.join(input_path, 'truth.txt'),
        sep=":::",
        header=None,
        engine='python',
        dtype=str,
    )
    # Iterate over all user names, and process the corresponding files
    for profile_name in truth[0]:
        source_file = os.path.join(input_path, profile_name + '.xml')
        target_file = os.path.join(output_path, profile_name + '.txt')
        with codecs.open(target_file, "w", "utf-8-sig") as text_file:
            for tweet in LoadProfile(source_file):
                text_file.write(CleanData(tweet['data']) + '\n')

In [12]:
#Run the whole cleaning pipeline on the English training profiles
ProcessFolder(
    root_directory='../pan19-author-profiling-training-2019-01-28/',
    input_directory='en',
    output_directory='en_clean',
)
