# In [1]:
#---------------------------------------------------------------------
#This code cleans and splits the Trump corpus
#---------------------------------------------------------------------

import re

#---------------------------------------------------------------------
#Function for splitting sentences
#---------------------------------------------------------------------

def split_into_sentences(text):
    """Split a piece of text (here: one tweet) into a list of sentences.

    The text is first normalised: newlines become spaces, "&amp;" is decoded,
    and everything from the first "http", "www" or "#" up to the end of the
    text is removed. Periods that do not end a sentence (titles, single
    initials, acronyms, company suffixes, web domains, "Ph.D.") are
    temporarily replaced by a "<prd>" marker. Every remaining ".", "?" and "!"
    is then treated as a sentence boundary.

    Args:
        text: The raw text to split.

    Returns:
        A list of whitespace-stripped sentences, each ending in ".", "?" or
        "!". Repeated punctuation produces extra one-character entries
        (e.g. "Wow..." -> ["Wow.", ".", "."]). Whatever follows the last
        terminator is discarded, so text without any terminator yields [].
    """
    # Regex fragments are raw strings so that "\s" is not an invalid
    # string escape sequence.
    caps = r"([A-Z])"
    prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
    suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
    starters = r"(Mr|Rev|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
    acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
    websites = r"[.](com|net|org|io|gov)"

    # Pad with spaces so the space-anchored patterns below also match at the
    # very start and end of the text.
    text = " " + text + "  "
    text = text.replace("\n\n", " ")
    text = text.replace("\n", " ")
    text = text.replace("amp&;", "&")
    text = text.replace("&amp;", "&")
    # Upper-case so the acronym rules below protect these periods.
    text = text.replace("p.m.", "P.M.")
    text = text.replace("a.m.", "A.M.")

    # Links and hashtags: drop them together with the rest of the text.
    text = re.sub(r"http.*", "", text)
    text = re.sub(r"www.*", "", text)
    text = re.sub(r"#.*", "", text)

    # Protect periods that are not sentence boundaries.
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + caps + "[.] ", r" \1<prd> ", text)
    # An acronym followed by a typical sentence starter does end a sentence.
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(caps + "[.]" + caps + "[.]" + caps + "[.]", r"\1<prd>\2<prd>\3<prd>", text)
    text = re.sub(caps + "[.]" + caps + "[.]", r"\1<prd>\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + caps + "[.]", r" \1<prd>", text)

    # Move a terminator outside a closing quote so the quote stays attached
    # to the sentence it closes.
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")

    # Mark the real sentence boundaries, then restore the protected periods.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    # The last piece is whatever follows the final terminator (at least the
    # padding); it is always discarded.
    pieces = text.split("<stop>")[:-1]
    return [piece.strip() for piece in pieces]

#---------------------------------------------------------------------
#Clean the Trump corpus
#---------------------------------------------------------------------

#read the raw corpus; the "with" block closes the file even if reading fails
with open('..\\data\\trump_corpus_14012018.txt', encoding = 'utf-8') as trump_corpus_file:
    trump_corpus = trump_corpus_file.read()

#SECURITY NOTE: eval() executes whatever Python expression the file contains.
#Only run this on a corpus file you trust; if the file is a plain literal,
#ast.literal_eval would be a safer way to parse it.
trump_list = eval(trump_corpus) #list of dicts

trump_tweets = [] #from newest to oldest tweet
rev_trump_tweets = [] #from oldest to newest tweet
cleaned_trump_corpus = []
trump_sentences = []
final_trump_corpus = []

#create list of Trump tweets
#The slice drops the LAST 2000 entries of trump_list. Assuming the list is
#ordered newest to oldest, these are the 2000 oldest tweets (mostly not
#written by Trump himself).
for tweet in trump_list[:-2000]:

    #skip retweets ("RT"), "RE" posts and replies ("@"); startswith() is used
    #instead of indexing so that an empty text does not raise IndexError
    if not tweet['text'].startswith(("RT", "RE", "@")):
        trump_tweets.append(tweet['text'])

#merges ... separated tweets
#put the tweets into chronological order (oldest first) so that a tweet
#ending in "..." is directly followed by its continuation
rev_trump_tweets.extend(reversed(trump_tweets))

#Stop one short of the end: the last tweet has no successor to merge into
#(indexing index+1 there would raise IndexError).
for index in range(len(rev_trump_tweets) - 1):
    tweet = rev_trump_tweets[index]
    continuation = rev_trump_tweets[index + 1]

    if tweet.endswith("...") and continuation.startswith("..."):
        #Strip the separator dots from each side independently; the two
        #tweets do not necessarily use the same number of dots.
        #The merged text is stored in the successor's slot so that chains of
        #three or more tweets keep merging on the following iterations.
        rev_trump_tweets[index + 1] = tweet.rstrip(".") + ' ' + continuation.lstrip(".")
        rev_trump_tweets[index] = "&DELETE&" #mark for later deletion

#remove the marked tweets and clean the tweets
#Tweets starting with one of these prefixes are quotes, replies or reposts
#(mostly just "thanks") and are left out of the cleaned corpus.
_skipped_prefixes = ('“@', '"@', 'Via', '“', '"', "@")

for tweet in rev_trump_tweets:
    #placeholder left behind when a tweet was merged into its continuation
    if tweet == "&DELETE&":
        continue
    if tweet.startswith(_skipped_prefixes):
        continue
    cleaned_trump_corpus.append(tweet)
        
#---------------------------------------------------------------------
#Split into sentences
#---------------------------------------------------------------------

#split every cleaned tweet into sentences (one list of sentences per tweet)
trump_sentences.extend(map(split_into_sentences, cleaned_trump_corpus))

#flatten the per-tweet lists into one list of sentences, keeping their order
flat_trump_corpus = []
for tweet_sentences in trump_sentences:
    flat_trump_corpus.extend(tweet_sentences)

#handle the multiple use of ".", "?" and "!" and decimal numbers
#(raw strings so that "\." is not an invalid string escape sequence)
numbers = re.compile(r"[0-9]")
number_plus_dot = re.compile(r"[0-9]\.")

#clean output: glue fragments produced by the sentence splitter back together
for sentence in flat_trump_corpus:
    if not sentence:
        continue #nothing to add or merge

    #the sentence a fragment could be glued onto (None while the corpus is empty)
    previous = final_trump_corpus[-1] if final_trump_corpus else None

    if sentence == ".": #add separated multiple dots back to the sentence
        #a lone dot that does not continue a run of dots is dropped
        if previous is not None and previous.endswith("."):
            final_trump_corpus[-1] = previous + "."
    elif sentence == "!" or sentence == "?": #add separated multiple "!" / "?" back to the sentence
        if previous is not None:
            final_trump_corpus[-1] = previous + sentence
    elif (numbers.match(sentence) and previous is not None
            and number_plus_dot.match(previous[-2:])):
        #previous sentence ends with "<digit>." and this one starts with a
        #digit: a decimal number was split in two, so merge without a space
        final_trump_corpus[-1] = previous + sentence
    else:
        #regular sentence; this includes sentences that start with a digit
        #but do not continue a split number (they must not be dropped)
        final_trump_corpus.append(sentence)

#write sentences into file, one sentence per line
#The "with" block closes the file even if a write fails unexpectedly.
#NOTE: the file is opened with the platform's default encoding.
with open("..\\data\\final_trump_corpus.txt","w") as trump_file_output:
    for sentence in final_trump_corpus:
        try:
            trump_file_output.write(str(sentence)+"\n")
        except UnicodeEncodeError:
            #best effort: a sentence containing characters the output
            #encoding cannot represent is left out of the file
            continue