# In [2]:
#---------------------------------------------------------------------
#This code reads a .json file with Twitter data, cleans it and
#writes cleaned data into a file
#---------------------------------------------------------------------

import re
import ijson
from nltk.corpus import words

#---------------------------------------------------------------------
#Function for splitting sentences
#---------------------------------------------------------------------

def split_into_sentences(text):
    """Split a tweet-like text into a list of stripped sentences.

    The text is normalised first: newlines become spaces, ``&amp;`` is
    decoded, and everything from the first ``http``, ``www`` or ``#``
    up to the end of the text is removed. Periods that do not end a sentence
    (titles, single initials, acronyms, company suffixes, website domains)
    are temporarily masked as ``<prd>``; every remaining ``.``, ``?`` and
    ``!`` is then treated as a sentence boundary.

    Args:
        text: The raw text to split.

    Returns:
        A list of sentences with surrounding whitespace removed. Text after
        the last terminator is returned as a final sentence when it is not
        empty, so input without closing punctuation is not lost.
    """
    # Raw strings keep the regex escape "\s" from being read as an
    # (invalid) Python string escape.
    caps = r"([A-Z])"
    prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
    suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
    starters = r"(Mr|Rev|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
    acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
    websites = r"[.](com|net|org|io|gov)"

    # Pad so the space-anchored patterns below also match at the edges.
    text = " " + text + "  "
    text = text.replace("\n\n", " ")
    text = text.replace("\n", " ")
    # NOTE(review): "amp&;" is kept from the original; presumably it guards
    # against a malformed entity - confirm it is still needed.
    text = text.replace("amp&;", "&")
    text = text.replace("&amp;", "&")
    # Upper-case the time markers so the two-letter acronym rule masks their
    # periods.
    text = text.replace("p.m.", "P.M.")
    text = text.replace("a.m.", "A.M.")
    # Newlines are already gone, so ".*" removes the rest of the text.
    text = re.sub(r"http.*", "", text)
    text = re.sub(r"www.*", "", text)
    text = re.sub(r"#.*", "", text)

    # Mask periods that are not sentence boundaries.
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + caps + "[.] ", r" \1<prd> ", text)
    # An acronym directly followed by a typical sentence starter does end
    # the sentence.
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(caps + "[.]" + caps + "[.]" + caps + "[.]",
                  r"\1<prd>\2<prd>\3<prd>", text)
    text = re.sub(caps + "[.]" + caps + "[.]", r"\1<prd>\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + caps + "[.]", r" \1<prd>", text)

    # Move a terminator out of a closing quote so the quote stays attached
    # to its sentence.
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")

    # Mark the real boundaries, then restore the masked periods.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    sentences = [s.strip() for s in text.split("<stop>")]
    # The last fragment is only padding when the text ended with a
    # terminator; otherwise it is real trailing text and must be kept.
    if sentences and not sentences[-1]:
        sentences.pop()
    return sentences

#---------------------------------------------------------------------
#Make .json files readable for ijson
#---------------------------------------------------------------------

# Input/output locations; the final corpus name is derived from the edited
# JSON file name.
path = '..\\data\\'
input_filename = 'twitter_stream_example.json'
output_filename = 'twitter_stream_example_edited.json'
final_topic_corpus_name = output_filename + '.txt'

input_json_file = path + input_filename
output_json_file = path + output_filename

# The input holds one JSON object per line. Rewrite it as a single JSON array
# ("[obj," / "obj," / ... / "obj]", one object per line) so that ijson can
# iterate over its items.
with open(input_json_file) as infile:
    # Lines shorter than 100 characters (newline included) are skipped;
    # presumably these are blank or non-tweet lines - TODO confirm.
    kept_lines = [line.strip() for line in infile if len(line) >= 100]

output = [record + ',' for record in kept_lines]
if output:
    # Bracket the first and last KEPT records. Keying the "[" on the first
    # line of the file would yield invalid JSON whenever that line is skipped.
    output[0] = '[' + output[0]
    output[-1] = output[-1][:-1] + ']'  # replace the trailing "," with "]"
else:
    # Nothing kept: still produce a valid (empty) JSON array.
    output = ['[]']

with open(output_json_file, 'w') as outfile:
    for item in output:
        print(item, file=outfile)
        
#---------------------------------------------------------------------
#Read the .json file and process its data with ijson
#---------------------------------------------------------------------

topic_corpus_file = output_json_file

# Load every element of the top-level JSON array into memory.
with open(topic_corpus_file, 'r') as f:
    objects = ijson.items(f, 'item')
    columns = list(objects)

# Collect the full text of tweets whose author has between 300 and 1500
# friends (exclusive) and more than 300 followers.
raw_topic_tweets = []
for col in columns:
    user = col["user"]
    # followers_count is only looked up when the friends_count test passes.
    if not (300 < user["friends_count"] < 1500 and user["followers_count"] > 300):
        continue
    try:
        full_text = col["extended_tweet"]["full_text"]
    except KeyError:
        # Tweets without an extended full text are skipped.
        continue
    raw_topic_tweets.append(full_text)
            
#---------------------------------------------------------------------
#Clean topic corpus and split into sentences
#---------------------------------------------------------------------

topic_sentences = []
raw_topic_corpus = []

# Tweets containing any of these substrings are skipped
# (presumably to filter out advertising-like tweets - TODO confirm).
spam_markers = ('deal', '24', 'order', 'price', '$', 'sale')

# Create a list of sentence lists out of the remaining tweets.
for tweet in raw_topic_tweets:
    tweet_text = str(tweet)
    if not any(marker in tweet_text for marker in spam_markers):
        topic_sentences.append(split_into_sentences(tweet))

# Flatten the list of topic sentences.
flat_topic_corpus = [item for sublist in topic_sentences for item in sublist]

# Patterns used to re-join numbers that were split at their decimal point.
numbers = re.compile("[0-9]")
number_plus_dot = re.compile(r"[0-9]\.")

# Repair fragments produced by splitting on every ".", "?" and "!".
for sentence in flat_topic_corpus:
    # Empty string while nothing has been collected yet, so the checks below
    # never index into an empty list.
    previous = raw_topic_corpus[-1] if raw_topic_corpus else ""
    if sentence == ".":
        # Re-attach a separated dot of a "..", "..." run to its sentence.
        if previous.endswith("."):
            raw_topic_corpus[-1] = previous + "."
    elif sentence in ("!", "?"):
        # Re-attach a separated repeated "!" or "?" to its sentence.
        if raw_topic_corpus:
            raw_topic_corpus[-1] = previous + sentence
    elif numbers.match(sentence[:1]) and number_plus_dot.match(previous[-2:]):
        # "... 3." followed by "5 ..." was one number split at its decimal
        # point: merge the two fragments again.
        raw_topic_corpus[-1] = previous + sentence
    else:
        # Regular sentence. This also covers sentences that start with a
        # digit but cannot be merged, which must not be dropped.
        raw_topic_corpus.append(sentence)
        
#---------------------------------------------------------------------
#Write topic corpus to file
#---------------------------------------------------------------------

# Matches words that start with "http:" or "https:". The previous pattern
# "https:*" meant "https followed by any number of colons" and therefore
# never matched plain "http://" links.
remove_url = re.compile(r"https?:")

# Write one cleaned sentence per line. The "with" block closes the file even
# if writing fails part-way through.
with open('..\\data\\' + final_topic_corpus_name, 'w', encoding='utf-8') as file:
    for sentence in raw_topic_corpus:
        # Only keep sentences that consist purely of ASCII characters.
        try:
            sentence.encode('ascii')
        except UnicodeEncodeError:
            continue

        # Skip short, spam-like sentences.
        if len(sentence) <= 20:
            continue

        kept_words = []
        for word in sentence.split():
            # Drop links and @mentions.
            if remove_url.match(word) or word.startswith('@'):
                continue
            # Decode a stand-alone '&amp;' entity.
            kept_words.append('&' if word == '&amp;' else word)

        # Every word is followed by a single space, as before.
        file.write("".join(word + " " for word in kept_words) + "\n")