In [1]:
import pandas as pd
from tqdm import tqdm
import re
import spacy
from unidecode import unidecode
import csv
# import nltk.data

# Shared spaCy pipeline; used further down to split article text into sentences.
# NOTE(review): 'en' looks like a spaCy shortcut name rather than a full model
# package name (e.g. 'en_core_web_sm') — confirm it resolves with the installed
# spaCy version.
nlp = spacy.load('en')
# tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')

In [2]:
# Load the Daily Sun articles frame from a pickle file.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
dsun = pd.read_pickle('Data/Daily Sun/DailySun_ent_1.pkl')

In [3]:
def remove_smart_punctuations(text):
    """Return ``text`` after passing it through ``unidecode``.

    Args:
        text: The string to convert.

    Returns:
        Whatever ``unidecode(text)`` produces for the given string.
    """
    ascii_text = unidecode(text)
    return ascii_text
#     text.replace("‘", "\'").replace("’", "\'").replace("“", "\"").replace("”", "\"").replace("–", "-")

In [4]:
def remove_smart_from_contracted(text):
    """Normalise curly single quotes in ``text``.

    A right curly quote that introduces one of the contraction endings
    (m, s, re, ve, t, d, ll) becomes a straight apostrophe. Every curly single
    quote that is left over (opening or closing) becomes a straight double
    quote.

    Args:
        text: The string to normalise.

    Returns:
        A new string with the replacements applied.
    """
    contraction_endings = ("m", "s", "re", "ve", "t", "d", "ll")

    result = text
    for ending in contraction_endings:
        result = result.replace("’" + ending, "'" + ending)

    # Anything still curly at this point is treated as a quotation mark.
    for curly_quote in ("’", "‘"):
        result = result.replace(curly_quote, "\"")
    return result

In [5]:
def check_alleged(text):
    """Tell whether the whole word ``alleged`` occurs in ``text``.

    The match is case-sensitive and bounded by word boundaries, so
    "allegedly" or "Alleged" do not count.

    Args:
        text: The string to search.

    Returns:
        True if the word is present, False otherwise.
    """
    return re.search(r'\balleged\b', text) is not None

# Words whose presence in a sentence marks it as reported speech.
conversational_words = [
    'said',
    'told',
    'asked',
    'speak',
    'say',
    'tell',
    'spoke',
    'added',
    'declare',
]

In [6]:
def fix_quotes(line):
    """Join paragraphs that belong to one quotation split across paragraphs.

    The text is split on single newlines and walked two entries at a time,
    i.e. it assumes paragraphs are separated by exactly one blank line so that
    paragraphs sit at even positions of the split. A paragraph holding exactly
    one double quote is taken to open a quotation that continues in the next
    paragraph; when that next paragraph starts with a double quote, its leading
    quote is replaced by a space and the two paragraphs (and the blank entry
    between them) are concatenated.

    Args:
        line: The article text.

    Returns:
        The text with continued quotations merged, trailing blank entries
        dropped, and every remaining blank entry rendered as two newlines.
        Text that contains no newline is returned unchanged.
    """
    # Nothing to merge in a single-paragraph text; hand it back as-is instead
    # of falling off the end of the function and returning None.
    if "\n" not in line:
        return line

    arr = line.split("\n")
    i = 0
    quote_left_open = False
    while i < len(arr):
        if arr[i] == "":
            i += 2  # Paragraphs are separated by two newlines, so step by 2.
            continue

        # A lone double quote means the quotation is still open at the end of
        # this paragraph; remember that and look at the next paragraph.
        if arr[i].count("\"") == 1 and not quote_left_open:
            quote_left_open = True
            i += 2
            continue

        # The next paragraph re-opens with a quote: it continues the pending
        # quotation, so fold it into the paragraph two entries back.
        if arr[i][0] == '"' and quote_left_open:
            arr[i] = ' ' + arr[i][1:]
            arr[i - 2:i + 1] = [''.join(arr[i - 2:i + 1])]
            quote_left_open = False
            # The list just shrank by two entries. Step back onto the merged
            # paragraph so it is re-examined: if it still holds an unmatched
            # quote the following paragraph is merged too, and no paragraph is
            # skipped over.
            i -= 2
            continue

        quote_left_open = False
        i += 2

    # Drop blank entries left at the end of the text.
    while arr and (arr[-1] == " " or arr[-1] == ''):
        arr.pop()

    # Re-assemble: each blank entry stands for a paragraph break.
    return "".join(
        "\n" + "\n" if (part == ' ' or part == '') else part
        for part in arr
    )

In [7]:
def fix_space_after_dot(text):
    """Put a space after every dot that is directly followed by non-whitespace.

    Note that this applies to every such dot, including those inside numbers
    or abbreviations.

    Args:
        text: The string to fix.

    Returns:
        A new string with ". " substituted for each matching dot.
    """
    dot_before_nonspace = re.compile(r"\.(?=\S)")
    return dot_before_nonspace.sub(". ", text)

In [8]:
def fix_multiple_space(text):
    """Collapse every run of consecutive space characters into a single space.

    Only the space character is affected; tabs and newlines are left alone.

    Args:
        text: The string to clean.

    Returns:
        A new string without repeated spaces.
    """
    return re.sub(' +', ' ', text)

In [9]:
def find_quotes(text):
    """Extract the sentences of ``text`` that contain a double-quoted passage.

    Two shapes are recognised:

    * a quoted passage followed by more text up to the next ``.``, ``?`` or
      ``!`` (e.g. ``He said "hi" to me.``);
    * a quoted passage whose closing quote comes right after the terminating
      ``.``, ``?`` or ``!`` (e.g. ``"Stop it."``).

    Args:
        text: The article text to scan.

    Returns:
        A list with one stripped string per match, in order of appearance.
    """
    # Raw string: the pattern contains backslash escapes that are not valid
    # Python string escapes and would otherwise trigger a warning.
    regx = re.compile(r'([^\.\?\!]*?".+?".*?[\.\?\!])|([^\.\?\!]*?".+?[\.\?\!]")')

    quoted_sentence = []
    # findall yields one (group 1, group 2) pair per match; the pattern is an
    # alternation, so exactly one of the two groups is non-empty.
    for trailing_text_form, closing_quote_form in regx.findall(text):
        quoted_sentence.append((trailing_text_form or closing_quote_form).strip())

    return quoted_sentence

In [10]:
def entities_in_quote(content, locations, organizations, persons):
    """Pick out the known entities that are mentioned in ``content``.

    An entity counts as mentioned when it occurs in ``content`` as a plain
    substring, so a short entity also matches inside a longer word.

    Args:
        content: The quote or sentence to inspect.
        locations: Iterable of location names to look for.
        organizations: Iterable of organization names to look for.
        persons: Iterable of person names to look for.

    Returns:
        A 4-tuple ``(content, loc, org, per)`` where ``loc``, ``org`` and
        ``per`` are lists of the locations, organizations and persons found,
        in their input order. The lists are empty when nothing matches; the
        tuple is always returned so callers can unpack it unconditionally.
    """
    loc = [entity for entity in locations if entity in content]
    org = [entity for entity in organizations if entity in content]
    per = [entity for entity in persons if entity in content]
    return content, loc, org, per

In [11]:
# Display the (rows, columns) size of the loaded frame.
dsun.shape

(328748, 12)

### Column names in source:


['category', 'date_published', 'image', 'news_content', 'news_id',
       'newspaper', 'reporter', 'tags', 'title', 'url', 'location_entities',
       'organization_entities', 'person_entities']

In [12]:
# df = pd.DataFrame(columns=['_id', 'timestamp', 'quote', 'locations', 'organizations', 'persons', 'headline', 'keywords', 'category'])

# # counter = 0
# for index, row in tqdm(dsun.iterrows(), total=dsun.shape[0]):

# #     counter += 1
# #     if counter == 100:
# #         break
#     try:
#         _id = row['news_id']
#         timestamp = row['date_published']
#         content = row['news_content']
#         locations = row['location_entities']
#         organizations = row['organization_entities']
#         persons = row['person_entities']
#         headline = row['title']
#         keywords = row['tags']
#         category = row['category']
#         if content:
#             # content = fix_quotes(remove_smart_punctuations(content)) # fixes the problem with non-matching quotation marks due to continutation in next para
#             # content = fix_space_after_dot(content)
#             content = remove_smart_punctuations(remove_smart_from_contracted(content))
#             quotes = find_quotes(content) # finds all the quotes in the content as a list

#             # Checks for quotation marks
#             for quote in quotes:
#                 if quote:
#                     text, loc, org, per = entities_in_quote(quote, locations, organizations, persons)

#                 if (len(loc + org + per) >= 2):
#                     df = df.append(pd.Series([_id, timestamp, text, loc, org, per, headline, keywords, category],
#                                             index = ['_id', 'timestamp', 'quote', 'locations', 'organizations', 'persons', 'headline', 'keywords', 'category']),  ignore_index=True)

#             # Checks for conversational words, ignoring 
#             doc = nlp(content)
#             period_delimited_list = [sent.string.strip() for sent in doc.sents]

#             for sentence in period_delimited_list:
#                 if any(word in sentence for word in conversational_words) or check_alleged(sentence):
#                     if quotes:
#                         sentence, loc, org, per = entities_in_quote(
#                                     sentence, locations, organizations, persons)
#                         if not any(sentence in s for s in quotes) and len(loc + org + per) >= 2:
#                                 df = df.append(pd.Series([_id, timestamp, sentence, loc, org, per, headline, keywords, category],
#                                             index = ['_id', 'timestamp', 'quote', 'locations', 'organizations', 'persons', 'headline', 'keywords', 'category']),  ignore_index=True)
#     except:
#         print(index)

In [13]:
# Extract quotations and reported-speech sentences from every article and
# write those mentioning enough known entities to a tab-separated file.
QUOTATIONS_PATH = 'Data/Processed Data/Quotations/DailySun-quotations.tsv'
QUOTATIONS_COLUMNS = ['_id', 'timestamp', 'quote', 'locations', 'organizations', 'persons', 'section']
MIN_ENTITIES = 2  # a quote/sentence is kept only if it mentions at least this many entities

# The file is opened once for the whole run instead of being reopened in append
# mode for every row. newline='' is what the csv module expects for files it
# writes to, and the explicit encoding keeps non-ASCII punctuation in the
# article text from depending on the platform default.
with open(QUOTATIONS_PATH, 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile, delimiter='\t')
    writer.writerow(QUOTATIONS_COLUMNS)

    for index, row in tqdm(dsun.iterrows(), total=dsun.shape[0]):
        try:
            _id = row['_id']
            timestamp = row['date_published']
            content = row['news_content']
            locations = row['location_entities']
            organizations = row['organization_entities']
            persons = row['person_entities']
            section = row['section']
            if not content:
                continue

            content = fix_multiple_space(content).replace("\\", "")
            quotes = find_quotes(content)  # all quoted sentences in the article

            # Pass 1: sentences that contain quotation marks.
            for quote in quotes:
                if not quote:
                    continue
                text, loc, org, per = entities_in_quote(quote, locations, organizations, persons)
                # Kept inside the guard above so an empty quote can never reuse
                # the entity lists of the previous iteration.
                if len(loc + org + per) >= MIN_ENTITIES:
                    writer.writerow([_id, timestamp, text, loc, org, per, section])

            # Pass 2 only emits rows for articles that have at least one
            # quote, so skip the expensive spaCy parse for all the others.
            if not quotes:
                continue

            # Pass 2: sentences with a conversational word (or "alleged") that
            # are not already part of a quote found in pass 1.
            doc = nlp(content)
            for sent in doc.sents:
                # .text replaces the deprecated .string accessor; both give the
                # same value once stripped.
                sentence = sent.text.strip()
                is_reported_speech = (
                    any(word in sentence for word in conversational_words)
                    or check_alleged(sentence)
                )
                if not is_reported_speech:
                    continue

                sentence, loc, org, per = entities_in_quote(
                    sentence, locations, organizations, persons)
                already_quoted = any(sentence in s for s in quotes)
                if not already_quoted and len(loc + org + per) >= MIN_ENTITIES:
                    writer.writerow([_id, timestamp, sentence, loc, org, per, section])
        except Exception as exc:
            # Best effort: report the failing row and carry on with the next
            # one. Unlike a bare except, this still lets Ctrl-C stop the run.
            print(index, repr(exc))

100%|██████████| 328748/328748 [11:43:10<00:00,  7.79it/s]
