# Scraping evaluation using GDELT v2 Events URLs (both EN + ML)

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import re
import sys
import csv
import nltk
import scrapy
import warnings
import unidecode
import numpy as np
import pandas as pd
import nest_asyncio
import plotly_express as px
import scrapy.crawler as crawler

from functools import partial
from newspaper import Article
from twisted.internet import reactor
from scrapy.crawler import CrawlerProcess
from multiprocessing import Process, Queue
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

nltk.download('punkt')
nest_asyncio.apply()

pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 500)

# pd.set_option('display.max_colwidth', -1)
pd.options.mode.chained_assignment = None

SPACE_REGEX = re.compile(r"\s+")
REGEX_TOKENIZER = re.compile(r'\w+')
LAT_LONG_REGEX = re.compile(r"[\#,]")

warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
# ==================================
# Locations of data, models, results
# ==================================
# Project home comes from the HOME_PROJECT_X environment variable when it is
# set; otherwise the default Windows location is used.
HOME_DIR = os.environ.get('HOME_PROJECT_X', r'C:\ProjectX')

WORKSPACE_ROOT = os.path.join(HOME_DIR, 'workspace')
PATH_TO_PROJECT_X_REPO, PATH_TO_DATA_ROOT_DIR = (
    os.path.join(WORKSPACE_ROOT, subdir) for subdir in ('project_x', 'data')
)

In [None]:
# Add path to library to sys path
generic_utils_lib_dir = os.path.join(PATH_TO_PROJECT_X_REPO, 'common')

sys.path.extend([generic_utils_lib_dir])

from generic_utils import (downcast_datatypes, timing, create_output_dir, parallelize, create_output_dir)

In [None]:
# Matches one or more consecutive characters outside the 7-bit ASCII range (\x00-\x7F).
UNICODE_REGEX = re.compile(r'[^\x00-\x7F]+', re.UNICODE)

### Auxiliary methods

In [None]:
def get_news_article(url, verbose=False):
    """Download and parse the article at ``url`` and return its body text.

    Parameters
    ----------
    url : str
        Address handed to ``Article``.
    verbose : bool, default False
        When True, print the URL, authors, publish date, full text, keywords
        and summary of the article.

    Returns
    -------
    The ``text`` attribute of the processed article.
    """
    if verbose:
        print(url)

    article = Article(url)
    article.download()
    article.parse()
    # NOTE(review): nlp() runs on every call, yet keywords/summary are only
    # printed in verbose mode -- consider gating it if it proves expensive.
    article.nlp()

    if not verbose:
        return article.text

    separator = "=" * 40

    print(f"Authors: {article.authors}")
    print(f"Publish date: {article.publish_date}\n")

    print(separator)
    print(article.text)
    print(separator)

    print(f"Keywords: {article.keywords}")
    print(f"Summary: {article.summary}")

    return article.text


def squash_spaces(s, space_re=SPACE_REGEX) -> str:
    """Replace every match of ``space_re`` in ``s`` with a single space.

    Values that are not strings are handed back untouched.
    """
    if not isinstance(s, str):
        return s
    return re.sub(space_re, " ", s)


def preprocess_body_text(text, normalize=True):
    """Flatten a news body into a single, whitespace-normalised line.

    Parameters
    ----------
    text : object
        Raw body. Falsy values (``None``, ``''``) and float NaN are treated as
        "no text" and returned unchanged; anything else is converted with
        ``str()`` before cleaning.
    normalize : bool, default True
        When True, pass the cleaned text through ``unidecode.unidecode``.

    Returns
    -------
    The cleaned string, or ``text`` itself when there is nothing to clean.
    """
    # NaN is truthy, so without this explicit check a missing value coming
    # from pandas would be converted into the literal string 'nan'.
    is_nan = isinstance(text, float) and text != text
    if not text or is_nan:
        return text

    # Replacing possible issues with data. We can add or reduce the replacement in this chain
    s = re.sub(r'\n+', ' ', str(text))
    s = squash_spaces(s).strip()

    if normalize:
        # Normalizing / encoding the text
        s = unidecode.unidecode(s)

    return s


def clean_lat_long(x):
    """Convert a string coordinate to float after removing ``LAT_LONG_REGEX`` matches.

    Non-string values are returned as they are.
    """
    if not isinstance(x, str):
        return x
    return float(LAT_LONG_REGEX.sub("", x))
    

def line_contains_only_digits(x):
    """Return ``x.isdecimal()`` for strings; every non-string value yields True."""
    if not isinstance(x, str):
        return True
    return x.isdecimal()


def line_contain_chars(l):
    """Tell whether the string ``l`` holds at least one ASCII letter; non-strings give False."""
    if not isinstance(l, str):
        return False
    return re.search("[A-Za-z]+", l) is not None


def line_contains_unicode(l):
    """Tell whether the string ``l`` has at least one ``UNICODE_REGEX`` match; non-strings give False."""
    if not isinstance(l, str):
        return False
    return UNICODE_REGEX.search(l) is not None

In [None]:
# Maps an HTTP status code (as a string) to "<status class> - <reason phrase>".
# The mapping is assembled from one table per status class, in ascending code order.
http_error_codes = {
    str(code): f'{status_class} - {phrase}'
    for status_class, phrases in (
        ('Informational', {
            100: 'Continue',
            101: 'Switching Protocols',
        }),
        ('Successful', {
            200: 'OK',
            201: 'Created',
            202: 'Accepted',
            203: 'Non-Authoritative Information',
            204: 'No Content',
            205: 'Reset Content',
            206: 'Partial Content',
        }),
        ('Redirection', {
            300: 'Multiple Choices',
            301: 'Moved Permanently',
            302: 'Found',
            303: 'See Other',
            304: 'Not Modified',
            305: 'Use Proxy',
            307: 'Temporary Redirect',
        }),
        ('Client Error', {
            400: 'Bad Request',
            401: 'Unauthorized',
            402: 'Payment Required',
            403: 'Forbidden',
            404: 'Not Found',
            405: 'Method Not Allowed',
            406: 'Not Acceptable',
            407: 'Proxy Authentication Required',
            408: 'Request Timeout',
            409: 'Conflict',
            410: 'Gone',
            411: 'Length Required',
            412: 'Precondition Failed',
            413: 'Request Entity Too Large',
            414: 'Request-URI Too Long',
            415: 'Unsupported Media Type',
            416: 'Requested Range Not Satisfiable',
            417: 'Expectation Failed',
        }),
        ('Server Error', {
            500: 'Internal Server Error',
            501: 'Not Implemented',
            502: 'Bad Gateway',
            503: 'Service Unavailable',
            504: 'Gateway Timeout',
            505: 'HTTP Version Not Supported',
        }),
    )
    for code, phrase in phrases.items()
}

---
# 1. Loading unique URLs from GDELT v2 Events EN + ML feeds
---

The data was prepared in the `gdelt_v2_events_data_eda.ipynb` notebook.

In [None]:
# Date range; its two dates are embedded in the name of the URL file loaded below
start, end = (pd.to_datetime(ts) for ts in ('2020-01-01 00:00:00', '2022-01-20 09:00:00'))

# GDELT data directory and, inside it, the folder that receives the scraped news
path_to_gdelt = os.path.join(PATH_TO_DATA_ROOT_DIR, 'data_providers/gdelt')
path_to_scraped_data = os.path.join(path_to_gdelt, 'scraped_news')

In [None]:
# The parquet file name carries the start and end dates of the extract
fn = f'events_v02_en_ml_urls_{start.date()}_{end.date()}.parquet'
path_to_urls = os.path.join(path_to_gdelt, fn)

print(f"Loading GDELT v2.0 unique URLs from {path_to_urls}")
events_v02_grouped = pd.read_parquet(path_to_urls, engine='auto', columns=None)

**Let's filter to more recent URLs**

In [None]:
# Keep only the source URLs whose 'dateadded_min' is later than the cutoff
recent_cutoff = pd.to_datetime('2021-09-01 09:00:00')
is_recent = events_v02_grouped['dateadded_min'] > recent_cutoff
unique_urls_recent = events_v02_grouped.loc[is_recent, 'sourceurl'].values

print(len(unique_urls_recent))

In [None]:
# Draw 200000 random positions (with replacement, fixed seed) into the recent URLs.
# randint excludes the upper bound, so every index is valid; the previously used
# random_integers is deprecated and includes the upper bound, which could yield
# the out-of-range index len(unique_urls_recent).
np.random.seed(18)
idx = np.random.randint(0, len(unique_urls_recent), 200000)
URLS = list(unique_urls_recent[idx])

# URLS = ['https://www.msn.com/en-us/news/world/enhancing-the-potential-of-people-with-disabilities/ar-AARJaWx']

## Example of URLs with different errors

# URLS = [
#     'https://www.insurancejournal.com/jobs/633146-inside-represented-moderate-casualty-adjuster-remote',
#     'https://omaha.com/news/national/mixed-feelings-in-el-paso-about-looser-texas-gun-limits/article_9fc209fc-6084-59de-a1f9-8987e7789fce.html',
#     'https://www.bizpacreview.com/2021/11/13/close-biden-ally-implies-president-may-not-seek-second-term-for-whatever-reason-boosts-kamala-1162706/',
#     'https://omaha.com/news/national/mixed-feelings-in-el-paso-about-looser-texas-gun-limits/article_9fc209fc-6084-59de-a1f9-8987e7789fce.html',
#     'http://www.independent.com.mt/articles/2021-09-25/blogs-opinions/Strong-words-in-September-6736236989',
#     'https://indiankanoon.org/doc/112114664/'
# ]

---
# 2. Scraping
---

This is based on the original implementation by Jad (see https://github.com/BaseOperations/data-automation/blob/master/news_index/English-version/media_scraper/parse_text.py)

In [None]:
class NewsSpider(scrapy.Spider):
    """Spider that requests every URL in ``URLS`` and logs one CSV row per response.

    Each row holds the request URL, the HTTP status, the list of paragraph
    texts and a dict of Open Graph (``og:*``) meta tags. Responses outside
    200-206 are logged as well, with empty paragraphs and tags, so that
    failures can be analysed afterwards.
    """

    name = "news_collection"
    user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'
    # Statuses that are handed to parse() so that they end up in the CSV.
    # NOTE(review): listing the 3xx codes here presumably means redirects are
    # recorded rather than followed -- confirm this is intended.
    handle_httpstatus_list = [100, 101, 200, 201, 202, 203, 204, 205, 206, 300, 301, 302, 303, 304, 305, 307, 400, 401, 402,
                              403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 500, 501, 502, 503,
                              504, 505]
    # Download timeout in seconds. The original code stored it in an unused
    # local variable, so it never took effect; Scrapy reads this spider attribute.
    download_timeout = 10

    def __init__(self, *args, **kwargs):
        """Set the start URLs and make sure the output directory exists."""
        super().__init__(*args, **kwargs)

        self.start_urls = URLS

        self.path_to_scraped_data = path_to_scraped_data
        create_output_dir(self.path_to_scraped_data)

        self.filename = 'scraped_gdelt.csv'

    def parse(self, response, verbose=False):
        """Extract og tags and paragraph texts from ``response`` and append them to the CSV.

        Parameters
        ----------
        response : scrapy response
            Response delivered by Scrapy for one of the start URLs.
        verbose : bool, default False
            When True, print URL, status, tags and body of the response.
        """
        url = response.request.url
        status = response.status

        final_tags = {}
        paragraphs = []

        if status in range(200, 207):
            # Read the attributes through selectors rather than by slicing the
            # serialised tag, which only worked when `property` was directly
            # followed by `content` and `content` was the last attribute.
            for meta_tag in response.xpath("//meta[starts-with(@property, 'og:')][@content]"):
                og_property = meta_tag.xpath('@property').get()
                final_tags[og_property[len('og:'):]] = str(meta_tag.xpath('@content').get())

            # Prefer paragraphs inside <article>; fall back to the whole page.
            article_nodes = response.xpath('//article')
            text_root = article_nodes if article_nodes else response
            paragraphs = text_root.css('p::text').getall()

        if verbose:
            print("\n======================")
            print(f"URL: {url}")
            print(f"Status: {status}")
            print(f"Tags: {final_tags}")
            print(f"Body: {paragraphs}")

        # Explicit UTF-8: the feeds are multilingual and the platform default
        # encoding may not be able to represent the scraped text.
        output_path = os.path.join(self.path_to_scraped_data, self.filename)
        with open(output_path, 'a', newline='', encoding='utf-8') as myfile:
            writer = csv.writer(myfile)
            writer.writerow([url, status, paragraphs, final_tags])

def run_spider(spider):
    """
    Run ``spider`` to completion inside a separate child process.

    The crawl is started with a ``CrawlerRunner`` and the reactor is stopped
    once it finishes. The child reports back through a queue: ``None`` on
    success, otherwise the exception it caught, which is re-raised here.
    Presumably this lets the crawl be launched more than once per session.
    """
    def _crawl(outcome):
        try:
            deferred = crawler.CrawlerRunner().crawl(spider)
            deferred.addBoth(lambda _: reactor.stop())
            reactor.run()
            outcome.put(None)
        except Exception as exc:
            outcome.put(exc)

    outcome = Queue()
    worker = Process(target=_crawl, args=(outcome,))
    worker.start()
    # Read the outcome before joining the child process
    error = outcome.get()
    worker.join()

    if error is None:
        return
    raise error

In [None]:
%%time

# Safety switch: the spider is only launched when this flag is set to True.
run_scraping = False

if run_scraping:
    run_spider(NewsSpider)

Approximate run times by number of URLs scraped:

- 1000 - 1min 15s
- 10000 - 23 min
- 200000 - approx 7h

### Loading scraped URLs

In [None]:
import ast

filename = 'scraped_gdelt.csv'

scraping_analysis = pd.read_csv(os.path.join(path_to_scraped_data, filename), header=None,
                                names=['url', 'status', 'paragraphs', 'final_tags'])

scraping_analysis.drop_duplicates(subset=['url'], inplace=True)

# "<status> - <description>"; statuses missing from http_error_codes give NaN
status_as_str = scraping_analysis['status'].astype(str)
scraping_analysis['error_code'] = status_as_str + ' - ' + status_as_str.map(http_error_codes)

# The column holds the repr of a Python list of strings. It originates from scraped web
# pages, so it is parsed with ast.literal_eval (literals only) rather than eval.
# Missing cells are NaN, which is truthy, hence the explicit isinstance check.
scraping_analysis['paragraphs'] = scraping_analysis['paragraphs'].map(
    lambda x: ast.literal_eval(x) if isinstance(x, str) and x else np.nan)
mask = scraping_analysis['paragraphs'].notnull()
scraping_analysis.loc[mask, 'paragraphs'] = scraping_analysis.loc[mask, 'paragraphs'].map(lambda x: ' '.join(x).strip())

# Apply light preprocessing (non-null rows only, so missing values stay missing)
preprocess_body_news_simple = partial(preprocess_body_text, normalize=False)
scraping_analysis.loc[mask, 'paragraphs'] = scraping_analysis.loc[mask, 'paragraphs'].apply(preprocess_body_news_simple)
scraping_analysis['paragraphs'] = scraping_analysis['paragraphs'].replace({'': np.nan, '❌': np.nan})

# Len of news body
mask = scraping_analysis['paragraphs'].notnull()
# scraping_analysis.loc[mask, 'paragraphs_nchars'] = scraping_analysis.loc[mask, 'paragraphs'].str.len()
scraping_analysis.loc[mask, 'paragraphs_nwords'] = scraping_analysis.loc[mask, 'paragraphs'].str.split().apply(len)

# Adding source name: the host part of the URL, cut at the first '?' or ':'
# (raw string, so the pattern contains no invalid escape sequences)
scraping_analysis['source_name'] = \
    scraping_analysis['url'].str.split('://').str[1].str.split('/').str[0].str.split(r"[?:]").str[0]

cols_order = [
    'source_name', 'url', 'status', 'paragraphs', 'final_tags', 'error_code', 'paragraphs_nwords'
]
scraping_analysis = scraping_analysis[cols_order]

print(f"Number of news scraped: {scraping_analysis.shape[0]}")

print(f"PCT of news with no text (prior actually nullifying news with garbage text content): "
      f"{scraping_analysis['paragraphs'].isnull().sum() / scraping_analysis.shape[0]} ")

print(f"Number of unique sources covered: {scraping_analysis['source_name'].nunique()}")

scraping_analysis.sample(5, random_state=2017)

### Error analysis

In [None]:
scraping_analysis['error_code'].value_counts(normalize=True, dropna=False)

### Likely problematic scraping

In [None]:
# One row per distinct body text, with the URLs and sources that share it
grouped = scraping_analysis.groupby('paragraphs').agg(
    {'url': ['count', 'nunique', set],
     'source_name': ['nunique', set],
     'paragraphs_nwords': ['first']}).reset_index()

# Flatten the two-level column index into "<column>_<aggregation>" names
grouped.columns = ["_".join(part for part in levels if part) for levels in grouped.columns]

# Keep the bodies that occur for more than one URL, most frequent first
is_repeated = grouped['url_count'] > 1
grouped = grouped[is_repeated].sort_values('url_count', ascending=False).reset_index(drop=True)

# Likely garbage text in news' body
likely_scraper_problem = grouped['paragraphs'].tolist()

grouped

In [None]:
# Work on a copy so that scraping_analysis itself is left untouched.
scraping_analysis_filtered = scraping_analysis.copy()

# Rows whose body text is one of the texts listed in likely_scraper_problem.
mask = scraping_analysis_filtered['paragraphs'].isin(likely_scraper_problem)
print(f"Number of likely problematic scraping: {scraping_analysis_filtered[mask].shape}")

# Blank out the suspicious bodies together with their word counts.
scraping_analysis_filtered.loc[mask, 'paragraphs'] = np.nan
scraping_analysis_filtered.loc[mask, 'paragraphs_nwords'] = np.nan
# NOTE(review): 'paragraphs_nwords_bins' is not created on this frame in the cells above, so this
# assignment appears to add it as an all-NaN column -- confirm it is still needed.
scraping_analysis_filtered.loc[mask, 'paragraphs_nwords_bins'] = np.nan

# Fraction of rows without body text after the filtering.
print(scraping_analysis_filtered['paragraphs'].isnull().sum() / scraping_analysis_filtered.shape[0])

In [None]:
# Switch between the unfiltered and the filtered frame by (un)commenting one of the two lines below.
# df = scraping_analysis
df = scraping_analysis_filtered

# Rows whose body has at most 200 words
mask = df['paragraphs_nwords'] <= 200

print(df.loc[mask].shape)
# The 100 most frequent body texts among those rows
df.loc[mask, 'paragraphs'].value_counts().head(100)

In [None]:
# df = scraping_analysis
df = scraping_analysis_filtered

# Frequency table with one row per distinct word count. rename_axis + reset_index(name=...)
# produces the columns ['paragraphs_nwords', 'count'] on pandas 1.x and 2.x alike; the earlier
# rename mapping ended up with two 'count' columns on pandas >= 2.0.
news_text_stats = (
    df['paragraphs_nwords']
    .value_counts()
    .rename_axis('paragraphs_nwords')
    .reset_index(name='count')
)

max_nwords = news_text_stats['paragraphs_nwords'].max()

# Predefined edges strictly below the maximum, closed by the maximum itself. The strict
# comparison avoids a duplicated last edge (which pd.cut rejects) when the maximum
# coincides with one of the predefined edges.
n_tokens_bins = \
    [b for b in [0, 5, 10, 50, 100, 250, 500, 1000, 2000, 5000] if b < max_nwords] + \
    [max_nwords]

news_text_stats['paragraphs_nwords_bins'] = pd.cut(news_text_stats['paragraphs_nwords'],
                                                   n_tokens_bins, precision=1)

# Number of records per bin; observed=False keeps empty bins in the result
grouped = (
    news_text_stats
    .groupby('paragraphs_nwords_bins', observed=False)['count']
    .sum()
    .reset_index(name='number_of_records')
)

grouped['pct_of_records'] = (grouped['number_of_records'] / news_text_stats['count'].sum()).round(2)
grouped['paragraphs_nwords_bins'] = grouped['paragraphs_nwords_bins'].astype(str)

fig = px.bar(grouped, x='paragraphs_nwords_bins', y='pct_of_records', height=350,
             text='pct_of_records',
             title='PCT of records having number of words in news within the corresponding bin')
fig.show()

In [None]:
# Rows whose body has at most 20 words
mask = scraping_analysis_filtered['paragraphs_nwords'] <= 20

print(scraping_analysis_filtered[mask].shape)

# First 200 of those rows, shortest bodies first
scraping_analysis_filtered[mask].sort_values('paragraphs_nwords').head(200)#['paragraphs'].tolist()

---
# Appendix - Jad's original code to scrape news
- doesn't work in a Jupyter environment
---

In [None]:
class news_collection(scrapy.Spider):
    """Original spider kept for reference: prints og meta tags and paragraph texts of each page.

    The URLs to crawl are passed through the ``urls`` keyword argument.
    """
    name = "news_collection"

    def __init__(self, *args, **kwargs):
        """Take the start URLs from the ``urls`` keyword argument."""
        super(news_collection, self).__init__(*args, **kwargs)
        self.start_urls = kwargs.get("urls")
        # NOTE(review): this local is never read, so the 10 s timeout does not appear to be applied -- confirm.
        meta = {'download_timeout': 10}


    def parse(self, response):
        """Collect og:* meta tags and <p> texts from ``response`` and print them."""

        # Serialised <meta ...> elements that carry a `property` attribute
        tags = response.xpath("//meta[@property]").extract()
        final_tags = dict()
        for tag in tags:

            if len(re.findall("og:", tag)) > 0:

                # Skip tags that lack either property="og:..." or content="..."
                if re.search("property=\"og:", tag) is None or re.search("content=\"", tag) is None:
                    continue

                # Offsets into the serialised tag: the property name starts right after `property="og:`
                prop_first = re.search("property=\"og:", tag).span()[1]
                cont = re.search("content=\"", tag).span()[1]
                # Assumes `content="` directly follows the property value (closing quote + space = 2 chars)
                prop_end = re.search("content=\"", tag).span()[0] - 2

                cont_first = cont
                # Assumes the content value is followed only by the last 2 characters of the tag (presumably `">`)
                cont_end = len(tag)-2

                proper = tag[prop_first:prop_end]
                content = tag[cont_first:cont_end]
                final_tags[proper] = str(content)

        # Prefer paragraphs inside <article>; otherwise take every <p> of the page
        if response.xpath('//article'):
            paragraphs = response.xpath('//article').css('p::text').getall()
        else:
            paragraphs = response.css('p::text').getall()

        print("\n======================")
        print(f"Tags: {final_tags}")
        print(f"Body: {paragraphs}")
        # print(f"URL: {URL}")
        
#         with open('/tmp/urls_list.json', 'r') as r:
#             data = json.load(r)

#         url = response.request.url
#         for item in data:
#             print(item)
#             if item['URL'] == url:
#                 item['Tags'] = final_tags
#                 item['Body'] = paragraphs
#                 item['URL'] = url
#                 with open('/tmp/urls_list.json', 'w') as a:
#                     json.dump(data, a, indent=2)

In [None]:
# Safety switch: the crawl below only starts when this flag is set to True.
run_scraper = False

if run_scraper:
    process = CrawlerProcess()
    a = process.crawl(news_collection, urls=URLS)
    # NOTE(review): with stop_after_crawl=False the reactor is presumably left running once the
    # crawl is done, so this call would not return on its own -- confirm.
    process.start(stop_after_crawl=False)