Importing necessary libraries

In [30]:
from elasticsearch import Elasticsearch
from bs4 import BeautifulSoup
import spacy
import os

In [None]:
# Download the small English spaCy model only when it is not installed yet,
# so re-running the notebook does not fetch the package again.
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")

Defining the Elasticsearch client (`es`) and loading the spaCy model (`nlp`)

In [23]:
# Client for the Elasticsearch node on localhost:9200; other cells use `es`
# to create the index and to index documents.
es = Elasticsearch(['http://localhost:9200/'])
# spaCy pipeline whose named entities (labels DATE and GPE) are read by the
# extraction functions in this notebook.
nlp = spacy.load('en_core_web_sm')


Creating the mapping and the settings for the index

In [None]:
# Settings and field mappings for the article index.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "title": {"type": "text"},
            "content": {"type": "text"},
            # NOTE(review): process_sgm_file builds `authors` as a list of plain
            # strings, while this nested mapping expects objects with
            # first_name / last_name / email — confirm before indexing.
            "authors": {
                "type": "nested",
                "properties": {
                    "first_name": {"type": "text"},
                    "last_name": {"type": "text"},
                    "email": {"type": "keyword"}
                }
            },
            "date": {"type": "date"},
            "geopoint": {"type": "geo_point"},
            "temporalExpressions": {"type": "text"},
            "georeferences": {"type": "text"}
        }
    }
}

index_name = "test_index"

# Creating an index that already exists raises an error, so guard the call to
# keep this cell re-runnable.
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=index_settings)

Function to extract the publication date of the article

In [37]:
def extract_date_of_publish(reuters_tag):
    """Return the stripped text of the article's ``date`` element.

    Args:
        reuters_tag: Parsed article element; must provide ``find(name)``.

    Returns:
        The date text with surrounding whitespace removed, or ``None`` when
        no ``date`` element is found.
    """
    found = reuters_tag.find('date')
    if not found:
        return None
    return found.text.strip()


Function to extract the temporal expressions in the article

In [44]:
def extract_temporal_expressions(tag):
    """Collect the DATE entities found in the article body.

    Args:
        tag: Parsed article element; must provide ``find(name)``.

    Returns:
        A list with the text of every entity labelled ``DATE`` by the
        module-level ``nlp`` pipeline. An empty list is returned when the
        element has no ``body`` child.
    """
    body_tag = tag.find('body')
    if body_tag is None:
        # Without a body there is no text to analyse; passing None to the
        # pipeline would raise instead of yielding "no expressions".
        return []

    doc = nlp(body_tag.text.strip())
    return [ent.text for ent in doc.ents if ent.label_ == 'DATE']



Function to extract the georeferences in the article

In [46]:
def extract_georeferences(tag):
    """Collect the GPE entities found in the article's ``places`` element.

    The text of every ``d`` child of ``places`` is lower-cased, joined with
    spaces and run through the module-level ``nlp`` pipeline.

    Args:
        tag: Parsed article element; must provide ``find(name)``.

    Returns:
        A list with the text of every entity labelled ``GPE``. An empty list
        is returned when there is no ``places`` element or it lists no places.
    """
    places_tag = tag.find('places')
    if places_tag is None:
        # Nothing to analyse; passing None to the pipeline would raise.
        return []

    content = ' '.join(place.text.lower() for place in places_tag.find_all('d'))
    if not content:
        return []

    doc = nlp(content)
    return [ent.text for ent in doc.ents if ent.label_ == 'GPE']

Function to extract and index all attributes


In [59]:
from datetime import datetime

def index_document(title, content, authors, date, geopoint, temporal_expressions, georeferences, index='test_index'):
    """Build one article document and send it to Elasticsearch.

    Args:
        title: Article title.
        content: Article text.
        authors: Author information for the article.
        date: Publication date formatted like ``26-FEB-1987 15:01:01.79``,
            or a falsy value when unknown.
        geopoint: Value stored in the ``geopoint`` field.
        temporal_expressions: Values stored in ``temporalExpressions``.
        georeferences: Values stored in ``georeferences``.
        index: Name of the target index. Defaults to ``'test_index'``, the
            name that was previously hard-coded here.

    Raises:
        ValueError: If ``date`` is given but does not match the expected
            ``%d-%b-%Y %H:%M:%S.%f`` format.
    """
    # Convert the date to ISO 8601 format; a missing date is stored as None.
    iso_date = None
    if date:
        iso_date = datetime.strptime(date, '%d-%b-%Y %H:%M:%S.%f').isoformat()

    document = {
        'title': title,
        'content': content,
        'authors': authors,
        'date': iso_date,
        'geopoint': geopoint,
        'temporalExpressions': temporal_expressions,
        'georeferences': georeferences
    }

    # Index the document using the module-level `es` client.
    es.index(index=index, body=document)


In [87]:
def process_sgm_folder(folder_path):
    """Run ``process_sgm_file`` on every ``.sgm`` file directly inside a folder.

    Args:
        folder_path: Directory to scan; entries whose names do not end in
            ``.sgm`` are skipped.
    """
    sgm_names = [name for name in os.listdir(folder_path) if name.endswith(".sgm")]
    for sgm_name in sgm_names:
        process_sgm_file(os.path.join(folder_path, sgm_name))

def process_sgm_file(file_path):
    """Parse one ``.sgm`` file and print metadata for each article in it.

    For every ``reuters`` element the authors, publication date and geopoint
    are printed, one value per line; ``None`` is printed for a missing value.

    Args:
        file_path: Path of the ``.sgm`` file to read.
    """
    # errors='replace': a single byte that is not valid UTF-8 would otherwise
    # raise UnicodeDecodeError and abort processing of the whole folder.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        content = file.read()

    # NOTE(review): HTML parsers may treat a nested <BODY> element specially;
    # verify that reuters_tag.find('body') returns the article text with 'lxml'.
    soup = BeautifulSoup(content, 'lxml')

    for reuters_tag in soup.find_all('reuters'):
        author_tags = reuters_tag.find_all('author')
        authors = [author.text.strip() for author in author_tags] if author_tags else None
        print(authors)

        date = extract_date_of_publish(reuters_tag)
        print(date)

        geopoint_tag = reuters_tag.find('geopoint')
        geopoint = geopoint_tag.text.strip() if geopoint_tag else None
        print(geopoint)

        # TODO: indexing is still disabled. When enabling it, pass the article
        # element (not the raw file text) to the extractors, e.g.
        #     temporal_expressions = extract_temporal_expressions(reuters_tag)
        #     georeferences = extract_georeferences(reuters_tag)
        # and hand index_document the article's own title and body text.

# Folder containing the .sgm files to process.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
data = "C://Users//USER//Desktop//elasticSearch project//test_data"
# Print authors, date and geopoint for every article in every .sgm file of the folder.
process_sgm_folder(data)

None
26-FEB-1987 15:01:01.79
None
None
26-FEB-1987 15:02:20.00
None
None
26-FEB-1987 15:03:27.51
None
['by Janie Gabbett, Reuters']
26-FEB-1987 15:07:13.72
None
None
26-FEB-1987 15:10:44.60
None
None
26-FEB-1987 15:14:36.41
None
None
26-FEB-1987 15:14:42.83
None
None
26-FEB-1987 15:15:40.12
None
None
26-FEB-1987 15:17:11.20
None
None
26-FEB-1987 15:18:06.67
None
None
26-FEB-1987 15:18:59.34
None
None
26-FEB-1987 15:19:15.45
None
None
26-FEB-1987 15:20:13.09
None
None
26-FEB-1987 15:20:27.17
None
None
26-FEB-1987 15:20:48.43
None
['by Janie Gabbett, Reuters']
26-FEB-1987 15:21:16.13
None
None
26-FEB-1987 15:24:48.56
None
None
26-FEB-1987 15:26:26.78
None
None
26-FEB-1987 15:26:54.12
None
None
26-FEB-1987 15:32:03.12
None
None
26-FEB-1987 15:33:23.61
None
None
26-FEB-1987 15:34:07.03
None
None
26-FEB-1987 15:34:16.30
None
None
26-FEB-1987 15:35:16.67
None
None
26-FEB-1987 15:35:39.38
None
None
26-FEB-1987 15:36:44.78
None
None
26-FEB-1987 15:36:53.42
None
['By Michael Gelb, Reuters']
26-