Importing necesssary libraries

In [2]:
from elasticsearch import Elasticsearch
from bs4 import BeautifulSoup
import spacy
import os
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from geopy.geocoders import Nominatim


In [3]:
spacy.cli.download("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


defining an es object

In [4]:

nlp = spacy.load('en_core_web_sm')


Creating the mapping and the setting for the index

In [63]:
index_mapping = {
    "mappings": {
        "properties": {
            "title": {
                "type": "text",
                "analyzer": "autocomplete",
            },
            "content": {
                "type": "text",
            },
            "authors": {
                "type": "nested",
                "properties": {
                    "name": {"type": "text"},
                },
            },
            "date": {
                "type": "date",
            },
            "geopoint": {
                "type": "geo_point",
            },
            "temporalExpressions": {
                "type": "text",
            },
            "georeferences": {
                "type": "text",
            },
        }
    },
    "settings": {
        "analysis": {
            "analyzer": {
                "autocomplete": {
                    "tokenizer": "autocomplete",
                    "filter": ["lowercase"]
                },
                "autocomplete_search": {
                    "tokenizer": "lowercase"
                }
            },
            "tokenizer": {
                "autocomplete": {
                    "type": "edge_ngram",
                    "min_gram": 1,
                    "max_gram": 25,
                    "token_chars": ["letter", "digit"]
                }
            }
        }
    }
}
es = Elasticsearch(['http://localhost:9200/'])
index_name = "my_index"
es.indices.create(index=index_name, body=index_mapping)


BadRequestError: BadRequestError(400, 'resource_already_exists_exception', 'index [my_index/U1GMUIXcSfytVbOgsMyQfQ] already exists')

Function to extract the of publishing the article

In [56]:
def extract_date_of_publish(reuters_tag):
    date_tags = reuters_tag.find_all('date')
    
    if date_tags:
        date_tag = date_tags[0]
        date_str = date_tag.text.strip()
        
        
        date_obj = datetime.strptime(date_str, "%d-%b-%Y %H:%M:%S.%f")


    return date_obj
    

Function to extract the title


In [7]:
def extract_article_title(reuters_tag):
    title_tag = reuters_tag.find('title')
    title = title_tag.text.strip() if title_tag else None
    return title


Function to extract the temporal expressions in the article

In [70]:
def extract_temporal_expressions(tag):
    body_tag = tag.find('body')
    
    if body_tag:
        content = body_tag.text.strip()
        doc = nlp(content)
        temporal_expressions = [ent.text for ent in doc.ents if ent.label_ == 'DATE']
        return temporal_expressions
    else:
        return None 


Function to extract the authors of the article

In [60]:
def extract_authors(article_tag):
    authors_tags = article_tag.find_all('author')
    
    authors_list = []
    for author_tag in authors_tags:
        author_name = author_tag.text.strip()
        authors_list.append({"name": author_name})
    
    return authors_list if authors_list else None


Function to extract the georeferences in the article

In [10]:
def extract_georeferences(tag):
    content_tag = tag.find('body')
    content = content_tag.text.strip() if content_tag else None

    if content:
        doc = nlp(content)
        georeferences = [ent.text for ent in doc.ents if ent.label_ in ['GPE', 'LOC']]
        return georeferences
    else:
        return 

Function to extract and pre proccesing the content

In [11]:
def preprocess_body(article_tag):
    body_tag = article_tag.find('body')
    
    body_text = BeautifulSoup(str(body_tag), 'html.parser').get_text()

    # tokenize the content
    words = re.findall(r'\b\w+\b', body_text.lower())

    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words and len(word) >= 3]

    porter = PorterStemmer()
    stemmed_words = [porter.stem(word) for word in filtered_words]

    # join the processed words back into a string
    processed_body = ' '.join(stemmed_words)

    return processed_body
  

Function to convert the geo references into geopoints

In [12]:
def extract_geopoints(georeferences):
    geolocator = Nominatim(user_agent="your_user_agent_here")
    coordinates = []

    for georef in georeferences:
        if georef and isinstance(georef, str):
            location = geolocator.geocode(georef)
            if location and hasattr(location, 'latitude') and hasattr(location, 'longitude'):
                coordinates.append((location.latitude, location.longitude))

    return coordinates


Function to extract and index all attributes


In [66]:
def index_document(title, content, authors, date, geopoint, temporal_expressions, georeferences):
    try:
        # Ensure that required fields have valid values before indexing
        if title and content and date and geopoint:
            document = {
                'title': title,
                'content': content,
                'authors': authors,
                'date': date,
                'geopoint': geopoint,
                'temporalExpressions': temporal_expressions,
                'georeferences': georeferences
            }

            # Index the document
            es.index(index='test_index', body=document)
            print(f"Document indexed successfully: {title}")
        else:
            print("Skipping document due to missing required fields.")
    except Exception as e:
        print(f"Error indexing document: {e}")

# Example usage:
# index_document("Sample Title", "Sample Content", ["Author1", "Author2"], "2023-01-01", "40.7128,-74.0060", "Some Temporal Expressions", "Some Georeferences")


In [71]:
def process_sgm_folder(folder_path):
    for filename in os.listdir(folder_path):
        if filename.endswith(".sgm"):
            file_path = os.path.join(folder_path, filename)
            process_sgm_file(file_path)

def process_sgm_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
        
    soup = BeautifulSoup(content, 'html.parser')
    reuters_tags = soup.find_all('reuters')
    for reuters_tag in reuters_tags:      
        date=extract_date_of_publish(reuters_tag)
        print(date)
        authors=extract_authors(reuters_tag)
        print(authors)
        title = extract_article_title(reuters_tag)
        print(title)
        body = preprocess_body(reuters_tag)
        print(body)
        georeferences = extract_georeferences(reuters_tag)
        print(georeferences)
        temporal_expressions = extract_temporal_expressions(reuters_tag)
        if georeferences is not None:
            geopoints = extract_geopoints(georeferences)
            print(geopoints)
        
        #index_document(title, body, authors, date, geopoints, temporal_expressions, georeferences)        

data = "test_data"
process_sgm_folder(data)


1987-02-26 15:01:01.790000
None
BAHIA COCOA REVIEW
shower continu throughout week bahia cocoa zone allevi drought sinc earli januari improv prospect come temporao although normal humid level restor comissaria smith said weekli review dri period mean temporao late year arriv week end februari 155 221 bag kilo make cumul total season mln stage last year seem cocoa deliv earlier consign includ arriv figur comissaria smith said still doubt much old crop cocoa still avail harvest practic come end total bahia crop estim around mln bag sale stand almost mln hundr thousand bag still hand farmer middlemen export processor doubt much cocoa would fit export shipper experienc dificulti obtain bahia superior certif view lower qualiti recent week farmer sold good part cocoa held consign comissaria smith said spot bean price rose 340 350 cruzado per arroba kilo bean shipper reluct offer nearbi shipment limit sale book march shipment 750 780 dlr per tonn port name new crop sale also light open port ju