Importing necessary libraries

In [None]:
from elasticsearch import Elasticsearch
from bs4 import BeautifulSoup
import spacy
import os
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from geopy.geocoders import Nominatim


In [None]:
# One-time setup: uncomment and run once to download the spaCy model loaded below.
#spacy.cli.download("en_core_web_sm")

Defining the spaCy pipeline and the Elasticsearch client

In [None]:

# spaCy pipeline; the extraction functions below read named entities from its output.
nlp = spacy.load('en_core_web_sm')
# Elasticsearch client pointed at a node on localhost, port 9200.
es = Elasticsearch(['http://localhost:9200/'])


Creating the mappings and settings for the index

In [None]:
# Index definition: field mappings plus the custom analysis chain they rely on.
index_mapping = {
    "mappings": {
        "properties": {
            "title": {
                "type": "text",
                # Edge n-grams are produced at index time only. Queries are
                # analysed with the plain lowercase analyzer so that a search
                # term is not itself exploded into prefixes (which would make
                # "oil" match every title containing a word starting with "o").
                "analyzer": "autocomplete",
                "search_analyzer": "autocomplete_search",
            },
            "content": {
                "type": "text",
            },
            # One nested object per author, each carrying a `name`.
            "authors": {
                "type": "nested",
                "properties": {
                    "name": {"type": "text"},
                },
            },
            "date": {
                "type": "date",
            },
            "geopoint": {
                "type": "geo_point",
            },
            "temporalExpressions": {
                "type": "text",
            },
            "georeferences": {
                "type": "text",
            },
        }
    },
    "settings": {
        "analysis": {
            "analyzer": {
                # Index-time analyzer for `title`: lowercase edge n-grams.
                "autocomplete": {
                    "tokenizer": "autocomplete",
                    "filter": ["lowercase"]
                },
                # Search-time counterpart: lowercases, no n-grams.
                "autocomplete_search": {
                    "tokenizer": "lowercase"
                }
            },
            "tokenizer": {
                "autocomplete": {
                    "type": "edge_ngram",
                    "min_gram": 1,
                    "max_gram": 25,
                    "token_chars": ["letter", "digit"]
                }
            }
        }
    }
}
index_name = "new_index"

# Creating an index that already exists raises an error, which would break a
# "Restart & Run All". Only create it when it is missing.
if es.indices.exists(index=index_name):
    print(f"Index '{index_name}' already exists; keeping its current mappings and settings.")
else:
    es.indices.create(index=index_name, body=index_mapping)


Function to extract the publication date of the article

In [None]:
from datetime import datetime

def extract_date_of_publish(reuters_tag):
    """Return the article's publication timestamp, or ``None`` if unavailable.

    Only the first ``date`` element found under ``reuters_tag`` is considered.
    Its stripped text is parsed with the pattern ``%d-%b-%Y %H:%M:%S.%f``.
    When the text does not match that pattern, a message is printed and
    ``None`` is returned instead of raising.
    """
    matches = reuters_tag.find_all('date')
    if not matches:
        return None

    raw_date = matches[0].text.strip()
    try:
        return datetime.strptime(raw_date, "%d-%b-%Y %H:%M:%S.%f")
    except ValueError:
        # Unexpected layout: report it and hand back "no date".
        print(f"Error: Unable to parse date string '{raw_date}'")
        return None


Function to extract the title


In [None]:
def extract_article_title(reuters_tag):
    """Return the stripped text of the first ``title`` element, or ``None`` if there is none."""
    heading = reuters_tag.find('title')
    if not heading:
        return None
    return heading.text.strip()


Function to extract the temporal expressions in the article

In [None]:
def extract_temporal_expressions(tag):
    """Collect the text of every ``DATE`` entity found in the article body.

    The stripped text of the first ``body`` element is run through the
    module-level ``nlp`` pipeline. Returns a list of entity strings (possibly
    empty), or ``None`` when ``tag`` has no ``body`` element.
    """
    body = tag.find('body')
    if not body:
        return None

    parsed = nlp(body.text.strip())
    date_mentions = []
    for entity in parsed.ents:
        if entity.label_ == 'DATE':
            date_mentions.append(entity.text)
    return date_mentions


Function to extract the authors of the article

In [None]:
def extract_authors(article_tag):
    """Return one ``{"name": ...}`` dict per ``author`` element, or ``None`` if there are none.

    Each name is the stripped text of its element.
    """
    authors = [{"name": node.text.strip()} for node in article_tag.find_all('author')]
    return authors or None


Function to extract the georeferences in the article

In [None]:
def extract_georeferences(tag):
    """Collect the text of every ``GPE``/``LOC`` entity found in the article body.

    The stripped text of the first ``body`` element is run through the
    module-level ``nlp`` pipeline. Returns a list of entity strings (possibly
    empty), or ``None`` when the body is missing or blank.
    """
    body = tag.find('body')
    text = body.text.strip() if body else None
    if not text:
        return None

    place_labels = ('GPE', 'LOC')
    return [entity.text for entity in nlp(text).ents if entity.label_ in place_labels]

Function to extract and preprocess the content

In [None]:
def preprocess_body(article_tag):
    """Return the article body as a normalised, stemmed string of tokens.

    Processing steps, in order:
      1. take the text of the first ``body`` element;
      2. lowercase it and split it into ``\\w+`` word tokens;
      3. drop English stop words and tokens shorter than three characters;
      4. Porter-stem the remaining tokens and join them with single spaces.

    Args:
        article_tag: parsed element (e.g. a BeautifulSoup tag) to search for
            a ``body`` child.

    Returns:
        str: the processed body, or ``""`` when the article has no ``body``
        element, so callers can treat it as "no content".
    """
    body_tag = article_tag.find('body')
    if body_tag is None:
        # Without this guard str(None) == "None" was parsed as the body and the
        # function returned the bogus, truthy token "none".
        return ""

    # The tag is already parsed; no need to serialise and re-parse it.
    body_text = body_tag.get_text()

    # tokenize the content
    words = re.findall(r'\b\w+\b', body_text.lower())

    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words and len(word) >= 3]

    porter = PorterStemmer()
    stemmed_words = [porter.stem(word) for word in filtered_words]

    # join the processed words back into a string
    return ' '.join(stemmed_words)
  

Function to convert the georeferences into geopoints

In [None]:
def extract_geopoints(georeferences):
    """Geocode place names into ``{"lat": ..., "lon": ...}`` dicts.

    Args:
        georeferences: iterable of place-name strings. ``None`` or an empty
            iterable yields an empty result; entries that are empty or not
            strings are skipped.

    Returns:
        list[dict]: one coordinate dict per input name that could be resolved,
        in input order (a name that appears twice contributes twice). Names
        that cannot be resolved, or whose lookup fails, are left out.
    """
    if not georeferences:
        return []

    # Local import keeps the block self-contained; geopy is already a
    # dependency of this notebook.
    from geopy.exc import GeopyError

    # NOTE(review): "your_user_agent_here" is a placeholder. The public Nominatim
    # service expects an application-specific user agent and throttles clients,
    # so set a real value and consider geopy's RateLimiter before bulk runs.
    geolocator = Nominatim(user_agent="your_user_agent_here")
    coordinates = []
    lookups = {}  # place name -> geocoder result (None if unresolved or failed)

    for georef in georeferences:
        if not georef or not isinstance(georef, str):
            continue

        # Query the service only once per distinct name within this call.
        if georef not in lookups:
            try:
                lookups[georef] = geolocator.geocode(georef)
            except GeopyError as exc:
                # A timeout or service error for one name must not abort the
                # whole indexing run; report it and carry on.
                print(f"Error geocoding '{georef}': {exc}")
                lookups[georef] = None

        location = lookups[georef]
        if location and hasattr(location, 'latitude') and hasattr(location, 'longitude'):
            coordinates.append({
                "lat": location.latitude,
                "lon": location.longitude
            })

    return coordinates


Function to index a document with all extracted attributes


In [None]:
def index_document(title, content, authors, date, geopoint, temporal_expressions, georeferences, index='new_index'):
    """Index one article through the module-level ``es`` client.

    The document is only sent when both ``title`` and ``content`` are truthy;
    otherwise a "skipping" message is printed. Any exception raised while
    indexing is caught and printed, so one bad document does not stop a batch.

    Args:
        title: article title.
        content: processed article body.
        authors: list of ``{"name": ...}`` dicts, or ``None``.
        date: publication date, or ``None``.
        geopoint: list of ``{"lat": ..., "lon": ...}`` dicts, or ``None``.
        temporal_expressions: list of date-like phrases, or ``None``.
        georeferences: list of place names, or ``None``.
        index: name of the target index. Defaults to ``'new_index'``, the
            value that used to be hard-coded here.

    Returns:
        None
    """
    try:
        # Ensure that required fields have valid values before indexing
        if not (title and content):
            print("Skipping document due to missing required fields.")
            return

        document = {
            'title': title,
            'content': content,
            'authors': authors,
            'date': date,
            'geopoint': geopoint,
            'temporalExpressions': temporal_expressions,
            'georeferences': georeferences
        }

        # Index the document
        es.index(index=index, body=document)
        print(f"Document indexed successfully: {title}")
    except Exception as e:
        print(f"Error indexing document: {e}")

# Example usage (argument shapes match the index mapping and the extractors above):
# index_document("Sample Title", "Sample Content", [{"name": "Author1"}, {"name": "Author2"}], "2023-01-01", [{"lat": 40.7128, "lon": -74.0060}], ["last week"], ["New York"])


In [None]:
def process_sgm_folder(folder_path):
    """Run ``process_sgm_file`` on every entry of ``folder_path`` whose name ends in ``.sgm``.

    Entries are visited in the order ``os.listdir`` reports them; sub-folders
    are not descended into.
    """
    sgm_names = (name for name in os.listdir(folder_path) if name.endswith(".sgm"))
    for name in sgm_names:
        process_sgm_file(os.path.join(folder_path, name))

def process_sgm_file(file_path):
    """Parse one ``.sgm`` file and index every ``reuters`` element it contains.

    For each article the date, authors, title, processed body, georeferences,
    temporal expressions and geocoded points are extracted and handed to
    ``index_document``.

    Args:
        file_path: path of the file to read.

    Returns:
        None
    """
    # errors='replace': a stray non-UTF-8 byte becomes U+FFFD instead of raising
    # UnicodeDecodeError and losing every article in the file.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        content = file.read()

    soup = BeautifulSoup(content, 'html.parser')
    for reuters_tag in soup.find_all('reuters'):
        date = extract_date_of_publish(reuters_tag)
        authors = extract_authors(reuters_tag)
        title = extract_article_title(reuters_tag)
        body = preprocess_body(reuters_tag)
        georeferences = extract_georeferences(reuters_tag)
        temporal_expressions = extract_temporal_expressions(reuters_tag)

        # Always (re)bind geopoints for this article. Previously it was only
        # assigned when georeferences existed, so an article without any either
        # raised UnboundLocalError (first iteration) or silently reused the
        # coordinates of the previous article.
        if georeferences is not None:
            geopoints = extract_geopoints(georeferences)
        else:
            geopoints = None

        index_document(title, body, authors, date, geopoints, temporal_expressions, georeferences)

# Folder scanned for .sgm files; a relative path, so it is resolved against
# the current working directory.
data = "data"
# Parse and index every .sgm file found in that folder.
process_sgm_folder(data)
