Importing necesssary libraries

In [39]:
from elasticsearch import Elasticsearch
from bs4 import BeautifulSoup
import spacy
import os
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from geopy.geocoders import Nominatim
from datetime import timedelta

defining an es object

In [2]:

nlp = spacy.load('en_core_web_sm')
es = Elasticsearch(['http://localhost:9200/'])


Creating the mapping and the setting for the index

In [20]:
index_mapping = {
    "mappings": {
        "properties": {
            "title": {
                "type": "text",
                "analyzer": "autocomplete",
            },
            "content": {
                "type": "text",
            },
            "authors": {
                "type": "nested",
                "properties": {
                    "name": {"type": "text"},
                },
            },
            "date": {
                "type": "date",
            },
            "geopoint": {
                "type": "geo_point",
            },
            "temporalExpressions": {
                "type": "text",
            },
            "georeferences": {
                "type": "text",
            },
        }
    },
    "settings": {
        "analysis": {
            "analyzer": {
                "autocomplete": {
                    "tokenizer": "autocomplete",
                    "filter": ["lowercase"]
                },
                "autocomplete_search": {
                    "tokenizer": "lowercase"
                }
            },
            "tokenizer": {
                "autocomplete": {
                    "type": "edge_ngram",
                    "min_gram": 1,
                    "max_gram": 25,
                    "token_chars": ["letter", "digit"]
                }
            }
        }
    }
}
index_name = "test"
es.indices.create(index=index_name, body=index_mapping)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'test'})

Function to extract the of publishing the article

In [3]:
from datetime import datetime

def extract_date_of_publish(reuters_tag):
    date_obj = None
    
    date_tags = reuters_tag.find_all('date')
    
    if date_tags:
        date_tag = date_tags[0]
        date_str = date_tag.text.strip()

        try:
            date_obj = datetime.strptime(date_str, "%d-%b-%Y %H:%M:%S.%f")
        except ValueError:
            # Handle the case where the date string is not in the expected format
            print(f"Error: Unable to parse date string '{date_str}'")
    
    return date_obj


Function to extract the title


In [4]:
def extract_article_title(reuters_tag):
    title_tag = reuters_tag.find('title')
    title = title_tag.text.strip() if title_tag else None
    return title


Function to extract the temporal expressions in the article

In [5]:
def extract_temporal_expressions(tag):
    body_tag = tag.find('body')
    
    if body_tag:
        content = body_tag.text.strip()
        doc = nlp(content)
        temporal_expressions = [ent.text for ent in doc.ents if ent.label_ == 'DATE']
        return temporal_expressions
    else:
        return None 


Function to extract the authors of the article

In [6]:
def extract_authors(article_tag):
    authors_tags = article_tag.find_all('author')
    
    authors_list = []
    for author_tag in authors_tags:
        author_name = author_tag.text.strip()
        authors_list.append({"name": author_name})
    
    return authors_list if authors_list else None


Function to extract the georeferences in the article

In [5]:
def extract_georeferences(tag):
    content_tag = tag.find('body')
    content = content_tag.text.strip() if content_tag else None

    if content:
        doc = nlp(content)
        georeferences = [ent.text for ent in doc.ents if ent.label_ in ['GPE', 'LOC']]
        return georeferences
    else:
        return 

Function to extract and pre proccesing the content

In [7]:
def preprocess_body(article_tag):
    body_tag = article_tag.find('body')
    
    body_text = BeautifulSoup(str(body_tag), 'html.parser').get_text()

    # tokenize the content
    words = re.findall(r'\b\w+\b', body_text.lower())

    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words and len(word) >= 3]

    porter = PorterStemmer()
    stemmed_words = [porter.stem(word) for word in filtered_words]

    # join the processed words back into a string
    processed_body = ' '.join(stemmed_words)

    return processed_body
  

Function to convert the geo references into geopoints

In [8]:
def extract_geopoints(georeferences):
    geolocator = Nominatim(user_agent="your_user_agent_here")
    coordinates = []

    for georef in georeferences:
        if georef and isinstance(georef, str):
            location = geolocator.geocode(georef)
            if location and hasattr(location, 'latitude') and hasattr(location, 'longitude'):
                coordinates.append({
                    "lat": location.latitude,
                    "lon": location.longitude
                })

    return coordinates


Function to extract and index all attributes


In [9]:
def index_document(title, content, authors, date, geopoint, temporal_expressions, georeferences):
    try:
        # Ensure that required fields have valid values before indexing
        if title and content:
            document = {
                'title': title,
                'content': content,
                'authors': authors,
                'date': date,
                'geopoint': geopoint,
                'temporalExpressions': temporal_expressions,
                'georeferences': georeferences
            }

            # Index the document
            es.index(index='test', body=document)
            print(f"Document indexed successfully: {title}")
        else:
            print("Skipping document due to missing required fields.")
    except Exception as e:
        print(f"Error indexing document: {e}")



In [22]:
def process_sgm_folder(folder_path):
    for filename in os.listdir(folder_path):
        if filename.endswith(".sgm"):
            file_path = os.path.join(folder_path, filename)
            process_sgm_file(file_path)

def process_sgm_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
        
    soup = BeautifulSoup(content, 'html.parser')
    reuters_tags = soup.find_all('reuters')
    for reuters_tag in reuters_tags:      
        date=extract_date_of_publish(reuters_tag)
        authors=extract_authors(reuters_tag)
        title = extract_article_title(reuters_tag)
        body = preprocess_body(reuters_tag)
        georeferences = extract_georeferences(reuters_tag)
        temporal_expressions = extract_temporal_expressions(reuters_tag)
        if georeferences is not None:
            geopoints = extract_geopoints(georeferences)
        
        index_document(title, body, authors, date, geopoints, temporal_expressions, georeferences)        

data = "data"
process_sgm_folder(data)


Document indexed successfully: BAHIA COCOA REVIEW
Document indexed successfully: STANDARD OIL <SRD> TO FORM FINANCIAL UNIT
Document indexed successfully: TEXAS COMMERCE BANCSHARES <TCB> FILES PLAN
Document indexed successfully: TALKING POINT/BANKAMERICA <BAC> EQUITY OFFER
Document indexed successfully: NATIONAL AVERAGE PRICES FOR FARMER-OWNED RESERVE
Document indexed successfully: ARGENTINE 1986/87 GRAIN/OILSEED REGISTRATIONS
Document indexed successfully: RED LION INNS FILES PLANS OFFERING
Document indexed successfully: USX <X> DEBT DOWGRADED BY MOODY'S
Document indexed successfully: CHAMPION PRODUCTS <CH> APPROVES STOCK SPLIT
Document indexed successfully: COMPUTER TERMINAL SYSTEMS <CPML> COMPLETES SALE
Document indexed successfully: COBANCO INC <CBCO> YEAR NET
Document indexed successfully: OHIO MATTRESS <OMT> MAY HAVE LOWER 1ST QTR NET
Document indexed successfully: AM INTERNATIONAL INC <AM> 2ND QTR JAN 31
Document indexed successfully: BROWN-FORMAN INC <BFD> 4TH QTR NET
Document i

KeyboardInterrupt: 

In [234]:
from spellchecker import SpellChecker

spell_checker = SpellChecker()

# Function to correct spelling in a sentence
def correct_spelling(sentence):
    corrected_words = [spell_checker.correction(word) for word in sentence.split()]
    return ' '.join(corrected_words)

# Example user input
user_input = "coce cola"

# Correct spelling in the user input
corrected_query = correct_spelling(user_input)

# Check if the corrected query is at least three characters
if len(corrected_query) >= 3:
    autocomplete_query = {
        "query": {
            "match": {
                "title": {
                    "query": corrected_query,
                    "fuzziness": "AUTO",  # Enable fuzzy matching
                    "prefix_length": 3,   # Adjust as needed
                }
            }
        }
    }

    autocomplete_result = es.search(index="test", body=autocomplete_query)

    # Print ranked titles and scores
    for idx, hit in enumerate(autocomplete_result["hits"]["hits"], start=1):
        title = hit["_source"]["title"]
        score = hit["_score"]

        print(f"{idx}- Title: {title}, Score: {score}")
else:
    print("User input is less than three characters. No autocomplete suggestions.")


1- Title: COCA COLA <KO> UNIT AND WORLD FILM IN VENTURE, Score: 33.69805
2- Title: BAHIA COCOA REVIEW, Score: 23.156023
3- Title: COLECO INDUSTRIES INC <CLO> 4TH QTR, Score: 19.444569
4- Title: INDONESIAN TEA, COCOA EXPORTS SEEN UP, COFFEE DOWN, Score: 18.78078
5- Title: COLECO INDUSTRIES <CLC> SEES PROFIT IN 1987, Score: 17.946524
6- Title: COLUMBIA GAS SYSTEM INC <CG> REDEEMS DEBENTURES, Score: 17.190845
7- Title: CORADIAN CORP <CDIN> 4TH QTR NET, Score: 6.8717756
8- Title: COMPUTER TERMINAL SYSTEMS <CPML> COMPLETES SALE, Score: 6.1670065
9- Title: COMPANIES SET BID FOR CANADA HELICOPTER CONTRACT, Score: 6.0839787
10- Title: FLUOR <FLR> UNIT GETS CONSTRUCTION CONTRACT, Score: 6.0172753


In [20]:
# Define the Elasticsearch aggregation query
aggregation_query = {
    "aggs": {
        "top_georeferences": {
            "terms": {
                "field": "georeferences",
                "size": 10  # Return top 10 georeferences
            }
        }
    }
}

# Execute the aggregation query
aggregation_result = es.search(index="test", body=aggregation_query)
aggregation_result["aggregations"]["top_georeferences"]["buckets"]


BadRequestError: BadRequestError(400, 'search_phase_execution_exception', 'Fielddata is disabled on [georeferences] in [test]. Text fields are not optimised for operations that require per-document field data like aggregations and sorting, so these operations are disabled by default. Please use a keyword field instead. Alternatively, set fielddata=true on [georeferences] in order to load field data by uninverting the inverted index. Note that this can use significant memory.')

In [21]:
# Define the Elasticsearch date histogram aggregation query
date_histogram_query = {
    "aggs": {
        "documents_over_time": {
            "date_histogram": {
                "field": "date",
                "calendar_interval": "1d"  # Aggregation interval of 1 day
            }
        }
    }
}

# Execute the date histogram aggregation query
date_histogram_result = es.search(index="your_index_name", body=date_histogram_query)
date_histogram_result["aggregations"]["documents_over_time"]["buckets"]


NotFoundError: NotFoundError(404, 'index_not_found_exception', 'no such index [your_index_name]', your_index_name, index_or_alias)