In [1]:
#Importing Libraries
import os
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer 
import re
import string
from tqdm import tqdm

In [2]:
#Function to parse the documents
import chardet

def get_encoding_type(file_path):
    with open(file_path, 'rb') as file:
        raw_data = file.read()
    return chardet.detect(raw_data)['encoding']


def parse_file(file_path):
    extracted_dict = {}
    
    try:
        with open(file_path, 'r', encoding='ascii', errors='replace') as file:
            file_content = file.read()

        
        docs = re.findall(r'<DOC>(.*?)</DOC>', file_content, re.DOTALL)

       
        for doc in docs:
           
            docno = re.search(r'<DOCNO>(.*?)</DOCNO>', doc, re.DOTALL)
            if docno:
                docno = docno.group(1).strip()
                texts = re.findall(r'<TEXT>(.*?)</TEXT>', doc, re.DOTALL)
                texts = [text.strip() for text in texts]
                extracted_dict[docno] = texts
    except UnicodeDecodeError as e:
        print(f"Error reading {file_path}: {e}")
        return None          

    return extracted_dict

In [4]:
folder = "./ap89_collection"

text_map = {}

for filename in os.listdir(folder):
    if filename != 'readme':
        file_path = os.path.join(folder, filename)
        text_map.update(parse_file(file_path))
    
        

In [5]:
i = 0
for keys, values in text_map.items():
    print(keys)
    print(values)
    if(i == 5):
        break
    i += 1    

AP891220-0001
["Instead of collapsing when the United\nStates threw its military might against Gen. Manuel Antonio\nNoriega, his Defense Forces stubbornly resisted.\n   The elusive general the Bush administration had hoped to put\nbefore a federal judge in Florida on drug charges remains at large\nand in hiding despite an attack Wednesday by thousands of American\ntroops.\n   Many in Panama had thought the Defense Forces would simply\n``throw down their guns as soon as an American helicopter appeared\nover the barracks,'' as a Noriega opponent once put it.\n   In interviews prior to the attack, U.S. military leaders had\nclaimed Defense Forces troops would refuse to shed their blood for\nNoriega. Part of their evidence stemmed from the fact that disloyal\nofficers mounted two coups against him.\n   But the general, who sometimes seemed to escape by magic,\napparently retained enough loyalty for his men to put up a fight\nwhen the U.S. stroke came. From Defense Forces posts all over\nPa

In [6]:
len(text_map)

84678

In [7]:
def preprocess_text(text_map):
    # Initialize Porter Stemmer
    stemmer = PorterStemmer()
    
    with open('stoplist.txt', 'r') as file:
        stopwords = set(file.read().strip().splitlines())
    

    for key, texts in tqdm(text_map.items(), desc = "Processing"):
        preprocessed_texts = []
        for text in texts:

            tokens = word_tokenize(text)
            for word in tokens:
                if '-' in word:
                    hyphenated_words = word.split('-')
                    preprocessed_texts.extend(hyphenated_words)
                else:
                    preprocessed_texts.append(word)
                    
            preprocessed_texts = [stemmer.stem(word) for word in preprocessed_texts]    
            preprocessed_texts = [word for word in preprocessed_texts if word.lower() not in stopwords and word.strip(string.punctuation)]
   
        text_map[key] = ' '.join(preprocessed_texts)
    
    return text_map

In [None]:
preprocessed_text_map = preprocess_text(text_map)
print("Preprocessing Done")

In [None]:
#!pip install elasticsearch7

In [None]:
from elasticsearch7 import Elasticsearch
es = Elasticsearch("http://localhost:9200")
print(es.ping())

In [None]:
index_name = "ap89_data5"

configurations = {
    "settings" : {
        "number_of_shards": 1,
        "number_of_replicas": 1,
        "analysis": {
            "filter": {
                "english_stop": {
                    "type": "stop",
                    "stopwords_path": "my_stoplist.txt"
                }
            },
            "analyzer": {
                "stopped": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase",
                        "english_stop"
                    ]
                }
            }
      }
    },
    "mappings": {
        "properties": {
            "content": {
                "type": "text",
                "fielddata": True,
                "analyzer": "stopped",
                "index_options": "positions"
            }
        }
    }
}

In [None]:
es.indices.create(index = index_name, body = configurations)

In [None]:
def add_data(_id, text):
    
    es.index(
        index = index_name,
        document = {
            'content': text,
        }, id = _id)

In [None]:
for key in tqdm(preprocessed_text_map, desc = "Adding index"):
    add_data(key, preprocessed_text_map[key])
    
print("All documents are added to the index")    