# Task 2

# Document Indexing

In [1]:
import os
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re

# Read all files and parse doc ID and text

In [2]:
def stem_text(text, ps):
    stemmed = ' '.join([ps.stem(word) for word in text.split()])
    return stemmed

In [3]:
text_map = {}
ps = PorterStemmer()

def parse_file(file_path):
    current_doc_no = None
    reading_text = False
    doc_text = ""
    
    # parse the doc to get doc no and corresponding text
    with open(file_path, 'r', encoding='ISO-8859-1', errors='ignore') as file:
        lines = file.readlines()

    for line in lines:
        if "<DOCNO>" in line:
            current_doc_no = line.strip().replace('<DOCNO>', '').replace('</DOCNO>', '')
        elif "<TEXT>" in line:
            reading_text = True
        elif "</TEXT>" in line:
            reading_text = False
        elif reading_text:
            doc_text += line.strip() + ' '
        elif "</DOC>" in line:
            if current_doc_no is not None:
                stemmed_text = stem_text(doc_text, ps)
                text_map[current_doc_no.strip()] = stemmed_text.strip()
                doc_text = "" 

In [38]:
folder = "./hw1-pramathabhat-main/IR_data/AP_DATA/ap89_collection"

for filename in os.listdir(folder):
    if filename != 'readme':
        file_path = os.path.join(folder, filename)
        parse_file(file_path)
        
print("Parsing completed")

Parsing completed


In [39]:
#total no of docs
print(len(text_map))

84678


In [40]:
#save doc id into a list
docnos = list(text_map.keys())
docnos[0]

first_doc_value = next(iter(text_map.values()), None)

first_doc_value

"the celluloid torch ha been pass to a new generation: filmmak who grew up in the 1960s. ``platoon,'' ``run on empty,'' ``1969'' and ``mississippi burning'' are among the movi releas in the past two year from writer and director who brought their own experi of that turbul decad to the screen. ``the contemporari of the '60 are some of the filmmak of the '80s. it' natural,'' said robert friedman, the senior vice presid of worldwid advertis and public at warner bros. chri gerolmo, who wrote the screenplay for ``mississippi burning,'' note that the sheer passag of time ha allow him and other to express their feel about the decade. ``distanc is important,'' he said. ``i believ there' a lot of think about that time and america in general.'' the vietnam war wa a defin experi for mani peopl in the '60s, shatter the consensu that the unit state had a right, even a moral duti to interven in conflict around the world. even today, politician talk disparagingli of the ``vietnam syndrome'' in refer 

# Read stopwords into list

In [41]:
sw_path = "./hw1-pramathabhat-main/config/stoplist.txt"

with open(sw_path) as file:
    stopwords = file.read().splitlines()

print(len(stopwords))
#for i, stopword in enumerate(stopwords, start=1):
#    print(f"{i}. {stopword}")

418


# Remove stop words and punctuations from content

In [42]:
import string

def process_content(text):
    text = ' '.join([word.lower() for word in text.split() if word.lower() not in stopwords])
    text = text.translate(str.maketrans("", "", string.punctuation))
    return text

In [43]:
processed_text_map = {doc_id: process_content(content) for doc_id, content in text_map.items()}

docnos = list(processed_text_map.keys())
first_doc_id = docnos[0]
first_doc_processed_content = processed_text_map[first_doc_id]

print(f"Document ID: {first_doc_id}")
print(f"Processed Content of the First Document: {first_doc_processed_content}")

Document ID: AP890101-0001
Processed Content of the First Document: celluloid torch ha pass new generation: filmmak grew 1960s. ``platoon,'' ``run empty,'' ``1969'' ``mississippi burning'' movi releas past two writer director brought experi turbul decad screen. ``the contemporari '60 filmmak '80s. it' natural,'' robert friedman, senior vice presid worldwid advertis public warner bros. chri gerolmo, wrote screenplay ``mississippi burning,'' note sheer passag time ha allow express feel decade. ``distanc important,'' said. ``i believ there' lot think time america general.'' vietnam war wa defin experi mani peopl '60s, shatter consensu unit state right, moral duti interven conflict world. today, politician talk disparagingli ``vietnam syndrome'' refer country' reluct militari forc settl disputes. ``i think futur historian talk vietnam near destruct american society,'' uri brofenbrenner, professor sociolog cornel university. ``in world war ii, knew fight for, vietnam.'' ``full metal jacket,

# ElasticSearch

In [44]:
from elasticsearch7 import Elasticsearch

es = Elasticsearch("http://localhost:9200")

print(es.ping())

True


# Create index

In [45]:
index_name = "ap89_data4"

configurations = {
    "settings" : {
        "number_of_shards": 1,
        "number_of_replicas": 1,
        "analysis": {
            "filter": {
                "english_stop": {
                    "type": "stop",
                    "stopwords": stopwords
                }
            },
            "analyzer": {
                "stopped": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase",
                        "english_stop"
                    ]
                }
            }
      }
    },
    "mappings": {
        "properties": {
            "content": {
                "type": "text",
                "fielddata": True,
                "analyzer": "stopped",
                "index_options": "positions"
            }
        }
    }
}

In [17]:
es.indices.delete(index=index_name)

NotFoundError: NotFoundError(404, 'index_not_found_exception', 'no such index [ap89_data1]', ap89_data1, index_or_alias)

In [46]:
es.indices.create(index=index_name, body=configurations)

  es.indices.create(index=index_name, body=configurations)


{'acknowledged': True, 'shards_acknowledged': True, 'index': 'ap89_data3'}

In [47]:
def add_data(_id, text):
    es.index(index=index_name, document={'content': text}, id = _id)

In [48]:
for key in processed_text_map:
    add_data(key, processed_text_map[key])
    
print("All documents have been added to the index")

All documents have been added to the index
