<a href="https://colab.research.google.com/github/sahasbelbase/Information-Retrieval/blob/main/week3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# corpus files stored on Drive can be read with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
import os
import re
import math
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [8]:
# English stop-word list from NLTK, stored as a set for fast membership tests.
STOP_WORDS = set(stopwords.words('english'))
# Single shared WordNet lemmatizer instance reused by the text-processing functions.
WORD_LEMMATIZER = WordNetLemmatizer()

In [9]:
def read_documents_from_folder(folder):
    """Load every ``.txt`` file located directly inside ``folder``.

    Entries are visited in sorted filename order so that repeated runs return
    the documents in the same order; ``os.listdir`` on its own returns names
    in arbitrary order. Directory entries whose name happens to end in
    ``.txt`` are skipped instead of crashing ``open``.

    Args:
        folder: Path of the directory to scan (sub-directories are not
            descended into).

    Returns:
        A ``(filenames, texts)`` tuple of two parallel lists: the file names
        (without the folder prefix) and the corresponding file contents
        decoded as UTF-8.
    """
    all_text = []
    all_filenames = []

    for file_name in sorted(os.listdir(folder)):
        file_path = os.path.join(folder, file_name)
        if not file_name.endswith(".txt") or not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            file_content = file.read()
        # Append to both lists only after a successful read so they stay aligned.
        all_filenames.append(file_name)
        all_text.append(file_content)

    return all_filenames, all_text

In [10]:
def process_document_text(text_content):
    """Turn raw document text into a list of lemmatized, stop-word-free tokens.

    Pipeline: lowercase -> delete every character that is not a letter, digit
    or whitespace -> delete digit runs -> tokenize -> drop tokens found in
    ``STOP_WORDS`` -> lemmatize the rest.

    NOTE(review): removed characters are deleted, not replaced by a space, so
    hyphenated words are fused into one token and apostrophes are stripped
    before the stop-word check runs.

    Args:
        text_content: The raw text of one document.

    Returns:
        A list of lemmatized tokens in their original order.
    """
    lowered = text_content.lower()
    without_symbols = re.sub(r"[^a-zA-Z0-9\s]", "", lowered)
    cleaned = re.sub(r"\d+", "", without_symbols)

    lemmas = []
    for token in word_tokenize(cleaned):
        if token in STOP_WORDS:
            continue
        lemmas.append(WORD_LEMMATIZER.lemmatize(token))
    return lemmas

In [11]:
def process_query_text(query_content):
    """Turn a raw query string into a list of lemmatized tokens.

    Applies the same cleaning as the document pipeline (lowercase, delete
    non-alphanumeric/non-whitespace characters, delete digit runs, tokenize,
    lemmatize) but does not filter out stop words.

    Args:
        query_content: The raw query string.

    Returns:
        A list of lemmatized tokens in their original order.
    """
    lowered = query_content.lower()
    without_symbols = re.sub(r"[^a-zA-Z0-9\s]", "", lowered)
    cleaned = re.sub(r"\d+", "", without_symbols)

    lemmas = []
    for token in word_tokenize(cleaned):
        lemmas.append(WORD_LEMMATIZER.lemmatize(token))
    return lemmas

In [12]:
# Sanity check: list the Drive folder to confirm the corpus .txt files are present.
print(os.listdir('/content/drive/MyDrive/Westcliff/IR'))

['week3.ipynb', 'week_2__sahas.ipynb', 'WaronRussiaandUkrain.txt', 'WarRUs.txt', 'Ukrain.txt', 'GlobalView.txt', 'Putin.txt', 'HelloRussia.txt']


In [13]:
# Location of the corpus on the mounted Drive.
folder_path = '/content/drive/MyDrive/Westcliff/IR'


# Read the raw documents, then tokenize/clean each one.
document_filenames, document_texts = read_documents_from_folder(folder_path)
processed_docs = list(map(process_document_text, document_texts))

# Vocabulary = every distinct token across the corpus; the sorted copy gives
# the TF-IDF vectors a fixed dimension order.
word_vocab = {token for tokens in processed_docs for token in tokens}
sorted_vocab = sorted(word_vocab)

print(word_vocab)


{'mineral', 'say', 'unsuccessful', 'opposition', 'towards', 'energy', 'destroyed', 'battalion', 'january', 'kickstart', 'lavrov', 'estimate', 'april', 'die', 'adjacent', 'bank', 'back', 'assault', 'considering', 'highlight', 'live', 'settlement', 'nezalezhnosti', 'compounded', 'direct', 'formally', 'rafael', 'break', 'impose', 'valerii', 'handed', 'led', 'took', 'opted', 'south', 'twentyone', 'showed', 'explored', 'steinbach', 'water', 'described', 'displaced', 'pretext', 'division', 'commit', 'mainly', 'departed', 'use', 'disputed', 'announced', 'keir', 'get', 'consequence', 'caused', 'putin', 'borin', 'lukashenko', 'product', 'peacekeeping', 'subsequent', 'donetsk', 'stalled', 'sell', 'sector', 'pohorily', 'weve', 'agreement', 'greater', 'prepared', 'analyst', 'performance', 'kazakhstan', 'bombard', 'hirnyk', 'utilizes', 'public', 'note', 'skirmish', 'invasion', 'peninsula', 'part', 'stage', 'damage', 'called', 'shift', 'egmont', 'reached', 'approved', 'department', 'quickly', 'vulne

In [14]:
def calculate_term_frequency(word, document):
    """Return the relative frequency of ``word`` within ``document``.

    Args:
        word: The term to count.
        document: A sequence of tokens (anything supporting ``count`` and
            ``len``, e.g. a list of strings).

    Returns:
        ``document.count(word) / len(document)``, or ``0.0`` when the document
        is empty (an empty document contains no occurrences of any term, and
        dividing by its length would raise ``ZeroDivisionError``).
    """
    total_terms = len(document)
    if total_terms == 0:
        return 0.0
    return document.count(word) / total_terms

In [15]:
def calculate_inverse_document_frequency(word, all_docs):
    """Return the smoothed inverse document frequency of ``word``.

    The value is ``log(N / (1 + df))`` where ``N`` is the number of documents
    and ``df`` the number of documents containing ``word``. Because of the
    ``+ 1`` in the denominator the result is ``0`` when the word is missing
    from exactly one document and negative when it occurs in every document.

    Args:
        word: The term to score.
        all_docs: A sequence of tokenized documents (each supporting ``in``).

    Returns:
        The IDF as a float, or ``0.0`` for an empty corpus (where the formula
        would otherwise evaluate ``log(0)`` and raise ``ValueError``).
    """
    total_docs = len(all_docs)
    if total_docs == 0:
        return 0.0
    docs_with_word = sum(1 for doc in all_docs if word in doc)
    return math.log(total_docs / (1 + docs_with_word))

In [16]:
def generate_tfidf_vector(doc, all_docs, vocab):
    """Build the TF-IDF vector of ``doc`` over the terms in ``vocab``.

    Args:
        doc: Tokenized text to vectorize (a document or a query).
        all_docs: Tokenized corpus used to compute document frequencies.
        vocab: Iterable of terms; its iteration order defines the order of
            the vector components.

    Returns:
        A NumPy array with one ``tf * idf`` weight per vocabulary term.
    """
    weights = [
        calculate_term_frequency(term, doc)
        * calculate_inverse_document_frequency(term, all_docs)
        for term in vocab
    ]
    return np.array(weights)

In [17]:
def calculate_cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Args:
        vec1: First vector (array-like of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``. When either vector has
        zero norm (e.g. a query with no in-vocabulary term) the similarity is
        defined as ``0.0`` instead of dividing 0 by 0, which would produce
        ``nan`` plus a RuntimeWarning and make the scores unsortable.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product

In [18]:
# Free-text queries that the corpus documents are ranked against.
search_queries = [
    "Russia's invasion of Ukraine 2022",
    "Impact of Russia-Ukraine war on global trade",
    "Ukraine's counteroffensive efforts and Russia's response",
    "Sanctions imposed on Russia during Ukraine war",
    "Economic consequences of the war for Ukraine and Russia"
]

In [19]:
# Tokenize the queries with the query-specific pipeline.
processed_queries = list(map(process_query_text, search_queries))

# Documents and queries are both vectorized against the same corpus
# statistics and the same vocabulary order so their vectors are comparable.
document_tfidf_vectors = [
    generate_tfidf_vector(tokens, processed_docs, sorted_vocab)
    for tokens in processed_docs
]
query_tfidf_vectors = [
    generate_tfidf_vector(tokens, processed_docs, sorted_vocab)
    for tokens in processed_queries
]


In [20]:
# similarities_output[q][d] = cosine similarity between query q and document d,
# in the order of search_queries and document_filenames respectively.
similarities_output = []
for query_vector in query_tfidf_vectors:
    similarity_scores = [calculate_cosine_similarity(query_vector, doc_vector) for doc_vector in document_tfidf_vectors]
    similarities_output.append(similarity_scores)

# Human-readable report: one section per query, documents ranked best-first.
# The encoding is pinned to UTF-8 (matching how the corpus is read) so the
# report does not depend on the platform's default encoding.
output_filename = 'sahas_results.txt'
section_rule = "---------------------------------------------------------\n"
with open(output_filename, 'w', encoding='utf-8') as output_file:
    for query, query_scores in zip(search_queries, similarities_output):
        output_file.write("\n")
        output_file.write(section_rule)
        output_file.write(f"Similarities for query: \"{query}\":\n")
        output_file.write(section_rule)
        # Pair each file name with its score, then rank by score descending.
        doc_similarity_scores = list(zip(document_filenames, query_scores))
        sorted_doc_scores = sorted(doc_similarity_scores, key=lambda x: x[1], reverse=True)
        for doc_name, similarity_score in sorted_doc_scores:
            output_file.write(f"{doc_name}: {similarity_score:.4f}\n")

# Raw dump: one line per query containing the unsorted list of scores.
list_output_filename = 'similarity_output_list.txt'
with open(list_output_filename, 'w', encoding='utf-8') as list_output_file:
    for similarity_set in similarities_output:
        list_output_file.write(f"{similarity_set}\n")

# Echo the ranked report into the notebook output.
with open(output_filename, 'r', encoding='utf-8') as file:
    print(file.read())


---------------------------------------------------------
Similarities for query: "Russia's invasion of Ukraine 2022":
---------------------------------------------------------
HelloRussia.txt: 0.1438
WaronRussiaandUkrain.txt: 0.0911
WarRUs.txt: 0.0779
Ukrain.txt: 0.0499
GlobalView.txt: 0.0446
Putin.txt: 0.0178

---------------------------------------------------------
Similarities for query: "Impact of Russia-Ukraine war on global trade":
---------------------------------------------------------
GlobalView.txt: 0.5357
WaronRussiaandUkrain.txt: 0.0092
HelloRussia.txt: 0.0068
WarRUs.txt: 0.0000
Ukrain.txt: 0.0000
Putin.txt: 0.0000

---------------------------------------------------------
Similarities for query: "Ukraine's counteroffensive efforts and Russia's response":
---------------------------------------------------------
HelloRussia.txt: 0.1146
WarRUs.txt: 0.0248
WaronRussiaandUkrain.txt: 0.0094
Ukrain.txt: 0.0083
GlobalView.txt: 0.0057
Putin.txt: 0.0023

-----------------------