<a href="https://colab.research.google.com/github/subarna007/streamlitProjects/blob/main/Week_3_Lab_Report_Information_Retrieval_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Importing Required Libraries

In [10]:
from google.colab import files  # For file uploads in Google Colab
import re                      # For regex-based tokenization
import math                    # For log calculations (IDF)
from typing import List, Dict, Tuple

# Step 2: Uploading and Reading Documents

In [11]:
# Uploading files: ask Colab for one or more files; the result is iterated
# by filename below.
uploaded = files.upload()

# Reading documents into a list: one string per uploaded file, in the
# iteration order of `uploaded`, decoded as UTF-8.
# NOTE(review): the upload log reports files being saved under
# "(1)"-suffixed names. Confirm that the keys of `uploaded` are the names
# actually written to disk; otherwise this loop may read an older copy.
documents = []
for filename in uploaded.keys():
    with open(filename, 'r', encoding='utf-8') as file:
        documents.append(file.read())

Saving Apple Mac Creator Dies.txt to Apple Mac Creator Dies (1).txt
Saving argonaut_games_revival.txt to argonaut_games_revival (1).txt
Saving Art Search Technology.txt to Art Search Technology (1).txt
Saving BBC Interactive Bafta Wins.txt to BBC Interactive Bafta Wins (1).txt
Saving Blog Popularity in the US.txt to Blog Popularity in the US (1).txt
Saving bush_site_geo_block.txt to bush_site_geo_block (1).txt
Saving christmas_gadget_trends.txt to christmas_gadget_trends (1).txt
Saving Color-to-Music Software.txt to Color-to-Music Software (1).txt
Saving decline_of_home_phones.txt to decline_of_home_phones (1).txt
Saving doom_3_golden_joystick.txt to doom_3_golden_joystick (1).txt
Saving Employee Monitoring Software.txt to Employee Monitoring Software (1).txt
Saving EU Software Laws Delay.txt to EU Software Laws Delay (1).txt
Saving Half-Life 2 Bafta Awards.txt to Half-Life 2 Bafta Awards (1).txt
Saving high_definition_dvd.txt to high_definition_dvd (1).txt
Saving id_fraud_shadowcrew_a

# Step 3: Preprocessing Functions

In [12]:
# Stop words list: a deliberately minimal set of very common English words.
# Every member must be a string; add further words here to filter more.
stop_words = {'the', 'is', 'in', 'and', 'of', 'a', 'to'}


def pre_process(text: str) -> List[str]:
    """Turn raw text into a list of normalized index terms.

    The text is lowercased, split into runs of word characters, filtered
    against ``stop_words``, and then passed through a crude suffix stemmer.

    Args:
        text: Raw document or query text.

    Returns:
        The surviving tokens in their original order, duplicates included.
    """
    # Lowercase and tokenize in one step
    tokens = re.findall(r'\b\w+\b', text.lower())

    stemmed_tokens = []
    for word in tokens:
        # Stop words are dropped before stemming, so only the unstemmed
        # form is compared against the list.
        if word in stop_words:
            continue
        # Strip at most one suffix; the length guards leave short words
        # such as "king" or "red" untouched.
        if word.endswith('ing') and len(word) > 4:
            word = word[:-3]
        elif word.endswith('ed') and len(word) > 3:
            word = word[:-2]
        stemmed_tokens.append(word)
    return stemmed_tokens

# Step 4: Computing Term Frequencies (TF)

In [13]:
def compute_tf(doc_tokens: List[str]) -> Dict[str, int]:
    """Count how many times each term occurs in one tokenized document.

    Args:
        doc_tokens: Tokens of a single document, duplicates included.

    Returns:
        A dict mapping each distinct term to its raw occurrence count,
        with keys in order of first appearance.
    """
    counts: Dict[str, int] = {}
    for token in doc_tokens:
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1
    return counts

# Tokenize every document, then count term occurrences per document.
# Both lists stay index-aligned with `documents`.
processed_docs = list(map(pre_process, documents))
docs_tf = list(map(compute_tf, processed_docs))

# Step 5: Computing Document Frequencies (DF)

In [14]:
def compute_df(docs_tf: List[Dict[str, int]]) -> Dict[str, int]:
    """Count, for every term, the number of documents that contain it.

    Args:
        docs_tf: One term-frequency dict per document.

    Returns:
        A dict mapping each term to the number of dicts in ``docs_tf``
        that have it as a key.
    """
    doc_counts: Dict[str, int] = {}
    for term_counts in docs_tf:
        # Iterating a dict yields each distinct term once, so a document
        # adds at most 1 to any term's count regardless of its frequency.
        for term in term_counts:
            doc_counts[term] = doc_counts.setdefault(term, 0) + 1
    return doc_counts

# Corpus-wide document frequencies; read later when building TF-IDF vectors
# for both the documents and the queries.
docs_df = compute_df(docs_tf)

# Step 6: Computing TF-IDF Vectors

In [15]:
def compute_tfidf_vector(tf: Dict[str, int], df: Dict[str, int], N: int) -> Dict[str, float]:
    """Build an L2-normalized TF-IDF vector for one document or query.

    Each term's weight is ``freq * log(N / df[term])``. Terms that are
    missing from ``df`` (or have a document frequency of 0) are left out
    of the vector entirely.

    Args:
        tf: Raw term counts for the text being vectorized.
        df: Document frequency of each term in the corpus.
        N: Number of documents in the corpus.

    Returns:
        A dict mapping term to normalized weight. If every weight is zero
        (for example, all terms occur in all ``N`` documents), the all-zero
        vector is returned as-is instead of dividing by a zero norm.
    """
    tfidf: Dict[str, float] = {}
    for term, freq in tf.items():
        doc_freq = df.get(term, 0)
        if doc_freq:
            tfidf[term] = freq * math.log(N / doc_freq)  # TF * IDF

    # Normalizing with L2 norm
    norm = math.sqrt(sum(val ** 2 for val in tfidf.values()))
    if norm == 0:
        # Nothing to scale: the vector is empty or all zeros, and dividing
        # would raise ZeroDivisionError.
        return tfidf
    return {term: val / norm for term, val in tfidf.items()}

# Corpus size: one term-frequency dict exists per document.
N = len(docs_tf)
# One normalized TF-IDF vector per document, index-aligned with `docs_tf`.
tfidf_vectors = [compute_tfidf_vector(tf, docs_df, N) for tf in docs_tf]

# Step 7: Query Processing & Ranking

In [16]:
def rank_documents(query: str, docs: List[str], k: int) -> List[Tuple[int, float]]:
    """Score every indexed document against a query and return the best k.

    The query goes through the same preprocessing and TF-IDF weighting as
    the documents. Scores are dot products between the query vector and
    each entry of the module-level ``tfidf_vectors``; ``docs_df`` supplies
    the document frequencies.

    Args:
        query: Free-text query.
        docs: The document collection; only its length is used here, as
            the corpus size for the query's IDF.
        k: Maximum number of results to return.

    Returns:
        Up to ``k`` ``(document_index, score)`` pairs, highest score first.
        Documents with equal scores keep their index order.
    """
    query_vector = compute_tfidf_vector(
        compute_tf(pre_process(query)), docs_df, len(docs)
    )

    def score_against(doc_vector):
        # Only terms present in the query can contribute to the dot product.
        return sum(
            weight * doc_vector.get(term, 0.0)
            for term, weight in query_vector.items()
        )

    scored = [
        (doc_index, score_against(doc_vector))
        for doc_index, doc_vector in enumerate(tfidf_vectors)
    ]
    # list.sort is stable, so ties stay in ascending document-index order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:k]

# Example Query:

In [18]:
# Top three (document_index, score) pairs for a sample two-word query.
ranked_docs = rank_documents("high income", documents, k=3)


# Step 8: Experimentation & Results

In [20]:
# Test queries: two short keyword queries, each ranked against the corpus.
queries = ["Liquid helium", "Gradient System"]
for query in queries:
    results = rank_documents(query, documents, k=3)
    print(f"Query: '{query}'")
    # Ranks are 1-based; document indices are 0-based internally, so they
    # are shifted by one for display.
    for rank, (idx, score) in enumerate(results, 1):
        print(f"{rank}. Doc {idx+1} (Score: {score:.4f})")

Query: 'Liquid helium'
1. Doc 1 (Score: 0.0000)
2. Doc 2 (Score: 0.0000)
3. Doc 3 (Score: 0.0000)
Query: 'Gradient System'
1. Doc 11 (Score: 0.2076)
2. Doc 17 (Score: 0.0822)
3. Doc 39 (Score: 0.0790)


# Step 9: Experiment Setup

In [21]:
# Sentence-length queries for the experiment; this rebinds the `queries`
# list used by the earlier keyword test.
queries = [
    "More growth in the market is inevitable as new devices become available",
    "Microsoft has unveiled the finished version of its home-grown search engine",
    "The ESPN games are a touch more arcade-like in look and feel and are slightly easier to get into",
    "Moves to unite mobile and fixed phones look set to get more emphasis in 2005 too"
]

# Running Retrieval Pipeline

In [23]:
# Map each query string to its top-3 ranking, in the order the queries
# are listed.
results = {}
for query in queries:
    ranked_docs = rank_documents(query, documents, k=3)  # corpus and k are passed explicitly
    results[query] = ranked_docs

# Step 10: Report Results

In [24]:
# Print every query followed by its ranked documents, one line per hit.
for query, ranked_docs in results.items():
    print(f"\nQuery: '{query}'")
    print("Top 3 Documents:")
    for rank, (doc_idx, score) in enumerate(ranked_docs, 1):
        # Single-line preview: first 100 characters with newlines flattened.
        doc_preview = documents[doc_idx][:100].replace('\n', ' ')  # Showing first 100 chars
        # doc_idx is 0-based; it is shown 1-based to match the earlier output.
        print(f"{rank}. Doc {doc_idx + 1} (Score: {score:.4f}): {doc_preview}...")


Query: 'More growth in the market is inevitable as new devices become available'
Top 3 Documents:
1. Doc 25 (Score: 0.1669): Millions buy MP3 players in US  One in 10 adult Americans - equivalent to 22 million people - owns a...
2. Doc 5 (Score: 0.0756): Blog reading explodes in America  Americans are becoming avid blog readers, with 32 million getting ...
3. Doc 36 (Score: 0.0751): The future in your pocket  If you are a geek or gadget fan, the next 12 months look like they are go...

Query: 'Microsoft has unveiled the finished version of its home-grown search engine'
Top 3 Documents:
1. Doc 18 (Score: 0.5105): Microsoft launches its own search  Microsoft has unveiled the finished version of its home-grown sea...
2. Doc 20 (Score: 0.2543): Search wars hit desktop PCs  Another front in the on-going battle between Microsoft and Google is ab...
3. Doc 31 (Score: 0.1923): Savvy searchers fail to spot ads  Internet search engine users are an odd mix of naive and sophistic...

Query: 'The 