In [17]:
!pip install scikit-learn -q

import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

print("All libraries imported successfully!")

All libraries imported successfully!


In [18]:
from google.colab import files
import io

print("="*80)
print("UPLOAD YOUR DOCUMENTS")
print("="*80)
print("\nClick 'Choose Files' and select ALL your document files")
print("You can select multiple files at once (Ctrl+Click)")
print("="*80)

# Upload files
uploaded = files.upload()

print(f"\n Uploaded {len(uploaded)} file(s)")
for filename in uploaded.keys():
    print(f"  - {filename}")

UPLOAD YOUR DOCUMENTS

Click 'Choose Files' and select ALL your document files
You can select multiple files at once (Ctrl+Click)


Saving D6.txt to D6 (2).txt
Saving D5.txt to D5 (2).txt
Saving D4.txt to D4 (2).txt
Saving D3.txt to D3 (2).txt
Saving D2.txt to D2 (2).txt
Saving D1.txt to D1 (2).txt

 Uploaded 6 file(s)
  - D6 (2).txt
  - D5 (2).txt
  - D4 (2).txt
  - D3 (2).txt
  - D2 (2).txt
  - D1 (2).txt


In [19]:
documents = {}

print("\n" + "="*80)
print("LOADING DOCUMENTS")
print("="*80)

for filename, content in uploaded.items():
    try:
        text_content = content.decode('utf-8')
        documents[filename] = text_content
        print(f" Loaded: {filename} ({len(text_content)} characters)")
    except UnicodeDecodeError:
        try:
            text_content = content.decode('latin-1')
            documents[filename] = text_content
            print(f" Loaded: {filename} ({len(text_content)} characters) [latin-1]")
        except Exception as e:
            print(f"âœ— Error: {e}")

print(f"\n{'='*80}")
print(f"Total documents loaded: {len(documents)}")
print(f"{'='*80}")


LOADING DOCUMENTS
 Loaded: D6 (2).txt (11693 characters)
 Loaded: D5 (2).txt (10414 characters)
 Loaded: D4 (2).txt (7614 characters)
 Loaded: D3 (2).txt (11614 characters)
 Loaded: D2 (2).txt (11614 characters)
 Loaded: D1 (2).txt (7658 characters)

Total documents loaded: 6


In [20]:
queries = [
    "IMF global growth forecast 2025"
    "Chinese hackers Claude AI cyberattacks"
    "renewable energy stop fossil fuel growth"
    "NASA ESCAPADE Mars mission"
    "AI Crohn's disease treatment discovery"
    "OpenAI Microsoft AGI restrictions"
    "AI powered cyberattacks 2025"
    "renewable energy investment 2025"
    "AI medical research drug discovery"
    "technology companies AI competition"
]

print("="*80)
print("QUERIES")
print("="*80)
print(f"\nTotal queries: {len(queries)}\n")
for i, query in enumerate(queries, 1):
    print(f"{i}. {query}")

QUERIES

Total queries: 1

1. IMF global growth forecast 2025Chinese hackers Claude AI cyberattacksrenewable energy stop fossil fuel growthNASA ESCAPADE Mars missionAI Crohn's disease treatment discoveryOpenAI Microsoft AGI restrictionsAI powered cyberattacks 2025renewable energy investment 2025AI medical research drug discoverytechnology companies AI competition


In [21]:
def compute_similarity_and_rank(queries, documents):
    if not documents:
        print("Error: No documents!")
        return

    doc_names = list(documents.keys())
    doc_contents = list(documents.values())

    print(f"\n{'='*80}")
    print(f"PROCESSING")
    print(f"{'='*80}")
    print(f"Documents: {len(doc_names)}")
    print(f"Queries: {len(queries)}")

    # Create TF-IDF vectorizer
    vectorizer = TfidfVectorizer(
        lowercase=True,
        stop_words='english',
        max_features=5000
    )

    print("\nComputing TF-IDF vectors...")
    doc_tfidf = vectorizer.fit_transform(doc_contents)
    print(f" TF-IDF matrix shape: {doc_tfidf.shape}")

    all_results = []

    for i, query in enumerate(queries, 1):
        print(f"\n{'='*80}")
        print(f"Query {i}: {query}")
        print(f"{'='*80}\n")

        query_tfidf = vectorizer.transform([query])
        similarities = cosine_similarity(query_tfidf, doc_tfidf)[0]

        doc_scores = list(zip(doc_names, similarities))
        ranked_docs = sorted(doc_scores, key=lambda x: x[1], reverse=True)

        print("Ranked Documents (by relevance):")
        print("-" * 80)

        for rank, (doc_name, score) in enumerate(ranked_docs, 1):
            print(f"{rank}. {doc_name:40s} | Similarity: {score:.4f}")

        all_results.append((query, ranked_docs))
        print()

    return all_results

# Run ranking
results = compute_similarity_and_rank(queries, documents)


PROCESSING
Documents: 6
Queries: 1

Computing TF-IDF vectors...
 TF-IDF matrix shape: (6, 1829)

Query 1: IMF global growth forecast 2025Chinese hackers Claude AI cyberattacksrenewable energy stop fossil fuel growthNASA ESCAPADE Mars missionAI Crohn's disease treatment discoveryOpenAI Microsoft AGI restrictionsAI powered cyberattacks 2025renewable energy investment 2025AI medical research drug discoverytechnology companies AI competition

Ranked Documents (by relevance):
--------------------------------------------------------------------------------
1. D5 (2).txt                               | Similarity: 0.2667
2. D6 (2).txt                               | Similarity: 0.2555
3. D3 (2).txt                               | Similarity: 0.2385
4. D2 (2).txt                               | Similarity: 0.2385
5. D4 (2).txt                               | Similarity: 0.2013
6. D1 (2).txt                               | Similarity: 0.1718



In [22]:
output_file = "ranking_results.txt"

with open(output_file, 'w', encoding='utf-8') as f:
    f.write("="*80 + "\n")
    f.write("Document Ranking Results using TF-IDF and Cosine Similarity\n")
    f.write("="*80 + "\n\n")

    for i, (query, ranked_docs) in enumerate(results, 1):
        f.write(f"\n{'='*80}\n")
        f.write(f"Query {i}: {query}\n")
        f.write(f"{'='*80}\n\n")
        f.write("Ranked Documents (by relevance):\n")
        f.write("-" * 80 + "\n")

        for rank, (doc_name, score) in enumerate(ranked_docs, 1):
            f.write(f"{rank}. {doc_name:40s} | Similarity: {score:.4f}\n")
        f.write("\n")

    f.write("\n" + "="*80 + "\n")
    f.write("End of Results\n")
    f.write("="*80 + "\n")

print(f"{'='*80}")
print(f" Results saved to: {output_file}")
print(f"{'='*80}")

 Results saved to: ranking_results.txt


In [24]:
print("\n" + "="*80)
print("SUMMARY STATISTICS")
print("="*80)
print(f"\nTotal Documents: {len(documents)}")
print(f"Total Queries: {len(queries)}")
print(f"Total Comparisons: {len(documents) * len(queries)}")

total_chars = sum(len(content) for content in documents.values())
avg_chars = total_chars // len(documents) if documents else 0
print(f"\nTotal Characters: {total_chars:,}")
print(f"Average per Document: {avg_chars:,}")
print("="*80)



SUMMARY STATISTICS

Total Documents: 6
Total Queries: 1
Total Comparisons: 6

Total Characters: 60,607
Average per Document: 10,101
