<a href="https://colab.research.google.com/github/sonia73b/tech400asst/blob/main/W3TECH400.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -q scikit-learn pandas

import os
import glob
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
DOCS_FOLDER = "/content/drive/MyDrive/TECH400TXTFILES"

# Base folder for queries + results
BASE_FOLDER = "/content/TECH400W3ASST"

os.makedirs(DOCS_FOLDER, exist_ok=True)
os.makedirs(BASE_FOLDER, exist_ok=True)

# Output files
QUERIES_FILE       = os.path.join(BASE_FOLDER, "queries.txt")
QUERY_RESULTS_FILE = os.path.join(BASE_FOLDER, "query_results.txt")
QUERY_RESULTS_CSV  = os.path.join(BASE_FOLDER, "query_results_tfidf.csv")
DOC_SIM_TXT        = os.path.join(BASE_FOLDER, "doc_doc_similarity.txt")
DOC_SIM_CSV        = os.path.join(BASE_FOLDER, "doc_doc_similarity.csv")

In [4]:
queries = [
    "AI OR artificial",
    "oil AND surplus",
    "OPEC OR quota",
    "disease AND outbreak",
    "space AND mission",
    "education AND policy",
    "markets OR investors",
    "football OR tennis",
    "AI AND NOT oil",
    "disease AND NOT Marburg"
]

with open(QUERIES_FILE, "w", encoding="utf-8") as f:
    for q in queries:
        f.write(q + "\n")

print("Saved queries to:", QUERIES_FILE)

Saved queries to: /content/TECH400W3ASST/queries.txt


In [5]:
def load_documents(docs_folder):
    """
    Reads all .txt files from docs_folder.
    Returns:
        filenames: ['file1.txt', 'file2.txt', ...]
        texts:     ['full text of file1', 'full text of file2', ...]
    """
    file_paths = sorted(glob.glob(os.path.join(docs_folder, "*.txt")))
    filenames = [os.path.basename(p) for p in file_paths]
    texts = []
    for p in file_paths:
        with open(p, "r", encoding="utf-8", errors="ignore") as f:
            texts.append(f.read())
    return filenames, texts

def load_queries(queries_file):
    """
    Reads queries from queries.txt (one per line).
    """
    with open(queries_file, "r", encoding="utf-8") as f:
        lines = [line.strip() for line in f.readlines()]
    return [q for q in lines if q]

filenames, doc_texts = load_documents(DOCS_FOLDER)
query_texts = load_queries(QUERIES_FILE)

print("Documents loaded:", filenames)
print("Queries loaded:", query_texts)


# Safety check
if len(doc_texts) == 0:
    raise RuntimeError("No .txt files found in DOCS_FOLDER. Check path and files.")
if len(query_texts) == 0:
    raise RuntimeError("No queries loaded from queries.txt. Check QUERIES_FILE.")

Documents loaded: ['Analysts_ai.txt', 'Capital_expenditure_for_AI.txt', 'Global_oil_markets_supply.txt', 'Health_authorities_in_Ethiopia.txt', 'NASA’s_ESCAPADE_mission.txt', 'Oil_prices_climbed_this_week.txt', 'UNESCO_Education_policy.txt', 'WHO_Disease_outbreak.txt']
Queries loaded: ['AI OR artificial', 'oil AND surplus', 'OPEC OR quota', 'disease AND outbreak', 'space AND mission', 'education AND policy', 'markets OR investors', 'football OR tennis', 'AI AND NOT oil', 'disease AND NOT Marburg']


In [6]:
corpus_for_fit = doc_texts + query_texts

vectorizer = TfidfVectorizer(
    lowercase=True,
    token_pattern=r"[A-Za-z0-9\-]+"
)

X = vectorizer.fit_transform(corpus_for_fit)

num_docs = len(doc_texts)
num_queries = len(query_texts)

# First part: documents, second part: queries
doc_matrix   = X[:num_docs, :]
query_matrix = X[num_docs:, :]

print("TF-IDF matrix shape (docs):", doc_matrix.shape)
print("TF-IDF matrix shape (queries):", query_matrix.shape)

TF-IDF matrix shape (docs): (8, 754)
TF-IDF matrix shape (queries): (10, 754)


In [7]:
QxD = cosine_similarity(query_matrix, doc_matrix)  # [Q x D]

rows = []
for qi, q in enumerate(query_texts):
    sims = QxD[qi]  # similarities of this query to all docs
    # rank documents by similarity (highest first)
    ranking = sorted(zip(filenames, sims), key=lambda t: t[1], reverse=True)
    for rank, (fname, score) in enumerate(ranking, start=1):
        rows.append({
            "query": q,
            "doc_rank": rank,
            "document": fname,
            "cosine_similarity": round(float(score), 4)
        })

results_df = pd.DataFrame(rows)
results_df.to_csv(QUERY_RESULTS_CSV, index=False, encoding="utf-8")
print("Saved ranked query–document results to:", QUERY_RESULTS_CSV)


Saved ranked query–document results to: /content/TECH400W3ASST/query_results_tfidf.csv


In [8]:
with open(QUERY_RESULTS_FILE, "w", encoding="utf-8") as f:
    for q in results_df["query"].unique():
        f.write("====================================\n")
        f.write(f"QUERY: {q}\n")
        f.write("====================================\n")
        sub = results_df[results_df["query"] == q].sort_values("doc_rank")
        for _, row in sub.iterrows():
            f.write(
                f"Rank {int(row['doc_rank'])}: "
                f"{row['document']} "
                f"(cosine={row['cosine_similarity']})\n"
            )
        f.write("\n")

print("Saved human-readable query results to:", QUERY_RESULTS_FILE)

Saved human-readable query results to: /content/TECH400W3ASST/query_results.txt


In [9]:
DxD = cosine_similarity(doc_matrix, doc_matrix)  # [D x D]

doc_sim_df = pd.DataFrame(DxD, index=filenames, columns=filenames)

# Save as CSV
doc_sim_df.to_csv(DOC_SIM_CSV, encoding="utf-8")
print("Saved document–document similarity CSV to:", DOC_SIM_CSV)

# Save as TXT
with open(DOC_SIM_TXT, "w", encoding="utf-8") as f:
    f.write("Document–Document Cosine Similarity (TF-IDF Vector Space Model)\n\n")
    f.write(doc_sim_df.to_string())

print("Saved document–document similarity TXT to:", DOC_SIM_TXT)


Saved document–document similarity CSV to: /content/TECH400W3ASST/doc_doc_similarity.csv
Saved document–document similarity TXT to: /content/TECH400W3ASST/doc_doc_similarity.txt


In [11]:
top3_rows = []

for q in results_df["query"].unique():
    sub = results_df[results_df["query"] == q].sort_values("doc_rank").head(3)

    for _, row in sub.iterrows():
        top3_rows.append({
            "Query": q,
            "Rank": int(row["doc_rank"]),
            "Document": row["document"],
            "Cosine Similarity": float(row["cosine_similarity"])
        })

top3_df = pd.DataFrame(top3_rows)

print("\nTop-3 documents per query (preview):")
print(top3_df.to_string(index=False))



Top-3 documents per query (preview):
                  Query  Rank                           Document  Cosine Similarity
       AI OR artificial     1                    Analysts_ai.txt             0.1218
       AI OR artificial     2     Capital_expenditure_for_AI.txt             0.0824
       AI OR artificial     3        UNESCO_Education_policy.txt             0.0265
        oil AND surplus     1      Global_oil_markets_supply.txt             0.1690
        oil AND surplus     2   Oil_prices_climbed_this_week.txt             0.1025
        oil AND surplus     3 Health_authorities_in_Ethiopia.txt             0.0641
          OPEC OR quota     1   Oil_prices_climbed_this_week.txt             0.0398
          OPEC OR quota     2        UNESCO_Education_policy.txt             0.0233
          OPEC OR quota     3     Capital_expenditure_for_AI.txt             0.0194
   disease AND outbreak     1           WHO_Disease_outbreak.txt             0.1733
   disease AND outbreak     2 Health_a