In [1]:
#1 Core libraries
import sys
import os
import numpy as np
import pandas as pd
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Sentence Transformers for embeddings
from sentence_transformers import SentenceTransformer

# Hugging Face Transformers for LLM
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline





In [2]:
#2 Load the text file (a header line followed by one record per line)
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open("player_injuries.txt", "r", encoding="utf-8") as f:
    kb_text = f.read()

# Drop blank lines so they never become empty chunks in the vector store.
lines = [line for line in kb_text.strip().split("\n") if line.strip()]
kb_chunks = lines[1:]  # skip header
print(kb_chunks[:3])    # preview


['Richard Sherman,Cornerback,2015-01-18,Elbow sprain,Unknown,Full,Probable,No,"Sustained elbow sprain vs. Packers; cleared and expected to play full speed."', 'Earl Thomas,Free Safety,2015-01-18,Shoulder dislocation,Unknown,Full,Probable,No,"Dislocated shoulder in NFC Championship; practicing with brace."', 'Kam Chancellor,Strong Safety,2015-01-30,Knee ligament strain,Unknown,Limited,Probable,No,"Suffered knee injury during Friday practice before Super Bowl; still active."']


In [3]:
#3 Load the sentence-embedding model used for both chunks and queries
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)


In [4]:
#4 Embed every knowledge-base chunk (normalized vectors) and report the shape
kb_embeddings = model.encode(
    kb_chunks,
    normalize_embeddings=True,
)
print("Embedding shape:", kb_embeddings.shape)


Embedding shape: (7, 384)


In [5]:
#5 Build Vector Store
# Keep the chunk texts next to their embeddings so an embedding row index
# can be mapped back to the text it came from.
vector_store = dict(texts=kb_chunks, embeddings=kb_embeddings)
print("Vector store created.")


Vector store created.


In [6]:
#6 Retrieval Function

def retrieve_chunks(query, model, vector_store, top_k=2, min_score=None):
    """Return the stored chunks most similar to ``query``.

    Args:
        query: Question text to embed and search for.
        model: Embedding model exposing
            ``encode(list_of_str, normalize_embeddings=True)``.
        vector_store: Mapping with ``"texts"`` (sequence of chunk strings) and
            ``"embeddings"`` (one row per chunk; assumed to be in the same
            order as ``"texts"``).
        top_k: Maximum number of chunks to return. Non-positive values return
            an empty list instead of an arbitrary negative slice.
        min_score: Optional cosine-similarity floor. When given, chunks that
            score below it are dropped, so an off-topic query can come back
            with no context at all. ``None`` (default) keeps every top-k hit.

    Returns:
        List of ``(index, text, score)`` tuples, highest score first.
    """
    texts = vector_store["texts"]

    # Nothing to rank: avoid calling the model / cosine_similarity on an
    # empty store, and avoid the surprising result of a negative slice.
    if top_k <= 0 or len(texts) == 0:
        return []

    q_emb = model.encode([query], normalize_embeddings=True)
    scores = cosine_similarity(q_emb, vector_store["embeddings"])[0]
    top_idx = np.argsort(scores)[::-1][:top_k]  # descending by similarity

    if min_score is not None:
        top_idx = [idx for idx in top_idx if scores[idx] >= min_score]

    return [(idx, texts[idx], scores[idx]) for idx in top_idx]


In [7]:
#7 Load the seq2seq LLM (FLAN-T5 small) together with its tokenizer
LLM_CHECKPOINT = "google/flan-t5-small"
llm = AutoModelForSeq2SeqLM.from_pretrained(LLM_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(LLM_CHECKPOINT)


In [8]:
#8 Generation Function
def generate_answer(query, retrieved_chunks, max_length=150):
    """Answer ``query`` with the LLM, using the retrieved chunks as context.

    Uses the notebook-level ``tokenizer`` and ``llm`` objects.

    Args:
        query: The user's question.
        retrieved_chunks: Iterable of ``(index, text, score)`` tuples; only
            the text element is placed in the prompt.
        max_length: ``max_length`` forwarded to ``llm.generate``.

    Returns:
        The decoded model output, or a fixed "I don't know" message when no
        chunks were retrieved.
    """
    # With no context the model can only guess, so answer explicitly instead
    # of prompting it with an empty "Context:" section.
    if not retrieved_chunks:
        return "I don't know: no relevant context was retrieved."

    context = "\n".join(chunk for _, chunk, _ in retrieved_chunks)
    prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"

    # Forward the whole encoding (input_ids and attention_mask) rather than
    # input_ids alone, so generate() does not have to infer the mask.
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = llm.generate(**inputs, max_length=max_length)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [9]:
#9 Test Cases
# Run each question through retrieval and generation, printing the answer.
test_cases = [
    "Which player has a shoulder injury?",
    "Who won the first Super Bowl?",
    "Which players had knee issues before the Super Bowl?",
]

for case_number, query in enumerate(test_cases, start=1):
    retrieved = retrieve_chunks(query, model, vector_store)
    answer = generate_answer(query, retrieved)

    print(
        f"Test Case {case_number}: {query}",
        f"Answer: {answer}",
        "-" * 60,
        sep="\n",
    )


Test Case 1: Which player has a shoulder injury?
Answer: Earl Thomas
------------------------------------------------------------
Test Case 2: Who won the first Super Bowl?
Answer: Earl Thomas
------------------------------------------------------------
Test Case 3: Which players had knee issues before the Super Bowl?
Answer: Kam Chancellor
------------------------------------------------------------


In [10]:
# 10
def run_test_case(title, query):
    """Run one query through retrieval and generation and print a report.

    Prints the title, the query, every retrieved chunk, the generated answer
    and a separator line.

    Args:
        title: Header line printed before the report.
        query: Question passed to ``retrieve_chunks`` and ``generate_answer``.

    Returns:
        Tuple ``(retrieved, answer)`` so callers can keep the results.
    """
    retrieved = retrieve_chunks(query, model, vector_store)
    answer = generate_answer(query, retrieved)

    print(title)
    print("Query:", query)
    print("Retrieved Context:")
    for _, chunk, _ in retrieved:
        print("-", chunk)
    print("Answer:", answer)
    print("\n" + "="*60 + "\n")

    return retrieved, answer


# Test 1: Factual question (answer is in your KB)
query1 = "Which player has a shoulder injury?"
retrieved1, answer1 = run_test_case("🟦 Test Case 1 — Factual", query1)

# Test 2: Foil question (NOT in KB)
query2 = "Who won the Super Bowl in 2020?"
retrieved2, answer2 = run_test_case("🟩 Test Case 2 — Foil (not in KB)", query2)

# Test 3: Synthesis question (requires combining KB lines)
query3 = "Which players had knee issues before the Super Bowl?"
retrieved3, answer3 = run_test_case("🟧 Test Case 3 — Synthesis", query3)


🟦 Test Case 1 — Factual
Query: Which player has a shoulder injury?
Retrieved Context:
- Earl Thomas,Free Safety,2015-01-18,Shoulder dislocation,Unknown,Full,Probable,No,"Dislocated shoulder in NFC Championship; practicing with brace."
- J.R. Sweezy,Guard,2015-01-30,Ankle injury,Unknown,Full,Probable,No,"Listed on final injury report with ankle soreness; full participant."
Answer: Earl Thomas


🟩 Test Case 2 — Foil (not in KB)
Query: Who won the Super Bowl in 2020?
Retrieved Context:
- Earl Thomas,Free Safety,2015-01-18,Shoulder dislocation,Unknown,Full,Probable,No,"Dislocated shoulder in NFC Championship; practicing with brace."
- Kam Chancellor,Strong Safety,2015-01-30,Knee ligament strain,Unknown,Limited,Probable,No,"Suffered knee injury during Friday practice before Super Bowl; still active."
Answer: Earl Thomas


🟧 Test Case 3 — Synthesis
Query: Which players had knee issues before the Super Bowl?
Retrieved Context:
- Kam Chancellor,Strong Safety,2015-01-30,Knee liga