In [1]:
import kagglehub
import pandas as pd
import os
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Folder that receives the generated per-game text files.
OUTPUT_DIR = 'dataset'

# exist_ok=True keeps this cell re-runnable when the folder is already present.
os.makedirs(name=OUTPUT_DIR, exist_ok=True)

In [3]:
# Download the NFL box-score dataset through kagglehub; `path` is the local
# folder that holds the downloaded files.
path = kagglehub.dataset_download("grayengineering425/nfl-box-scores")
print(f"Path to dataset files: {path}")

# Show which files the dataset ships with.
dataset_files = os.listdir(path)
print(dataset_files)


Path to dataset files: /home/ryandalal/.cache/kagglehub/datasets/grayengineering425/nfl-box-scores/versions/2
['box_scores.csv', 'nfl_game_weather.csv']


In [4]:
# Load the per-game box scores from the downloaded dataset folder.
data = pd.read_csv(f"{path}/box_scores.csv")

# The `date` column is text in "<Month name> <day>, <year>" form; parse it once
# and derive the calendar parts from the parsed values.
game_dates = pd.to_datetime(data['date'], format='%B %d, %Y')
data['date'] = game_dates
data['year'] = game_dates.dt.year
data['month'] = game_dates.dt.month
data['day'] = game_dates.dt.day

# Identifier of the form "<date>_<home>_<visitor>"; used later as the
# file name of each game's text document.
data['game_id'] = data['date'].astype(str) + '_' + data['home'] + '_' + data['visitor']

# Row count before filtering.
print(len(data))

# Keep only the games played in calendar year 2016.
played_in_2016 = data['year'] == 2016
data = data.loc[played_in_2016]

data.columns


4328


Index(['date', 'visitor', 'home', 'visitor_score', 'home_score',
       'visitor_first_downs', 'visitor_rushing_first_downs',
       'visitor_passing_first_downs', 'visitor_penalties', 'visitor_net_yards',
       'visitor_net_yards_rushing', 'visitor_rushing_plays',
       'visitor_avg_rush', 'visitor_net_yards_passing',
       'visitor_passing_splits', 'visitor_sack_splits',
       'visitor_gross_passing', 'visitor_yards_per_pass',
       'visitor_punt_splits_avg', 'visitor_punts_blocked',
       'visitor_punt_return_splits', 'visitor_kick_return_splits',
       'visitor_int_return_splits', 'visitor_penalty_splits',
       'visitor_fumble_splits', 'visitor_field_goals',
       'visitor_third_down_splits', 'visitor_fourth_down_splits',
       'visitor_total_plays', 'visitor_avg_gain', 'visitor_time_of_possession',
       'home_first_downs', 'home_rushing_first_downs',
       'home_passing_first_downs', 'home_penalties', 'home_net_yards',
       'home_net_yards_rushing', 'home_rushing_p

Game: 2018-09-09 Patriots vs Chiefs
Date: 2018-09-09
Final Score: Patriots 27, Chiefs 42

Team Stats:
Team | Passing Yards | Rushing Yards | Turnovers
Patriots | 312 | 98 | 1
Chiefs    | 285 | 123 | 0

Top Players:
Player | Team | Passing Yards | Rushing Yards | Receiving Yards | Touchdowns
Tom Brady | Patriots | 312 | 0 | 0 | 2
Patrick Mahomes | Chiefs | 285 | 12 | 45 | 3


In [5]:
def format_game_summary(row):
    """Build the plain-text document describing one game.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'date' (a datetime-like value), 'home', 'visitor',
        'home_score', 'visitor_score', and the home_/visitor_ variants of
        'first_downs', 'net_yards' and 'time_of_possession'.

    Returns
    -------
    str
        Newline-separated summary: a headline followed by one stat per line.
    """
    lines = [
        # "<visitor> at <home>" is the conventional way to name a matchup.
        # The date is formatted explicitly so the text does not carry the
        # "00:00:00" time that a Timestamp's default string form appends.
        f"{row['date']:%Y-%m-%d}, {row['visitor']} at {row['home']}",
        f"{row['home']}: {row['home_score']}",
        f"{row['visitor']}: {row['visitor_score']}",
        f"Home First Downs: {row['home_first_downs']}",
        f"Visitor First Downs: {row['visitor_first_downs']}",
        f"Home Net Yards: {row['home_net_yards']}",
        f"Visitor Net Yards: {row['visitor_net_yards']}",
        f"Home Time of Possession: {row['home_time_of_possession']}",
        f"Visitor Time of Possession: {row['visitor_time_of_possession']}",
    ]
    return '\n'.join(lines)


# Write one text file per game, named after its game_id.
# The row label is discarded (`_`) instead of being bound to `index`: that name
# is used for the FAISS index later in the notebook, and re-running this cell
# would otherwise silently replace it.
for _, row in data.iterrows():
    filename = os.path.join(OUTPUT_DIR, f"{row['game_id']}.txt")
    # Explicit encoding so the files read back identically on every platform.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(format_game_summary(row))

In [None]:
# Sentence-embedding model used for both the documents and, later, the queries.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Only the .txt summaries are documents. Filtering skips anything else that
# may live in the folder (e.g. a checkpoint directory, which open() cannot
# read), and sorting makes the document order - and therefore the ids stored
# in the vector index - the same on every run.
filenames = sorted(name for name in os.listdir(OUTPUT_DIR) if name.endswith('.txt'))

# docs[i] holds the text of filenames[i]; later cells rely on this alignment.
docs = []
for file in filenames:
    with open(os.path.join(OUTPUT_DIR, file), 'r', encoding='utf-8') as f:
        docs.append(f.read())

# Convert all the docs into embeddings.
# convert_to_tensor=True returns a tensor; the indexing cell converts it to
# NumPy before handing it to FAISS.
embeddings = model.encode(docs, convert_to_tensor=True)

In [None]:
# FAISS consumes NumPy arrays, so bring the tensor to the CPU and convert it.
# (This would not be necessary if encode() had been called with
# convert_to_tensor=False.)
embeddings_np = embeddings.detach().cpu().numpy()

# Width of one embedding vector.
dimensions = embeddings.shape[1]

# Flat index: vectors are stored as-is, without clustering.
# L2 means distances are Euclidean.
index = faiss.IndexFlatL2(dimensions)

# Store the embeddings in the index.
index.add(embeddings_np)

# The index stores no metadata, so keep a separate lookup table:
#   key   - position of the embedding in the index
#   value - name of the file that embedding was computed from
index_to_file = dict(enumerate(filenames))

In [None]:
def retrieve_docs(query, k=3):
    """Return the text of the k game summaries closest to `query`.

    This is the helper that `answer_question` calls as
    `retrieve_docs(query, k=top_k)` to obtain its prompt context.

    Parameters
    ----------
    query : str
        Natural-language question to embed and search for.
    k : int, default 3
        Maximum number of documents to return.

    Returns
    -------
    str
        The matching documents from `docs`, nearest first, separated by a
        blank line. Empty string when the index holds no vectors.
    """
    # FAISS expects a 2-D float32 array with one row per query vector.
    query_vec = np.asarray(model.encode([query]), dtype=np.float32)
    _, neighbor_ids = index.search(query_vec, k)
    # FAISS pads the result with -1 when fewer than k vectors are available.
    return "\n\n".join(docs[i] for i in neighbor_ids[0] if i != -1)


query = "What was the final score of New England vs Miami on September 18 2016?"
query_emb = model.encode([query])

# search using a query
k = 3
D, I = index.search(np.array(query_emb), k)
# iterate through all of the resulting indexes and show which files matched
for idx in I[0]:
    if idx == -1:
        # padding entry: the index holds fewer than k documents
        continue
    print('------')
    print(index_to_file[idx])


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import gc

# Load the Phi-3 mini instruct model (Hugging Face checkpoint).
# The LLM is bound to its own names (`llm`, `llm_tokenizer`) rather than
# `model` / `tokenizer`, so loading it does not overwrite `model`, the
# SentenceTransformer that the retrieval cells use to embed queries.
model_name = "microsoft/Phi-3-mini-4k-instruct"
llm_tokenizer = AutoTokenizer.from_pretrained(model_name)
llm = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")  # uses GPU if available


def answer_question(query, top_k=3, max_tokens=512):
    """Answer `query` with the LLM, grounded in retrieved game summaries.

    Parameters
    ----------
    query : str
        The user's question.
    top_k : int, default 3
        Number of documents requested from `retrieve_docs`.
    max_tokens : int, default 512
        Upper bound on the number of newly generated tokens.

    Returns
    -------
    str
        The generated answer only (the prompt is not echoed back).

    Notes
    -----
    Relies on a notebook-level `retrieve_docs(query, k=...)` whose result is
    interpolated into the prompt as the context.
    NOTE(review): `retrieve_docs` is not defined in this cell - confirm it is
    defined before this function is called.
    Raises NameError if called after `release_llm()`.
    """
    # Step 1: retrieve relevant docs
    context = retrieve_docs(query, k=top_k)

    # Step 2: build prompt
    prompt = f"""
You are a fact-based sports assistant.
Answer the question using ONLY the information below. Do NOT hallucinate.

Context:
{context}

Question:
{query}
"""

    # Step 3: tokenize and generate answer
    inputs = llm_tokenizer(prompt, return_tensors="pt").to(llm.device)
    outputs = llm.generate(**inputs, max_new_tokens=max_tokens)

    # A causal LM returns the prompt tokens followed by the new tokens;
    # drop the prompt part so only the answer is decoded.
    prompt_length = inputs["input_ids"].shape[1]
    answer = llm_tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    return answer.strip()


def release_llm():
    """Drop the LLM and its tokenizer and reclaim their memory.

    Call this once no more questions will be asked. The cleanup is a function
    instead of running at the end of the cell because deleting the model right
    after defining `answer_question` would make every call to it fail.
    Safe to call more than once.
    """
    globals().pop("llm", None)
    globals().pop("llm_tokenizer", None)
    gc.collect()
    if torch.cuda.is_available():  # only relevant when using a GPU
        torch.cuda.empty_cache()
