In [6]:
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer
from typing import Union

In [7]:
import pandas as pd
df = pd.read_csv('commentary_2025_ENG.1.csv')
data = df.to_dict('records')

In [8]:
import pandas as pd

for col in df.columns:
    if df[col].dtype == 'object':  #if string or mixed data type
        # converting to date time
        converted_col=pd.to_datetime(df[col], errors='coerce', format='%Y-%m-%d %H:%M:%S')
        
        # check if most of the rows are valid date time or not
        if converted_col.notna().sum() > len(df) * 0.5:
            df[col] = converted_col.fillna(pd.Timestamp('1900-01-01')) #if valid assigned a default value
        else:
            df[col] = df[col].fillna('no values')

    elif pd.api.types.is_numeric_dtype(df[col]):  # if Numeric columns seen then value as 0
        df[col] = df[col].fillna(0)

    elif pd.api.types.is_datetime64_any_dtype(df[col]):  # if its a data type then a default value
        df[col] = df[col].fillna(pd.Timestamp('1900-01-01'))


In [9]:
encoder = SentenceTransformer('all-MiniLM-L6-v2')

In [10]:
qdrant = QdrantClient(":memory:")

In [11]:
collection_name = "commentary"

#if collection already exists delete it
if qdrant.collection_exists(collection_name=collection_name):
    qdrant.delete_collection(collection_name=collection_name)

In [12]:
#after deletion if exists create collection
qdrant.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=encoder.get_sentence_embedding_dimension(),
        distance=models.Distance.COSINE
    )
)

True

In [14]:
# vectorization and points
qdrant.upload_points(
    collection_name=collection_name,
    points=[
        models.PointStruct(
            id=idx,
            vector=encoder.encode(doc["commentaryText"]).tolist(),
            payload=doc,
        ) for idx, doc in enumerate(data) # data is the variable holding all the wines
    ]
)

In [15]:
user_prompt = "penalty missed" #providing some user prompt

In [16]:
import pprint
# query encoded to vectors
query_vector = encoder.encode(user_prompt).tolist()


# Perform the search / query
response =  qdrant.query_points(
    collection_name=collection_name,
    query=query_vector,
    limit=10,
    with_payload=True
)

for hit in response.points:
    print(hit.payload, "score:", hit.score)

# holding the search results
search_results = [hit.payload for hit in response.points]

{'seasonType': 13481, 'eventId': 740612, 'commentaryOrder': 39, 'playId': 45749168, 'clockDisplayValue': "38'", 'commentaryText': 'Penalty missed. Bruno Fernandes (Manchester United) right footed shot is too high.', 'updateDateTime': '2025-08-25 04:57:58'} score: 0.729119944542675
{'seasonType': 13481, 'eventId': 740649, 'commentaryOrder': 68, 'playId': 45987368, 'clockDisplayValue': "76'", 'commentaryText': 'Penalty saved. Bruno Fernandes (Manchester United) right footed shot saved in the bottom right corner by Caoimhín Kelleher (Brentford).', 'updateDateTime': '2025-09-28 05:17:47'} score: 0.645788672417261
{'seasonType': 13481, 'eventId': 740623, 'commentaryOrder': 51, 'playId': 45792530, 'clockDisplayValue': "59'", 'commentaryText': 'Penalty saved. Kevin Schade (Brentford) right footed shot saved in the bottom left corner by Robin Roefs (Sunderland).', 'updateDateTime': '2025-08-31 05:08:09'} score: 0.6335756659559626
{'seasonType': 13481, 'eventId': 740598, 'commentaryOrder': 2, '

In [22]:
import os
os.environ["GROQ_API_KEY"] = "my-key-here"

In [23]:
import pprint
from groq import Groq


user_prompt = "Penalty Missed" #prompt from user


query_vector = encoder.encode(user_prompt).tolist() #encoding user prompt

#semantic search in quadrant
response = qdrant.query_points(
    collection_name=collection_name,
    query=query_vector,
    limit=10,
    with_payload=True
)

# stored search reuslts
search_results = [hit.payload for hit in response.points]


#using groq; s
client = Groq()

completion = client.chat.completions.create(
    model="llama-3.1-8b-instant",  #8 b model
    messages=[
        {
            "role": "system",
            "content": (
                "You are a helpful assistant"
            )
        },
        {"role": "user", "content": "What is Deep Learning?"},
    ]
)

print("\n--- Groq Model Response ---")
print(completion.choices[0].message.content)



--- Groq Model Response ---
Deep learning is a subset of machine learning that uses neural networks to analyze and interpret data. These neural networks are composed of layers of interconnected nodes or "neurons," which work together to learn and improve their performance on specific tasks.

Deep learning is inspired by the structure and function of the human brain. The neural networks are designed to mimic the way the brain processes and interprets information, with multiple layers of abstraction and representation.

There are several key characteristics of deep learning:

1. **Architecture**: Deep learning models consist of multiple layers, each with a specific function or layer type (e.g., convolutional, recurrent, fully connected).
2. **Autoencoding**: Deep learning models learn to represent complex input data in a compressed or encoded form, which can then be used for a variety of tasks (e.g., classification, regression).
3. **Non-linearity**: Deep learning models use non-linear 

In [28]:
import pprint
import json
from groq import Groq

user_prompt = "Penalty Missed" #prompt from user


query_vector = encoder.encode(user_prompt).tolist() #encoding user prompt

#semantic search in quadrant
response = qdrant.query_points(
    collection_name=collection_name,
    query=query_vector,
    limit=10,
    with_payload=True
)

# stored search reuslts
search_results = [hit.payload for hit in response.points]
context_text = "\n".join([
    json.dumps(item, ensure_ascii=False, default=str) for item in search_results
])
context_text = context_text.encode("utf-8", errors="ignore").decode()

#using groq; s
client = Groq()

# Chat completion
completion = client.chat.completions.create(
    model="llama-3.1-8b-instant",
    messages=[
        {
            "role": "system",
            "content": (
                "You are a helpful assistant and soccer statistics specialist. "
                "Your top priority is to help the user analyze soccer stats, "
                "summarize insights, and answer queries accurately."
            )
        },
        {
            "role": "user",
            "content": (
                f"The user asked: {user_prompt}\n\n"
                f"Here are related results from the database:\n{context_text}\n\n"
                "Please summarize or analyze the relevant information."
            )
        },
    ]
)

print("\n--- Groq Model Response ---")
print(completion.choices[0].message.content)



--- Groq Model Response ---
Based on the provided data, we can identify the relevant information related to "Penalty Missed." Here's a summary:

1. **Number of Penalties Missed**: In the given data, there are no records of penalties actually being missed, but there are two records of penalties being attempted but not scored (saved) and other shots missing the target.

2. **Penalties Saved**: There are two instances where penalties were saved:
    - **Bruno Fernandes (Manchester United)**: Saved by Caoimhín Kelleher (Brentford) in the bottom right corner.
    - **Danny Welbeck (Brighton and Hove Albion)**: Saved by Jordan Pickford (Everton) in the bottom right corner.
    - **Kevin Schade (Brentford)**: Saved by Robin Roefs (Sunderland) in the bottom left corner.

3. **Other Shots Missed**: Five instances of shots missing the target:
    - **Raúl Jiménez (Fulham)** has two instances, including one from the left side of the box and one from close range.
    - **Samuel Chukwueze (Fulham)