# AI chatbot using RAG
This chatbot answers questions about the 2024/25 Premier League season, using retrieval-augmented generation (RAG) over per-player statistics.

In [12]:
import pandas as pd

In [16]:
# Load the per-player statistics and render each row as one text chunk.
df = pd.read_csv('../data/pl.csv')

# Columns written into every chunk, in the order they appear in the text.
cols = [
    'Player Name', 'Club', 'Nationality', 'Position', 'Appearances',
    'Minutes', 'Goals', 'Assists', 'Shots', 'Shots On Target',
    'Big Chances Missed', 'Offsides',
    'Touches', 'Passes', 'Successful Passes', 'Crosses',
    'Successful Crosses',
    'Possession Won', 'Dispossessed',
    'Clean Sheets', 'Clearances', 'Interceptions', 'Blocks', 'Tackles',
    'Aerial Duels', 'Goals Conceded', 'Own Goals', 'Fouls',
    'Yellow Cards', 'Red Cards', 'Saves', 'Penalties Saved',
]

# One chunk per row: "Column: value" lines joined by newlines.
# (A bare `df.head()` used to sit mid-cell; only a cell's last expression is
# displayed, so it had no effect and has been removed.)
chunks = [
    "\n".join(f"{col}: {row[col]}" for col in cols)
    for _, row in df.iterrows()
]


In [17]:
# Preview only the first two chunks; echoing the whole list floods the output.
chunks[:2]

['Player Name: Ben White\nClub: Arsenal\nNationality: England\nPosition: DEF\nAppearances: 17\nMinutes: 1198\nGoals: 0\nAssists: 2\nShots: 9\nShots On Target: 12\nBig Chances Missed: 0\nOffsides: 1\nTouches: 833\nPasses: 1678\nSuccessful Passes: 1493\nCrosses: 51\nSuccessful Crosses: 10\nPossession Won: 107\nDispossessed: 6\nClean Sheets: 5\nClearances: 38\nInterceptions: 23\nBlocks: 6\nTackles: 20\nAerial Duels: 16\nGoals Conceded: 0\nOwn Goals: 0\nFouls: 10\nYellow Cards: 2\nRed Cards: 0\nSaves: 0\nPenalties Saved: 0',
 'Player Name: Bukayo Saka\nClub: Arsenal\nNationality: England\nPosition: MID\nAppearances: 25\nMinutes: 1735\nGoals: 6\nAssists: 10\nShots: 67\nShots On Target: 2\nBig Chances Missed: 8\nOffsides: 7\nTouches: 1094\nPasses: 643\nSuccessful Passes: 556\nCrosses: 1\nSuccessful Crosses: 0\nPossession Won: 44\nDispossessed: 40\nClean Sheets: 2\nClearances: 6\nInterceptions: 15\nBlocks: 14\nTackles: 29\nAerial Duels: 45\nGoals Conceded: 0\nOwn Goals: 0\nFouls: 15\nYellow C

In [18]:
# Encode every chunk with a sentence-transformers model (one embedding per chunk).
from sentence_transformers import SentenceTransformer

EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

model = SentenceTransformer(EMBEDDING_MODEL_NAME)
embeddings = model.encode(
    chunks,
    show_progress_bar=True,
)

Batches:   0%|          | 0/18 [00:00<?, ?it/s]

In [21]:
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions
# Open an on-disk Chroma database under ./chroma_db.
# In chromadb >= 0.4, `chromadb.Client(Settings(persist_directory=...))` does not
# enable persistence on its own, so the directory was ignored; PersistentClient
# is the documented way to store data at a path.
client = chromadb.PersistentClient(path="./chroma_db")

# get_or_create_collection keeps this cell re-runnable: create_collection raises
# when a collection named "premier_league" already exists.
collection = client.get_or_create_collection(name="premier_league")


In [20]:
# Reset the vector store: drop the collection and immediately recreate it.
# Rebinding `collection` matters: the previous handle refers to the deleted
# collection, so later calls on it would fail when the notebook is run top to bottom.
client.delete_collection(name='premier_league')
collection = client.create_collection(name='premier_league')

In [22]:
# Position-based ids ("player_0", "player_1", ...), one per chunk, plus a
# constant metadata record tagging where each document came from.
ids = [f"player_{i}" for i in range(len(chunks))]
metadatas = [{"source": "premier_league"} for _ in chunks]

# upsert (rather than add) makes this cell idempotent: re-running it overwrites
# the records with the same ids instead of relying on how duplicates are handled.
# Embeddings are passed as plain lists, matching how the query vector is sent.
collection.upsert(
    documents=chunks,
    embeddings=embeddings.tolist(),
    metadatas=metadatas,
    ids=ids,
)


In [29]:
# Embed the question and fetch the 20 closest chunks from the collection.
query = "Can you compare Mohammad Salah's performance with Erling Haland"
query_embedding = model.encode([query]).tolist()

results = collection.query(query_embeddings=query_embedding, n_results=20)

# `documents` holds one list of hits per query; a single query was sent,
# so the hits are at index 0.
retrieved_docs = results['documents'][0]

# Blank-line-separated chunks form the context passed to the LLM.
context = "\n\n".join(retrieved_docs)


In [31]:
import requests
from openai import OpenAI
from dotenv import load_dotenv
import os
# Read environment variables from a local .env file, if one exists.
load_dotenv()

# Prefer the notebook's own OPENAI_KEY; fall back to the SDK's standard
# OPENAI_API_KEY (which the client would read itself when given api_key=None).
api_key = os.getenv("OPENAI_KEY") or os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "No OpenAI API key found: set OPENAI_KEY (or OPENAI_API_KEY) in the "
        "environment or in a .env file."
    )

# Use a dedicated name: `client` already refers to the Chroma client, and
# rebinding it here would break the vector-store cells on any later re-run.
openai_client = OpenAI(api_key=api_key)

# Ask the model to answer strictly from the retrieved context.
response = openai_client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {
            "role": "system",
            "content": "You are a soccer expert. ONLY use the context given. Do NOT guess or use outside info."
        },
        {
            "role": "user",
            "content": f"""
Context:
{context}

Question:
{query}
"""
        }
    ]
)
print(response.choices[0].message.content)


Based on the provided statistics, here's a comparison between Mohamed Salah and Erling Haaland:

**Goals and Assists:**
- Mohamed Salah scored 29 goals and provided 18 assists.
- Erling Haaland scored 22 goals and provided 3 assists.

**Shooting:**
- Salah took 130 shots with 22 on target.
- Haaland took 108 shots with 6 on target.

**Big Chances Missed:**
- Salah missed 24 big chances.
- Haaland missed 21 big chances.

**Minutes and Appearances:**
- Salah played 3377 minutes across 38 appearances.
- Haaland played 2741 minutes across 31 appearances.

**Offsides:**
- Salah was caught offside 18 times.
- Haaland was caught offside 4 times.

**Touches and Passing:**
- Salah had 1864 touches, completed 691 successful passes out of 769 attempts.
- Haaland had 696 touches, completed 904 successful passes out of 1032 attempts.

**Defensive Contributions:**
- Salah contributed with 8 clean sheets, 5 clearances, 4 interceptions, and 21 tackles.
- Haaland had 6 clean sheets, 22 clearances, 24 i

In [28]:
# Show the raw row(s) whose 'Player Name' is exactly 'Kevin De Bruyne'.
df.loc[df['Player Name'].eq('Kevin De Bruyne')]

Unnamed: 0,Player Name,Club,Nationality,Position,Appearances,Minutes,Goals,Assists,Shots,Shots On Target,...,Fouls,Yellow Cards,Red Cards,Saves,Saves %,Penalties Saved,Clearances Off Line,Punches,High Claims,Goals Prevented
343,Kevin De Bruyne,Manchester City,Belgium,MID,28,1704,4,7,57,27,...,17,2,0,0,0%,0,0,0,0,0.0
