In [3]:
#!pip install transformers sentence-transformers faiss-cpu


In [1]:
import os
import pickle

# Third-party imports are kept in their original relative order.
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline


  from tqdm.autonotebook import tqdm, trange


In [2]:
# 1. Load the CSV Data and Render Each Row as a Sentence

def _season_sentence(row):
    """Return the one-sentence description of a single season-level row."""
    return f"In {row['Season_End_Year']}, {row['Champion']} won the championship with {row['Total_Goals']} goals. The runner-up was {row['Runners']}."


def _match_sentence(row):
    """Return the one-sentence description of a single match-level row."""
    return f"On {row['Date']}, {row['Home']} played against {row['Away']}. The match ended {row['HomeGoals']}-{row['AwayGoals']} with {row['FTR']} as the final result."


# Season-level CSV: each row is turned into a sentence stored in a new 'text' column.
season_data = pd.read_csv('epl_season_1993_2024.csv')
season_data['text'] = season_data.apply(_season_sentence, axis=1)

# Match-level CSV: same treatment, one sentence per row in 'text'.
match_data = pd.read_csv('premier-league-matches.csv')
match_data['text'] = match_data.apply(_match_sentence, axis=1)


In [3]:
# Stack the season sentences followed by the match sentences into a single
# Series; ignore_index=True renumbers the result from 0.
combined_data = pd.concat(
    [season_data['text'], match_data['text']],
    ignore_index=True,
)

# 2. Create Embeddings and FAISS Index

# Sentence-embedding model used to vectorise every sentence.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the sentences in the same order they appear in combined_data.
corpus_sentences = combined_data.tolist()
embeddings = embedding_model.encode(corpus_sentences)

# Flat L2 index sized to the embedding width; vectors are added in corpus order.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)

# 3. Save the FAISS Index and the Text Corpus

# Persist the FAISS index (this is where the vectors themselves are stored).
faiss.write_index(index, 'faiss_index.idx')

# Persist the sentences only. Despite the file name, no embedding values are
# written to this CSV - it holds just the 'text' column.
combined_data.to_csv('combined_data_with_embeddings.csv', index=False)




In [5]:
# Save the embedding model (if needed later)
# No earlier cell creates the target directory, and open(..., 'wb') does not
# create missing parent directories, so make sure it exists before writing.
# The directory name is kept exactly as-is because the load step reads from
# the same path.
os.makedirs('model pickel', exist_ok=True)
with open('model pickel/embedding_model.pkl', 'wb') as f:
    pickle.dump(embedding_model, f)

In [6]:
# 4. Load and Save the LLM

# Load the pre-trained LLM (GPT-2 in this case)
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Save the tokenizer and model to local directories for later use
tokenizer.save_pretrained('gpt2_tokenizer')
model.save_pretrained('gpt2_model')

# 5. (Optional) Test the Saved Components

# Test loading the FAISS index
loaded_index = faiss.read_index('faiss_index.idx')

# Test loading the combined text data
loaded_combined_data = pd.read_csv('combined_data_with_embeddings.csv')

# The index was built from exactly the sentences written to the CSV, so the
# two reloaded artefacts must have the same number of entries. A mismatch
# means one of the files is stale or truncated.
# NOTE(review): presumably a later retrieval step maps a FAISS position back
# to the CSV row at that position - confirm against the consumer.
assert loaded_index.ntotal == len(loaded_combined_data), (
    f"FAISS index holds {loaded_index.ntotal} vectors but the text file has "
    f"{len(loaded_combined_data)} rows"
)

# Test loading the embedding model.
# NOTE: pickle.load can execute arbitrary code from the file. This is only
# acceptable here because the file is the one written by this notebook; do not
# point this at a pickle obtained from elsewhere.
with open('model pickel/embedding_model.pkl', 'rb') as f:
    loaded_embedding_model = pickle.load(f)




In [7]:
# Reload the GPT-2 tokenizer and model from the local directories they were
# saved to, confirming the saved copies are loadable.
loaded_tokenizer = AutoTokenizer.from_pretrained("gpt2_tokenizer")
loaded_model = AutoModelForCausalLM.from_pretrained("gpt2_model")

# Show the first few reloaded sentences as a quick visual sanity check.
preview_rows = loaded_combined_data.head()
print(preview_rows)

                                                text
0  In 1993, Manchester United won the championshi...
1  In 1994, Manchester United won the championshi...
2  In 1995, Blackburn Rovers won the championship...
3  In 1996, Manchester United won the championshi...
4  In 1997, Manchester United won the championshi...
