In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the match data
df = pd.read_csv('SP1.csv')

# --- Target Variable Encoding ---
# Multi-class target built from the 'FTR' column with a fixed, explicit mapping:
#   'A' -> 0, 'D' -> 1, 'H' -> 2
# (presumably Away win / Draw / Home win -- confirm against the data dictionary).
#
# Why not .astype('category').cat.codes:
#   * it encodes a missing FTR as -1 instead of NaN, so a later
#     dropna(subset=[..., 'Target']) would silently keep those rows and the
#     classifier would see -1 as an extra class;
#   * the codes depend on which categories are present in this particular file.
# For a file containing exactly A, D and H the explicit mapping below yields the
# same codes that cat.codes produced (sorted order: A=0, D=1, H=2).
#
# Series.map leaves NaN for a missing FTR and for any value outside the mapping,
# so such rows can be removed with dropna on 'Target'. If any NaN is produced the
# column is float (0.0 / 1.0 / 2.0); the label values themselves are unchanged.
df['Target'] = df['FTR'].map({'A': 0, 'D': 1, 'H': 2})

In [17]:
# --- Feature selection for the prototype ---
# Model inputs are the three pre-match odds columns PSH / PSD / PSA.
features = ['PSH', 'PSD', 'PSA']

# Keep only rows where every feature and the target are present (non-NaN).
df_model = df.dropna(subset=[*features, 'Target'])

# Design matrix and label vector, both taken from the filtered frame.
X, y = df_model[features], df_model['Target']

In [None]:
# Prototype-only evaluation split: rows are shuffled at random, 20% held out
# for testing, with a fixed seed so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)