<a href="https://colab.research.google.com/github/runnithan03/Dissertation/blob/main/Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Complete pipeline for predicting rugby match outcomes and point margins from historical data using machine learning techniques.

Data Collection


In [None]:
import pandas as pd

# Read the raw match data into a DataFrame.
# The path is relative to the working directory; point it at the real file.
df = pd.read_csv(filepath_or_buffer='rugby_dataset.csv')

# Quick sanity check of the columns: show the first five rows
print(df.head(n=5))

Data Preprocessing and Coding Section

Feature Engineering and Data Splitting

In [None]:
# Consider games from 1996 onwards so the rankings have had time to calibrate.
# `>=` keeps matches played on the boundary date itself, which matches
# "onwards" and the `>=` convention used for the train/test cut-off below.
# NOTE(review): these comparisons assume 'date' holds ISO 'YYYY-MM-DD' strings
# (or datetimes) -- confirm against the CSV.
# Filtering first and copying once leaves `df` untouched and makes `data` an
# independent frame for the column assignments that follow.
data = df[df['date'] >= '1996-01-01'].copy()

# Cast the 'neutral' and 'world_cup' columns to int so they can be used as
# binary indicator variables
data['neutral'] = data['neutral'].astype(int)
data['world_cup'] = data['world_cup'].astype(int)

# Chronological split: everything before 01/01/2017 is used for training,
# everything from that date on is held out for testing
train_data = data[data['date'] < '2017-01-01']
test_data = data[data['date'] >= '2017-01-01']

# Single definition of the model inputs so the train and test matrices cannot
# drift apart
feature_cols = [
    'neutral',
    'world_cup',
    'ranking_points_home',
    'ranking_points_away',
    'home_form',
    'away_form',
]

# Features and regression target ('margin') for training and testing
X_train = train_data[feature_cols]
y_train = train_data['margin']
X_test = test_data[feature_cols]
y_test = test_data['margin']


Data Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler learns its statistics from the training
# set only and the same fitted transform is then applied to both sets, so no
# information from the test set is used when fitting.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


Model Building and Evaluation

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

# Candidate regressors for predicting the points margin.
# The random forest is seeded so the reported metrics are reproducible across
# runs; 42 matches the seed used by the classifiers elsewhere in this notebook.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Support Vector Regressor': SVR()
}

# Train each model on the scaled training set and score it on the held-out
# test set
for model_name, model in models.items():
    # Fit the model to the training data
    model.fit(X_train_scaled, y_train)

    # Predict on the test data
    y_pred = model.predict(X_test_scaled)

    # Calculate evaluation metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Print results
    print(f"Model: {model_name}")
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    print(f"R-squared (R2): {r2:.2f}")
    print("="*50)


Predicting Match Winner using Classification Models

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Classification targets: the match 'result' column replaces the numeric margin
y_train_class, y_test_class = train_data['result'], test_data['result']

# Seeded random forest, fitted on the scaled training features
clf = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train_class)

# Predicted results for the held-out matches
y_pred_class = clf.predict(X_test_scaled)

# Overall accuracy plus the per-class report
accuracy = accuracy_score(y_test_class, y_pred_class)
classification_rep = classification_report(y_test_class, y_pred_class)

# Emit the summary, one item per line
print(
    'Random Forest Results',
    f"Accuracy: {accuracy:.2f}",
    "Classification Report:",
    classification_rep,
    "="*50,
    sep="\n",
)


Logistic Regression Classifier

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression baseline on the same scaled features and 'result' labels
logreg = LogisticRegression(random_state=42)
logreg.fit(X=X_train_scaled, y=y_train_class)

# Predicted results for the held-out matches
y_pred_logreg = logreg.predict(X=X_test_scaled)

# Score the predictions against the true test labels
accuracy_logreg = accuracy_score(y_true=y_test_class, y_pred=y_pred_logreg)
classification_rep_logreg = classification_report(y_true=y_test_class, y_pred=y_pred_logreg)

# Report for Logistic Regression, assembled once and printed in a single call
print("\n".join([
    "Logistic Regression Results:",
    f"Accuracy: {accuracy_logreg:.2f}",
    "Classification Report:",
    classification_rep_logreg,
    "="*50,
]))


Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree (seeded) trained on the scaled features
decision_tree = DecisionTreeClassifier(random_state=42).fit(
    X_train_scaled, y_train_class
)

# Predicted results for the held-out matches
y_pred_decision_tree = decision_tree.predict(X_test_scaled)

# Accuracy and per-class report for the tree's predictions
accuracy_decision_tree = accuracy_score(y_test_class, y_pred_decision_tree)
classification_rep_decision_tree = classification_report(
    y_test_class, y_pred_decision_tree
)

# Summary for the Decision Tree: header lines first, then the report and rule
print("Decision Tree Results:", f"Accuracy: {accuracy_decision_tree:.2f}", sep="\n")
print("Classification Report:", classification_rep_decision_tree, sep="\n")
print("="*50)


Predicting Rugby World Cup 2023 Matches

In [None]:
# Train on all data pre-RWC 2023 and test on the RWC games.
# NOTE(review): `wc_games` is not created in any cell visible to this review;
# it must exist (with the same feature columns and a 'result' column) before
# this cell runs -- confirm where it is built so Restart & Run All works.
# NOTE(review): if the `wc_games` rows are also present in `data`, the model is
# trained on the games it is evaluated on -- confirm and exclude them if so.
wc_feature_cols = [
    'neutral',
    'world_cup',
    'ranking_points_home',
    'ranking_points_away',
    'home_form',
    'away_form',
]
X_train_wc = data[wc_feature_cols]
y_train_wc = data['result']
X_test_wc = wc_games[wc_feature_cols]
y_test_wc = wc_games['result']

# Scale with a dedicated scaler. Refitting the shared `scaler` here would
# overwrite the statistics it learned from `X_train`, so any later
# `scaler.transform(...)` call would no longer match X_train_scaled /
# X_test_scaled.
scaler_wc = StandardScaler()
X_train_wc_scaled = scaler_wc.fit_transform(X_train_wc)
X_test_wc_scaled = scaler_wc.transform(X_test_wc)

# Initialize and fit a Logistic Regression classifier
logreg_wc = LogisticRegression(random_state=42)
logreg_wc.fit(X_train_wc_scaled, y_train_wc)

# Predict on the World Cup test data
y_pred_wc = logreg_wc.predict(X_test_wc_scaled)

# Evaluate the Logistic Regression model on World Cup games
accuracy_wc = accuracy_score(y_test_wc, y_pred_wc)
classification_rep_wc = classification_report(y_test_wc, y_pred_wc)

# Print results for World Cup prediction
print("Logistic Regression Results on World Cup Games:")
print(f"Accuracy: {accuracy_wc:.2f}")
print("Classification Report:")
print(classification_rep_wc)
print("="*50)
