In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# File paths in Google Drive
events_file_path = '/content/drive/MyDrive/BS_Project/events.csv'
ginf_file_path = '/content/drive/MyDrive/BS_Project/ginf.csv'

# Load datasets
events_data = pd.read_csv(events_file_path)
ginf_data = pd.read_csv(ginf_file_path)




Mounted at /content/drive


In [None]:
# Display basic information about events_data
print(events_data.info())

# Display basic information about match_data
print(ginf_data.info())


In [None]:
# Check for duplicate rows
duplicates = events_data.duplicated()
dupli=ginf_data.duplicated()
# Print the number of duplicate rows
print(f"Number of duplicate rows: {duplicates.sum()}")
print(f"Number of duplicate rows: {dupli.sum()}")

# Remove duplicate rows
#events_data = events_data.drop_duplicates()


In [None]:
missing_matches = ginf_data[~ginf_data['id_odsp'].isin(events_data['id_odsp'])]
print("Match IDs with no corresponding events:")
print(missing_matches[['id_odsp', 'date', 'league', 'ht', 'at']])


In [None]:

# Sort the missing matches by country
sorted_missing_matches = missing_matches.sort_values(by='country')

# Display the sorted missing matches
print("Matches in ginf.csv not present in events.csv, sorted by country:")
print(sorted_missing_matches[['id_odsp', 'date', 'league', 'ht', 'at', 'country']])

In [None]:
# Convert 'date' column to datetime format
missing_matches['date'] = pd.to_datetime(missing_matches['date'])

# Extract the academic year (August of first year to May of second year)
def extract_academic_year(date):
    if date.month >= 8:
        return date.year
    else:
        return date.year - 1

missing_matches['academic_year'] = missing_matches['date'].apply(extract_academic_year)

# Count missing rows for each country and academic year
missing_counts = missing_matches.groupby(['country', 'academic_year']).size().reset_index(name='missing_count')

# Display the missing counts
print("Number of missing rows for each country and academic year:")
print(missing_counts)

In [4]:
# Merge ginf_data and events_data based on id_odsp
merged_data = pd.merge(ginf_data, events_data, on='id_odsp', how='inner')

# Convert 'date' column to datetime format
merged_data['date'] = pd.to_datetime(merged_data['date'])

# Extract the academic year (August of first year to May of second year)
def extract_academic_year(date):
    if date.month >= 8:
        return date.year
    else:
        return date.year - 1

merged_data['academic_year'] = merged_data['date'].apply(extract_academic_year)

# Get the number of distinct id_odsp grouped by country and academic year
distinct_id_counts = merged_data.groupby(['country', 'academic_year'])['id_odsp'].nunique().reset_index(name='distinct_id_count')

# Display the results
print("Number of distinct id_odsp linked with ginf.csv, grouped by country and academic year:")
print(distinct_id_counts)

Number of distinct id_odsp linked with ginf.csv, grouped by country and academic year:
    country  academic_year  distinct_id_count
0   england           2013                320
1   england           2014                380
2   england           2015                379
3   england           2016                220
4    france           2011                368
5    france           2012                373
6    france           2013                378
7    france           2014                380
8    france           2015                369
9    france           2016                208
10  germany           2011                294
11  germany           2012                305
12  germany           2013                269
13  germany           2014                290
14  germany           2015                297
15  germany           2016                153
16    italy           2011                362
17    italy           2012                379
18    italy           2013             

In [None]:
# Filter the merged dataset for the chosen country and league for the year 2015
chosen_country = "france"
chosen_year = 2015

# filtering
filtered_data = merged_data[(merged_data['country'] == chosen_country) & (merged_data['season'].astype(str).str.contains(str(chosen_year)))]

# Display the filtered data
print(filtered_data)


In [None]:
# Creating correaltion matrix
correlation_matrix = filtered_data.corr()

# Display the correlation matrix
print(correlation_matrix)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Plot correlation matrix heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Matrix Heatmap')
plt.show()


In [14]:


# Filter the dataset for the chosen country, league, and season
chosen_country = "france"
chosen_year = 2015
filtered_data = merged_data[(merged_data['country'] == chosen_country) & (merged_data['season'].astype(str).str.contains(str(chosen_year)))]

# Define the time intervals (every 3 minutes)
time_intervals = list(range(0, 91, 3))  # Assuming matches are 90 minutes long

# Define event type mappings
event_type_mapping = {
    2: 'Corner',
    3: 'Foul',
    4: 'Yellow card',
    5: 'Second yellow card',
    6: 'Red card',
    7: 'Substitution',
    8: 'Free kick won',
    9: 'Offside',
    10: 'Hand ball',
    11: 'Penalty conceded',
}

# Create an empty DataFrame to store the results
columns = ['match_id', 'time_interval', 'relative_team_strength', 'assists_home', 'assists_away', 'shots_home', 'shots_away', 'shots_on_target_home', 'shots_on_target_away']
columns += [f'{event}_home' for event in event_type_mapping.values()] + [f'{event}_away' for event in event_type_mapping.values()]
columns += ['Own goal_home', 'Own goal_away', 'fthg', 'ftag']  # Include full-time goals
result_df = pd.DataFrame(columns=columns)

# Loop through each match in the filtered dataset
for match_id, match_data in filtered_data.groupby('id_odsp'):
    # Get full-time goals for the current match
    full_time_home_goals = match_data['fthg'].iloc[0]
    full_time_away_goals = match_data['ftag'].iloc[0]

    # Initialize an empty DataFrame to store match data
    match_rows = []

    # Loop through each time interval
    for i in range(len(time_intervals) - 1):
        start_time = time_intervals[i]
        end_time = time_intervals[i + 1]
        interval_label = f"{start_time}-{end_time}"

        # Calculate relative team strength for the current match
        home_team = match_data['ht'].iloc[0]
        away_team = match_data['at'].iloc[0]
        home_strength = relative_strength_home.get(home_team, 0)
        away_strength = relative_strength_away.get(away_team, 0)
        relative_team_strength = home_strength - away_strength

        # Count number of assists for home and away teams in the current interval
        assists_home = len(match_data[(match_data['event_type2'] == 12) & (match_data['side'] == 1) & (match_data['time'] >= start_time) & (match_data['time'] < end_time)])
        assists_away = len(match_data[(match_data['event_type2'] == 12) & (match_data['side'] == 2) & (match_data['time'] >= start_time) & (match_data['time'] < end_time)])

        # Count number of shots for home and away teams in the current interval
        shots_home = len(match_data[(match_data['event_type'] == 1) & (match_data['side'] == 1) & (match_data['time'] >= start_time) & (match_data['time'] < end_time)])
        shots_away = len(match_data[(match_data['event_type'] == 1) & (match_data['side'] == 2) & (match_data['time'] >= start_time) & (match_data['time'] < end_time)])

        # Count number of shots on target for home and away teams in the current interval
        shots_on_target_home = len(match_data[(match_data['event_type'] == 10) & (match_data['side'] == 1) & (match_data['time'] >= start_time) & (match_data['time'] < end_time)])
        shots_on_target_away = len(match_data[(match_data['event_type'] == 10) & (match_data['side'] == 2) & (match_data['time'] >= start_time) & (match_data['time'] < end_time)])

        # Count events for home and away teams in the current interval
        events_home = {event: len(match_data[(match_data['event_type'] == code) & (match_data['side'] == 1) & (match_data['time'] >= start_time) & (match_data['time'] < end_time)]) for code, event in event_type_mapping.items()}
        events_away = {event: len(match_data[(match_data['event_type'] == code) & (match_data['side'] == 2) & (match_data['time'] >= start_time) & (match_data['time'] < end_time)]) for code, event in event_type_mapping.items()}

        # Count Own goals for home and away teams in the current interval
        own_goals_home = len(match_data[(match_data['event_type2'] == 15) & (match_data['side'] == 1) & (match_data['time'] >= start_time) & (match_data['time'] < end_time)])
        own_goals_away = len(match_data[(match_data['event_type2'] == 15) & (match_data['side'] == 2) & (match_data['time'] >= start_time) & (match_data['time'] < end_time)])

        # Append the match data to the list of match rows
        match_rows.append([match_id, interval_label, relative_team_strength, assists_home, assists_away, shots_home, shots_away, shots_on_target_home, shots_on_target_away] + list(events_home.values()) + list(events_away.values()) + [own_goals_home, own_goals_away, full_time_home_goals, full_time_away_goals])

    # Concatenate the match rows to the result DataFrame
    result_df = pd.concat([result_df, pd.DataFrame(match_rows, columns=result_df.columns)], ignore_index=True)

# Display the DataFrame
print(result_df)




        match_id time_interval  relative_team_strength assists_home  \
0      00QH2XdM/           0-3                0.182953            0   
1      00QH2XdM/           3-6                0.182953            1   
2      00QH2XdM/           6-9                0.182953            0   
3      00QH2XdM/          9-12                0.182953            0   
4      00QH2XdM/         12-15                0.182953            0   
...          ...           ...                     ...          ...   
11395  zwyvizbA/         75-78                0.130899            0   
11396  zwyvizbA/         78-81                0.130899            1   
11397  zwyvizbA/         81-84                0.130899            0   
11398  zwyvizbA/         84-87                0.130899            0   
11399  zwyvizbA/         87-90                0.130899            0   

      assists_away shots_home shots_away shots_on_target_home  \
0                0          0          0                    0   
1                

In [None]:
# Define a function to determine the winner of each match interval and create a binary target variable
def determine_winner_binary(row):
    if row['fthg'] > row['ftag']:
        return 1  # Home team wins
    elif row['fthg'] < row['ftag']:
        return -1  # Away team wins
    else:
        return 0  # Draw

# Apply the function to create the binary target variable
result_df['target_variable'] = result_df.apply(determine_winner_binary, axis=1)

# Splitting data into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(result_df.drop(columns=['match_id', 'target_variable']), result_df['target_variable'], test_size=0.55, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5455, random_state=42)


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Exclude the Match ID Column
X_train = X_train.iloc[:, 1:]
X_val = X_val.iloc[:, 1:]
X_test = X_test.iloc[:, 1:]

# Convert Interval Column to Float
def preprocess_interval_column(data):
    interval_values = data[:, 0].astype(str)
    start_values = []
    end_values = []
    for interval in interval_values:
        if '-' in interval:
            start, end = interval.split('-')
            if start.isnumeric() and end.isnumeric():
                start_values.append(float(start))
                end_values.append(float(end))
            else:
                start_values.append(np.nan)
                end_values.append(np.nan)
        else:
            if interval.isnumeric():
                start_values.append(float(interval))
                end_values.append(float(interval))
            else:
                start_values.append(np.nan)
                end_values.append(np.nan)

    start_values = np.nan_to_num(start_values)
    end_values = np.nan_to_num(end_values)
    interval_values = (start_values + end_values) / 2.0
    interval_values = np.round(interval_values).astype(float)
    data[:, 0] = interval_values
    return data

# Preprocess the data
def preprocess_data(data):
    # Apply interval preprocessing
    data = preprocess_interval_column(data)

    # Reshape the data
    data = np.expand_dims(data, axis=-1)  # Add a new axis for the LSTM input

    return data

# Preprocess the data
X_train_preprocessed = preprocess_data(X_train.values)
X_val_preprocessed = preprocess_data(X_val.values)
X_test_preprocessed = preprocess_data(X_test.values)

# Print the shapes of preprocessed data
print("Shape of X_train_preprocessed:", X_train_preprocessed.shape)
print("Shape of X_val_preprocessed:", X_val_preprocessed.shape)
print("Shape of X_test_preprocessed:", X_test_preprocessed.shape)


Shape of X_train_preprocessed: (5129, 31, 1)
Shape of X_val_preprocessed: (2850, 31, 1)
Shape of X_test_preprocessed: (3421, 31, 1)


In [None]:

# Convert the target variables to numpy arrays
y_train_preprocessed = y_train.values
y_val_preprocessed = y_val.values
y_test_preprocessed = y_test.values


In [None]:
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Convert input arrays to float32 data type
X_train_preprocessed = np.asarray(X_train_preprocessed).astype(np.float32)
y_train_preprocessed = np.asarray(y_train_preprocessed).astype(np.float32)
X_val_preprocessed = np.asarray(X_val_preprocessed).astype(np.float32)
y_val_preprocessed = np.asarray(y_val_preprocessed).astype(np.float32)
X_test_preprocessed = np.asarray(X_test_preprocessed).astype(np.float32)  # Convert test input data
y_test_preprocessed = np.asarray(y_test_preprocessed).astype(np.float32)  # Convert test target data

# Define the LSTM model architecture
model = Sequential([
    LSTM(units=64, input_shape=(X_train_preprocessed.shape[1], X_train_preprocessed.shape[2])),
    Dense(units=1)
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
history = model.fit(X_train_preprocessed, y_train_preprocessed, epochs=10, batch_size=32, validation_data=(X_val_preprocessed, y_val_preprocessed), verbose=1)

# Evaluate the model on the test data
loss = model.evaluate(X_test_preprocessed, y_test_preprocessed)
print("Test Loss:", loss)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.004214517306536436


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Predictions on validation set
y_val_pred_probs = model.predict(X_val_preprocessed)

# Adjust threshold
threshold = 0.5
y_val_pred = (y_val_pred_probs > threshold).astype(int)

# Flatten predictions to 1D array
y_val_pred = y_val_pred.flatten()

# Calculate evaluation metrics
accuracy = accuracy_score(y_val_preprocessed, y_val_pred)
precision = precision_score(y_val_preprocessed, y_val_pred, average='macro', zero_division=0)
recall = recall_score(y_val_preprocessed, y_val_pred, average='macro')
f1 = f1_score(y_val_preprocessed, y_val_pred, average='macro')

# Confusion Matrix
conf_matrix = confusion_matrix(y_val_preprocessed, y_val_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.7115789473684211
Precision: 0.4819060463025399
Recall: 0.6666666666666666
F1 Score: 0.5388681592039801
Confusion Matrix:
[[   0  822    0]
 [   0  661    0]
 [   0    0 1367]]


In [None]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from keras.models import Sequential
from keras.layers import LSTM, Dense



# Convert input arrays to float32 data type
X_train = np.asarray(X_train).astype(np.float32)
y_train = np.asarray(y_train).astype(np.float32)

# Define the LSTM model architecture
def create_model():
    model = Sequential([
        LSTM(units=64, input_shape=(X_train.shape[1], 1)),
        Dense(units=1)
    ])
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model


# Initialize KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Lists to store evaluation metrics for each fold
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
conf_matrices = []

# Perform cross-validation
for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    # Create and train the model on the current fold
    model = create_model()
    model.fit(X_train_fold, y_train_fold, epochs=10, batch_size=32, verbose=0)

    # Predict on the validation fold
    y_val_pred_probs = model.predict(X_val_fold)

    # Adjust threshold
    threshold = 0.5
    y_val_pred = (y_val_pred_probs > threshold).astype(int)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_val_fold, y_val_pred)
    precision = precision_score(y_val_fold, y_val_pred, average='macro', zero_division=0)
    recall = recall_score(y_val_fold, y_val_pred, average='macro')
    f1 = f1_score(y_val_fold, y_val_pred, average='macro')
    conf_matrix = confusion_matrix(y_val_fold, y_val_pred)

    # Append evaluation metrics to lists
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    conf_matrices.append(conf_matrix)

# Calculate mean and standard deviation of evaluation metrics
mean_accuracy = np.mean(accuracy_scores)
mean_precision = np.mean(precision_scores)
mean_recall = np.mean(recall_scores)
mean_f1 = np.mean(f1_scores)
mean_conf_matrix = np.mean(conf_matrices, axis=0)

std_accuracy = np.std(accuracy_scores)
std_precision = np.std(precision_scores)
std_recall = np.std(recall_scores)
std_f1 = np.std(f1_scores)

# Print mean evaluation metrics and mean confusion matrix
print("Mean Accuracy:", mean_accuracy)
print("Mean Precision:", mean_precision)
print("Mean Recall:", mean_recall)
print("Mean F1 Score:", mean_f1)
print("Mean Confusion Matrix:")
print(mean_conf_matrix)

# Print standard deviation of evaluation metrics
print("Standard Deviation of Accuracy:", std_accuracy)
print("Standard Deviation of Precision:", std_precision)
print("Standard Deviation of Recall:", std_recall)
print("Standard Deviation of F1 Score:", std_f1)


Mean Accuracy: 0.704813388484762
Mean Precision: 0.4790368511488229
Mean Recall: 0.6666666666666666
Mean F1 Score: 0.5360905160394231
Mean Confusion Matrix:
[[  0.  302.8   0. ]
 [  0.  235.    0. ]
 [  0.    0.  488. ]]
Standard Deviation of Accuracy: 0.010908989657652548
Standard Deviation of Precision: 0.002787760628284027
Standard Deviation of Recall: 0.0
Standard Deviation of F1 Score: 0.0026961569395212047


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Visualization of Evaluation Metrics
evaluation_metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
mean_scores = [mean_accuracy, mean_precision, mean_recall, mean_f1]
std_scores = [std_accuracy, std_precision, std_recall, std_f1]

plt.figure(figsize=(10, 6))
plt.bar(evaluation_metrics, mean_scores, yerr=std_scores, capsize=5, color='skyblue')
plt.title('Mean Evaluation Metrics with Standard Deviation')
plt.ylabel('Score')
plt.ylim(0, 1)
plt.show()

# Visualization of Mean Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(mean_conf_matrix, annot=True, cmap='Blues', fmt='g')
plt.title('Mean Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# Model Architecture Visualization
from tensorflow.keras.utils import plot_model

plot_model(model, to_file='model_architecture.png', show_shapes=True, show_layer_names=True)


In [5]:
import pandas as pd

# Filter the dataset for the chosen country
chosen_country = "france"
filtered_data = merged_data[merged_data['country'] == chosen_country]

# Define the time intervals (every 3 minutes)
time_intervals = list(range(0, 91, 3))  # Assuming matches are 90 minutes long

# Define event type mappings
event_type_mapping = {
    2: 'Corner',
    3: 'Foul',
    4: 'Yellow card',
    5: 'Second yellow card',
    6: 'Red card',
    7: 'Substitution',
    8: 'Free kick won',
    9: 'Offside',
    10: 'Hand ball',
    11: 'Penalty conceded',
}

# Create an empty DataFrame to store the results
columns = ['match_id', 'time_interval', 'relative_team_strength', 'assists_home', 'assists_away', 'shots_home', 'shots_away', 'shots_on_target_home', 'shots_on_target_away']
columns += [f'{event}_home' for event in event_type_mapping.values()] + [f'{event}_away' for event in event_type_mapping.values()]
columns += ['Own goal_home', 'Own goal_away', 'fthg', 'ftag']  # Include full-time goals
result_df = pd.DataFrame(columns=columns)

# Compute the relative team strength
# Count matches won and matches played for each team
matches_won_home = filtered_data[filtered_data['fthg'] > filtered_data['ftag']].groupby('ht').size()
matches_played_home = filtered_data.groupby('ht').size()
matches_won_away = filtered_data[filtered_data['ftag'] > filtered_data['fthg']].groupby('at').size()
matches_played_away = filtered_data.groupby('at').size()

# Calculate relative team strength for each team
relative_strength_home = matches_won_home / matches_played_home
relative_strength_away = matches_won_away / matches_played_away

# Loop through each match in the filtered dataset
for match_id, match_data in filtered_data.groupby('id_odsp'):
    # Get full-time goals for the current match
    full_time_home_goals = match_data['fthg'].iloc[0]
    full_time_away_goals = match_data['ftag'].iloc[0]

    # Initialize match rows list to store data for each match
    match_rows = []

    # Calculate relative team strength for the current match
    home_team = match_data['ht'].iloc[0]
    away_team = match_data['at'].iloc[0]
    home_strength = relative_strength_home.get(home_team, 0)
    away_strength = relative_strength_away.get(away_team, 0)
    relative_team_strength = home_strength - away_strength

    # Loop through each time interval
    for i in range(len(time_intervals) - 1):
        start_time = time_intervals[i]
        end_time = time_intervals[i + 1]
        interval_label = f"{start_time}-{end_time}"

        # Filter data for the current time interval
        interval_data = match_data[(match_data['time'] >= start_time) & (match_data['time'] < end_time)]

        # Count number of assists for home and away teams in the current interval
        assists_home = interval_data[(interval_data['event_type2'] == 12) & (interval_data['side'] == 1)].shape[0]
        assists_away = interval_data[(interval_data['event_type2'] == 12) & (interval_data['side'] == 2)].shape[0]

        # Count number of shots for home and away teams in the current interval
        shots_home = interval_data[(interval_data['event_type'] == 1) & (interval_data['side'] == 1)].shape[0]
        shots_away = interval_data[(interval_data['event_type'] == 1) & (interval_data['side'] == 2)].shape[0]

        # Count number of shots on target for home and away teams in the current interval
        shots_on_target_home = interval_data[(interval_data['event_type'] == 10) & (interval_data['side'] == 1)].shape[0]
        shots_on_target_away = interval_data[(interval_data['event_type'] == 10) & (interval_data['side'] == 2)].shape[0]

        # Count events for home and away teams in the current interval
        events_home = {event: interval_data[(interval_data['event_type'] == code) & (interval_data['side'] == 1)].shape[0] for code, event in event_type_mapping.items()}
        events_away = {event: interval_data[(interval_data['event_type'] == code) & (interval_data['side'] == 2)].shape[0] for code, event in event_type_mapping.items()}

        # Count Own goals for home and away teams in the current interval
        own_goals_home = interval_data[(interval_data['event_type2'] == 15) & (interval_data['side'] == 1)].shape[0]
        own_goals_away = interval_data[(interval_data['event_type2'] == 15) & (interval_data['side'] == 2)].shape[0]

        # Append the match data to the list of match rows
        match_rows.append([match_id, interval_label, relative_team_strength, assists_home, assists_away, shots_home, shots_away, shots_on_target_home, shots_on_target_away] + list(events_home.values()) + list(events_away.values()) + [own_goals_home, own_goals_away, full_time_home_goals, full_time_away_goals])

    # Concatenate the match rows to the result DataFrame
    result_df = pd.concat([result_df, pd.DataFrame(match_rows, columns=result_df.columns)], ignore_index=True)

# Display the DataFrame
print(result_df)


        match_id time_interval  relative_team_strength assists_home  \
0      00OX4xFp/           0-3                0.228206            1   
1      00OX4xFp/           3-6                0.228206            0   
2      00OX4xFp/           6-9                0.228206            0   
3      00OX4xFp/          9-12                0.228206            0   
4      00OX4xFp/         12-15                0.228206            1   
...          ...           ...                     ...          ...   
62275  zwyvizbA/         75-78                0.130899            0   
62276  zwyvizbA/         78-81                0.130899            1   
62277  zwyvizbA/         81-84                0.130899            0   
62278  zwyvizbA/         84-87                0.130899            0   
62279  zwyvizbA/         87-90                0.130899            0   

      assists_away shots_home shots_away shots_on_target_home  \
0                0          1          0                    0   
1                

In [6]:
# Define a function to determine the winner of each match interval and create a binary target variable
def determine_winner_binary(row):
    if row['fthg'] > row['ftag']:
        return 1  # Home team wins
    elif row['fthg'] < row['ftag']:
        return -1  # Away team wins
    else:
        return 0  # Draw

# Apply the function to create the binary target variable
result_df['target_variable'] = result_df.apply(determine_winner_binary, axis=1)

# Splitting data into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(result_df.drop(columns=['match_id', 'target_variable']), result_df['target_variable'], test_size=0.55, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5455, random_state=42)


In [7]:
import numpy as np
from sklearn.model_selection import train_test_split

# Exclude the Match ID Column
X_train = X_train.iloc[:, 1:]
X_val = X_val.iloc[:, 1:]
X_test = X_test.iloc[:, 1:]

# Convert Interval Column to Float
def preprocess_interval_column(data):
    interval_values = data[:, 0].astype(str)
    start_values = []
    end_values = []
    for interval in interval_values:
        if '-' in interval:
            start, end = interval.split('-')
            if start.isnumeric() and end.isnumeric():
                start_values.append(float(start))
                end_values.append(float(end))
            else:
                start_values.append(np.nan)
                end_values.append(np.nan)
        else:
            if interval.isnumeric():
                start_values.append(float(interval))
                end_values.append(float(interval))
            else:
                start_values.append(np.nan)
                end_values.append(np.nan)

    start_values = np.nan_to_num(start_values)
    end_values = np.nan_to_num(end_values)
    interval_values = (start_values + end_values) / 2.0
    interval_values = np.round(interval_values).astype(float)
    data[:, 0] = interval_values
    return data

# Preprocess the data
def preprocess_data(data):
    # Apply interval preprocessing
    data = preprocess_interval_column(data)

    # Reshape the data
    data = np.expand_dims(data, axis=-1)

    return data

# Preprocess the data
X_train_preprocessed = preprocess_data(X_train.values)
X_val_preprocessed = preprocess_data(X_val.values)
X_test_preprocessed = preprocess_data(X_test.values)

# Print the shapes of preprocessed data
print("Shape of X_train_preprocessed:", X_train_preprocessed.shape)
print("Shape of X_val_preprocessed:", X_val_preprocessed.shape)
print("Shape of X_test_preprocessed:", X_test_preprocessed.shape)


Shape of X_train_preprocessed: (28026, 31, 1)
Shape of X_val_preprocessed: (15568, 31, 1)
Shape of X_test_preprocessed: (18686, 31, 1)


In [8]:

# Convert the target variables to numpy arrays
y_train_preprocessed = y_train.values
y_val_preprocessed = y_val.values
y_test_preprocessed = y_test.values


In [8]:
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Convert input arrays to float32 data type
X_train_preprocessed = np.asarray(X_train_preprocessed).astype(np.float32)
y_train_preprocessed = np.asarray(y_train_preprocessed).astype(np.float32)
X_val_preprocessed = np.asarray(X_val_preprocessed).astype(np.float32)
y_val_preprocessed = np.asarray(y_val_preprocessed).astype(np.float32)
X_test_preprocessed = np.asarray(X_test_preprocessed).astype(np.float32)  # Convert test input data
y_test_preprocessed = np.asarray(y_test_preprocessed).astype(np.float32)  # Convert test target data

# Define the LSTM model architecture
model = Sequential([
    LSTM(units=64, input_shape=(X_train_preprocessed.shape[1], X_train_preprocessed.shape[2])),
    Dense(units=1)
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
history = model.fit(X_train_preprocessed, y_train_preprocessed, epochs=10, batch_size=32, validation_data=(X_val_preprocessed, y_val_preprocessed), verbose=1)

# Evaluate the model on the test data
loss = model.evaluate(X_test_preprocessed, y_test_preprocessed)
print("Test Loss:", loss)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 2.2880431060912088e-05


In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Predictions on validation set
y_val_pred_probs = model.predict(X_val_preprocessed)

# Adjust threshold
threshold = 0.5
y_val_pred = (y_val_pred_probs > threshold).astype(int)

# Flatten predictions to 1D array
y_val_pred = y_val_pred.flatten()

# Calculate evaluation metrics
accuracy = accuracy_score(y_val_preprocessed, y_val_pred)
precision = precision_score(y_val_preprocessed, y_val_pred, average='macro', zero_division=0)
recall = recall_score(y_val_preprocessed, y_val_pred, average='macro')
f1 = f1_score(y_val_preprocessed, y_val_pred, average='macro')

# Confusion Matrix
conf_matrix = confusion_matrix(y_val_preprocessed, y_val_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.7329779033915724
Precision: 0.5016081000595592
Recall: 0.6666666666666666
F1 Score: 0.5569803952610887
Confusion Matrix:
[[   0 4157    0]
 [   0 4238    0]
 [   0    0 7173]]


In [10]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from keras.models import Sequential
from keras.layers import LSTM, Dense



# Convert input arrays to float32 data type
X_train = np.asarray(X_train).astype(np.float32)
y_train = np.asarray(y_train).astype(np.float32)

# Define the LSTM model architecture
def create_model():
    model = Sequential([
        LSTM(units=64, input_shape=(X_train.shape[1], 1)),
        Dense(units=1)
    ])
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model


# Initialize KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Lists to store evaluation metrics for each fold
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
conf_matrices = []

# Perform cross-validation
for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    # Create and train the model on the current fold
    model = create_model()
    model.fit(X_train_fold, y_train_fold, epochs=10, batch_size=32, verbose=0)

    # Predict on the validation fold
    y_val_pred_probs = model.predict(X_val_fold)

    # Adjust threshold
    threshold = 0.5
    y_val_pred = (y_val_pred_probs > threshold).astype(int)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_val_fold, y_val_pred)
    precision = precision_score(y_val_fold, y_val_pred, average='macro', zero_division=0)
    recall = recall_score(y_val_fold, y_val_pred, average='macro')
    f1 = f1_score(y_val_fold, y_val_pred, average='macro')
    conf_matrix = confusion_matrix(y_val_fold, y_val_pred)

    # Append evaluation metrics to lists
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    conf_matrices.append(conf_matrix)

# Calculate mean and standard deviation of evaluation metrics
mean_accuracy = np.mean(accuracy_scores)
mean_precision = np.mean(precision_scores)
mean_recall = np.mean(recall_scores)
mean_f1 = np.mean(f1_scores)
mean_conf_matrix = np.mean(conf_matrices, axis=0)

std_accuracy = np.std(accuracy_scores)
std_precision = np.std(precision_scores)
std_recall = np.std(recall_scores)
std_f1 = np.std(f1_scores)

# Print mean evaluation metrics and mean confusion matrix
print("Mean Accuracy:", mean_accuracy)
print("Mean Precision:", mean_precision)
print("Mean Recall:", mean_recall)
print("Mean F1 Score:", mean_f1)
print("Mean Confusion Matrix:")
print(mean_conf_matrix)

# Print standard deviation of evaluation metrics
print("Standard Deviation of Accuracy:", std_accuracy)
print("Standard Deviation of Precision:", std_precision)
print("Standard Deviation of Recall:", std_recall)
print("Standard Deviation of F1 Score:", std_f1)


Mean Accuracy: 0.7298935924075232
Mean Precision: 0.5011150730396066
Mean Recall: 0.6666666666666666
Mean F1 Score: 0.5565395280187566
Mean Confusion Matrix:
[[   0.  1514.     0. ]
 [   0.  1534.4    0. ]
 [   0.     0.  2556.8]]
Standard Deviation of Accuracy: 0.0029731060209175183
Standard Deviation of Precision: 0.0016814053038209068
Standard Deviation of Recall: 0.0
Standard Deviation of F1 Score: 0.0014895854963022463


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Visualization of Evaluation Metrics
evaluation_metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
mean_scores = [mean_accuracy, mean_precision, mean_recall, mean_f1]
std_scores = [std_accuracy, std_precision, std_recall, std_f1]

plt.figure(figsize=(10, 6))
plt.bar(evaluation_metrics, mean_scores, yerr=std_scores, capsize=5, color='skyblue')
plt.title('Mean Evaluation Metrics with Standard Deviation')
plt.ylabel('Score')
plt.ylim(0, 1)
plt.show()

# Visualization of Mean Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(mean_conf_matrix, annot=True, cmap='Blues', fmt='g')
plt.title('Mean Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# Model Architecture Visualization
from tensorflow.keras.utils import plot_model

plot_model(model, to_file='model_architecture.png', show_shapes=True, show_layer_names=True)


In [None]:
# Bar plot for comparison of different event types
event_types = [f'{event}_home' for event in event_type_mapping.values()] + [f'{event}_away' for event in event_type_mapping.values()]
event_counts = result_df[event_types].sum()
plt.figure(figsize=(12, 6))
sns.barplot(x=event_counts.index, y=event_counts.values, palette='Set3')
plt.title('Comparison of Event Types')
plt.xlabel('Event Type')
plt.ylabel('Total Count')
plt.xticks(rotation=45)
plt.show()
