# ⚽ **HOL: Eredivisie Prediction** 🥇
### Notebook - Prediction Time! - 4/4

---

### What We'll Do:
1. **Data Ingestion**: Fetch Eredivisie data from the GitHub repository.
2. **Data Transformation**: Utilize Snowpark DataFrames for data preparation and analysis.
3. **Model Training**: Train a model and store it in the Snowflake Model Registry.
4. -> **Prediction**: Predict who is going to win the Eredivisie 2024/2025.

![image](https://i.makeagif.com/media/1-14-2021/VPPRyU.gif)

### Setup

Make sure the following packages are imported:
- `snowflake-ml-python`
- `snowflake-snowpark-python`

## Step 4: Inference

### 1. Show Fixture Data

- **Objective:** Provide an overview of the fixture data for the Eredivisie 2024/2025

### 2. Prep Data for Prediction

- **Objective:** Prepare the data for predictive modeling.

- **Data Preprocessing**: Clean and transform the fixture data as necessary for prediction tasks.

### 3. Predict Results

- **Objective:** Utilize predictive models to forecast match outcomes.

### 4. Show Results

- **Objective:** Present the predicted results of the Eredivisie 2024/2025 matches.

- **Display Predicted Outcomes**: Showcase the forecasted results of each match and final ranking.

In [None]:
import snowflake.snowpark
from snowflake.snowpark.session import Session
from snowflake.snowpark import Window
from snowflake.snowpark import functions as F
from snowflake.snowpark.functions import udf, udtf
from snowflake.snowpark.types import IntegerType, FloatType, StringType, StructField, StructType, DateType
    
import pandas as pd
import numpy as np

import streamlit as st

import warnings
warnings.filterwarnings('ignore')

In [None]:
from snowflake.snowpark.context import get_active_session

# Grab the session object that later cells use for all table access and SQL.
session = get_active_session()

# Ask Snowflake for the current user; the first column of the first row
# holds the name, which is later used to build a per-user table name.
_current_user_rows = session.sql('select current_user()').collect()
user_name = _current_user_rows[0][0]

In [None]:
# Load the feature table that is later joined onto the fixtures for model
# inference, and preview its first 50 rows in the notebook.
df_features = session.table('eredivisie_features')
_features_preview = df_features.limit(50)
st.dataframe(_features_preview)

In [None]:
# Read the original fixture table and snapshot it into a per-user table
# (overwritten on every run), keeping the DataFrame around as the working
# copy that later cells transform.
df_fixture_copy = session.table('eredivisie_fixture')

_fixture_snapshot_table = f'fixture_{user_name}'
df_fixture_copy.write.save_as_table(_fixture_snapshot_table, mode='overwrite')

# Preview the first 50 fixtures.
st.dataframe(df_fixture_copy.limit(50))

In [None]:
# Attach historical feature values to every fixture using two ASOF joins,
# then print row counts before and after as a sanity check.

# Normalise both DATE columns to a date type so they can be compared.
# The fixture DATE is parsed with the explicit 'DD/MM/YYYY HH24:MI' format;
# the feature DATE goes through to_date without a format argument.
df_fixture_copy = df_fixture_copy.with_column("DATE", F.to_date(F.col("DATE"), "DD/MM/YYYY HH24:MI"))
df_features_new = df_features.with_column("DATE", F.to_date(df_features['DATE']))

# First ASOF join: match each fixture to a feature row of the same home team
# whose DATE is on or before the fixture DATE (presumably the most recent such
# row — see Snowflake ASOF JOIN semantics).
result_df = df_fixture_copy.join(
    df_features_new,
    on=(df_fixture_copy["FIX_HOMETEAM"] == df_features_new["HOMETEAM"]),
    how="asof",
    match_condition=(df_fixture_copy["DATE"] >= df_features_new["DATE"])
).select(
    df_fixture_copy["MATCHNUMBER"],
    df_fixture_copy["ROUNDNUMBER"],
    df_fixture_copy["DATE"].as_("DATE"),
    df_fixture_copy["LOCATION"],
    df_fixture_copy["FIX_HOMETEAM"],
    df_fixture_copy["FIX_AWAYTEAM"],
    df_fixture_copy["RESULT"],
    # Home-team feature columns.
    # NOTE(review): these are referenced through df_features although the join
    # is against df_features_new — confirm this resolves to the joined side.
    df_features["HOME_WIN_PERCENTAGE_LAST34"],
    df_features["HOME_GOALS_FOR_LAST34"],
    df_features["HOME_GOALS_AGAINST_LAST34"])
# NOTE(review): DATE in df_features_new was already converted with to_date
# above, so this second conversion looks redundant — confirm before removing.
df_features_new = df_features_new.withColumn("DATE", F.to_date(F.col("DATE")))

# Second ASOF join: match on BOTH home and away team (head-to-head), again
# taking a feature row dated on or before the fixture.
# NOTE(review): the select below takes every feature column from
# df_features_new, so the three feature columns selected into result_df by the
# first join are not carried into result_df_new — confirm this is intended.
result_df_new = result_df.join(
    df_features_new,
    on=(result_df["FIX_HOMETEAM"] 
        == df_features_new["HOMETEAM"]) & (result_df["FIX_AWAYTEAM"] == df_features_new["AWAYTEAM"]),
    how="asof",
    match_condition=(result_df["DATE"] >= df_features_new["DATE"])
).select(
    result_df["MATCHNUMBER"],
    result_df["ROUNDNUMBER"],
    result_df["DATE"].as_("DATE"),
    result_df["LOCATION"],
    result_df["FIX_HOMETEAM"],
    result_df["FIX_AWAYTEAM"],
    result_df["RESULT"],
    df_features_new["HOME_WINS_LAST34"].as_("HOME_WINS_LAST34"),
    df_features_new["HOME_GOALS_AGAINST_LAST34"].as_("HOME_GOALS_AGAINST_LAST34"),
    df_features_new["HOME_WIN_PERCENTAGE_LAST34"].as_("HOME_WIN_PERCENTAGE_LAST34"),
    df_features_new["H2H_HOME_LOSSES_LAST10"],
    df_features_new["H2H_HOME_WINS_LAST10"])


# Sanity check: compare the fixture row count with the row count after joining.
print(f"total row before joins: {df_fixture_copy.count()}")
print(f"total row after joins: {result_df_new.count()}")
# Debug helpers, kept disabled. The first two filter on *_LAST5 columns, which
# are not among the columns selected above.
#st.dataframe(result_df_new.filter(F.col("HOME_WIN_PERCENTAGE_LAST5").isNull()))
#st.dataframe(result_df_new.filter(F.col("H2H_HOME_WINS_LAST5").isNull())) 
# Some match-ups have never been played before, so they have no H2H history.
#df_features_new.filter((F.col("HOMETEAM") == "Almere City FC") & (F.col("AWAYTEAM") == "FC Groningen")).show()

In [None]:
# Score every fixture of the season with the registered model, derive a
# predicted result label, and persist the outcome.
from snowflake.ml.registry import Registry
from snowflake.snowpark.functions import col, when, greatest
from snowflake.snowpark.types import IntegerType, DecimalType, DoubleType, LongType

# Fetch the default version of the model stored in the registry.
reg = Registry(session=session)
mv = reg.get_model("EREDIVISIE_PREDICT").default

# Collect the names of all numeric columns (integer, long, decimal, double).
# Only these get a placeholder for missing values; fillna may emit warnings
# that do not stop execution.
_NUMERIC_TYPES = (IntegerType, DecimalType, DoubleType, LongType)
numeric_columns = [
    field.name
    for field in result_df_new.schema
    if isinstance(field.datatype, _NUMERIC_TYPES)
]

# Replace missing numeric values with -1 before scoring.
result_df_filled = result_df_new.fillna(-1, subset=numeric_columns)

# Ask the model for per-class probabilities.
prediction = mv.run(result_df_filled, function_name="predict_proba")

# Label each match by its most probable class: PREDICT_PROBA_1 -> 'home',
# PREDICT_PROBA_2 -> 'away', anything else -> 'draw'. The checks run in that
# order, so a tie involving PREDICT_PROBA_1 resolves to 'home'.
_highest_proba = greatest(
    col('PREDICT_PROBA_0'),
    col('PREDICT_PROBA_1'),
    col('PREDICT_PROBA_2'),
)
_predicted_result = (
    when(_highest_proba == col('PREDICT_PROBA_1'), 'home')
    .when(_highest_proba == col('PREDICT_PROBA_2'), 'away')
    .otherwise('draw')
)
prediction = prediction.with_column('RESULT', _predicted_result)

# Persist the predictions (replacing earlier runs) and show the stored table.
prediction.write.save_as_table('eredivisie_predictions', mode='overwrite')

st.dataframe(session.table('eredivisie_predictions'))

In [None]:
# It's time to see who is gonna win the Eredivisie!
# Turn the per-match predictions into the final league table of the season.
#
# Losses are counted per match and aggregated exactly like wins and draws,
# instead of being derived from a hard-coded number of matches per team. The
# table therefore stays correct if a team has more or fewer predicted matches.

from snowflake.snowpark.functions import col, when, greatest

# Load the predictions table
predictions_df = session.table('eredivisie_predictions')

# Conditions on the predicted outcome, reused for several derived columns.
_home_win = col('RESULT') == 'home'
_away_win = col('RESULT') == 'away'
_draw = col('RESULT') == 'draw'

# Per-match points (3 for a win, 1 for a draw, 0 otherwise) plus win / draw /
# loss indicators for both sides. A loss is any match that is neither a win
# nor a draw for that side.
predictions_df = predictions_df.with_column(
    'HOME_POINTS',
    when(_home_win, 3).when(_draw, 1).otherwise(0)
).with_column(
    'AWAY_POINTS',
    when(_away_win, 3).when(_draw, 1).otherwise(0)
).with_column(
    'HOME_WINS',
    when(_home_win, 1).otherwise(0)
).with_column(
    'AWAY_WINS',
    when(_away_win, 1).otherwise(0)
).with_column(
    'HOME_DRAWS',
    when(_draw, 1).otherwise(0)
).with_column(
    'AWAY_DRAWS',
    when(_draw, 1).otherwise(0)
).with_column(
    'HOME_LOSSES',
    when(_home_win | _draw, 0).otherwise(1)
).with_column(
    'AWAY_LOSSES',
    when(_away_win | _draw, 0).otherwise(1)
)

# Totals for matches played at home. Both sub-tables end up with the same
# column names and order so they can be stacked with union_all.
home_ranking_df = predictions_df.group_by('FIX_HOMETEAM').agg({
    'HOME_POINTS': 'sum',
    'HOME_WINS': 'sum',
    'HOME_DRAWS': 'sum',
    'HOME_LOSSES': 'sum'
}).with_column_renamed('SUM(HOME_POINTS)', 'TOTAL_POINTS').with_column_renamed('SUM(HOME_WINS)', 'WINS').with_column_renamed('SUM(HOME_DRAWS)', 'DRAWS').with_column_renamed('SUM(HOME_LOSSES)', 'LOSSES').with_column_renamed('FIX_HOMETEAM', 'TEAM')

# Totals for matches played away.
away_ranking_df = predictions_df.group_by('FIX_AWAYTEAM').agg({
    'AWAY_POINTS': 'sum',
    'AWAY_WINS': 'sum',
    'AWAY_DRAWS': 'sum',
    'AWAY_LOSSES': 'sum'
}).with_column_renamed('SUM(AWAY_POINTS)', 'TOTAL_POINTS').with_column_renamed('SUM(AWAY_WINS)', 'WINS').with_column_renamed('SUM(AWAY_DRAWS)', 'DRAWS').with_column_renamed('SUM(AWAY_LOSSES)', 'LOSSES').with_column_renamed('FIX_AWAYTEAM', 'TEAM')

# Combine home and away rankings
ranking_df = home_ranking_df.union_all(away_ranking_df)

# Final ranking: sum points, wins, draws and losses per team.
final_ranking_df = ranking_df.group_by('TEAM').agg({
    'TOTAL_POINTS': 'sum',
    'WINS': 'sum',
    'DRAWS': 'sum',
    'LOSSES': 'sum'
}).with_column_renamed('SUM(TOTAL_POINTS)', 'FINAL_POINTS').with_column_renamed('SUM(WINS)', 'TOTAL_WINS').with_column_renamed('SUM(DRAWS)', 'TOTAL_DRAWS').with_column_renamed('SUM(LOSSES)', 'LOSSES')

# Sort the teams by their final points, breaking ties on wins.
final_ranking_df = final_ranking_df.sort(col('FINAL_POINTS').desc(), col('TOTAL_WINS').desc())

# Show the final ranking
st.title("The winner is...")
st.dataframe(final_ranking_df)

![image](https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgnz4WWG3t82n2IWLd8nKKQtM2FhWDbQwVlopTO7YG8WahLDoX8AtnE6reEwCkxdVkAQ8t5dtzQ_znX1sx9bru6U0sjLmnXIuv6hqz_aUPEeYwgJs7kDRL-hzRmBXN8NJjUyFL9wxIr2E4/s0-rw/PSV_Eindhoven_flag.gif)

# Final Summary

Let's recap what we covered:

1) Ingested our raw data
2) Performed various steps of data transformation using Snowpark
3) Used **Snowpark ML** to run **Hyperparameter Tuning**, **Model Training** and stored our model in the **Snowflake Model Registry**
4) Registered various Python UDTFs and Stored Procedures for code reusability, to streamline our inference pipeline
5) Used our model to predict all Eredivisie matches and build a forecasted final ranking.

# What does Cortex say? ⚽

This isn't the right use of LLMs but for fun, let's see what [Cortex](https://docs.snowflake.com/user-guide/snowflake-cortex/llm-functions) has to say about the Eredivisie 2024/2025 outcome and compare to our prediction.

#### Note 
_The following models are only [supported](https://docs.snowflake.com/user-guide/snowflake-cortex/llm-functions#availability) in certain CSP regions at present so you will need to run this in a compatible setup._

In [None]:
# Ask several Cortex LLMs the same few-shot question and collect their answers
# in a small DataFrame for a side-by-side comparison.
prompt = """
    Please respond to the following type of question with a single word along with the country flag emoji, naming a country. 
    The answer should be consider a classification response, for example:

    Question: Based on their performances up until 2022, which country was most likely to win the Serie A 2023?
    Response:Intern 🏴󠁧󠁢󠁥󠁮󠁧󠁿

    Question: Based on their performances up until 2020, which country was most likely to win the Ligue 1 2023?
    Response:PSG 🇫🇷

    Now, based on historical performances and achievements in international and club football up to 2024,
    factoring in international players, and the location of the event, which club is most likely to win Eredivisie 2024/2025?
    
    Rules:
    1) Do not include any additional text or additional spaces before or after the country name. 
    2) Please strip out any whitespace before and after the answer.
"""


def _cortex_complete(model_name, prompt_text):
    """Return the completion text of one Cortex model for one prompt.

    Model name and prompt are passed as bind parameters rather than being
    spliced into the SQL text, so quotes in the prompt cannot break the query.
    """
    rows = session.sql(
        "select snowflake.cortex.complete(?, ?)",
        params=[model_name, prompt_text],
    ).collect()
    return rows[0][0]


# One call per model. Each variable is queried with the model it is named
# after: llama_3_8b previously queried 'llama3-70b', and 'mixtral-8x7b' was
# called twice.
arctic = _cortex_complete('snowflake-arctic', prompt)
llama_3_8b = _cortex_complete('llama3-8b', prompt)
llama_3_70b = _cortex_complete('llama3-70b', prompt)
mistral_large = _cortex_complete('mistral-large', prompt)
mixtral_8_7b = _cortex_complete('mixtral-8x7b', prompt)
reka_flash = _cortex_complete('reka-flash', prompt)

# Strip surrounding whitespace from every answer and tabulate model vs answer.
df = session.create_dataframe(
    [
        ('snowflake-arctic', arctic.strip()),
        ('llama3-8b', llama_3_8b.strip()),
        ('llama3-70b', llama_3_70b.strip()),
        ('mistral-large', mistral_large.strip()),
        ('mixtral-8x7b', mixtral_8_7b.strip()),
        ('reka-flash', reka_flash.strip()),
    ],
    schema=["model", "prediction"])

# Last expression of the cell: the notebook renders the DataFrame.
df

# Getting Interactive ⚽

You can also include Streamlit Controls to make your notebook interactive:

In [None]:
# Interactive cell: let the user edit the prompt, pick a Cortex model and
# request a completion with a button.
prompt = """Based on historical performances and achievements in international and club football up to 2024, factoring in international players, and the location of the event, which club is most likely to win Eredivisie 2024/2025? Include historical references where possible, and call out specific players."""


st.subheader("Inputs:")

with st.container():
    with st.expander("Edit prompt and select LLM", expanded=True):

        # Free-text prompt, pre-filled with the default question above.
        new_prompt = st.text_area(label='Prompt:',value=prompt,height=100, label_visibility='collapsed')

        # Always append the style instruction to whatever the user typed.
        new_prompt += '. Answer in the style of a socccer journalist.'

        left_col,right_col = st.columns(2)

        with left_col:
            model_select = st.selectbox(
                'Select your Cortex Model:',
                ('snowflake-arctic','llama3-8b','llama3-70b','mistral-large','mixtral-8x7b','reka-flash'),
                label_visibility='collapsed'
            )

        with right_col:
            button = st.button('⚽ Predict! ⚽',type="primary")

with st.container():
    if button:
        # The prompt is user-editable, so it is sent as a bind parameter
        # instead of being interpolated into the SQL text. An apostrophe in
        # the prompt would otherwise break or alter the statement.
        sql = "select snowflake.cortex.complete(?, ?)"

        with st.spinner("In progress..."):
            response = session.sql(sql, params=[model_select, new_prompt]).collect()[0][0]
            st.subheader("Response:")
            st.write(response)