In [28]:
from settings import PROCESSED_DATA_DIR
import pandas as pd
from pathlib import Path
from reduce_memory_usage import reduce_memory_usage
from sklearn.metrics import precision_score
import numpy as np

In [29]:
# Load the processed match data, shrink its memory footprint, parse the match
# date into a real datetime and order the rows chronologically. The original
# row index is kept as-is by the sort.
df = (
    reduce_memory_usage(pd.read_csv(Path(PROCESSED_DATA_DIR) / 'bundesliga_matches.csv'))
    .assign(info_date=lambda frame: pd.to_datetime(frame['info_date']))
    .sort_values('info_date')
)

Memory usage of dataframe is 4.627826690673828 MB
Memory usage of dataframe after reduction 1.2287025451660156 MB
Reduced by 73.44968540757709 % 


In [30]:
def predict(data, cutoff_date, model):
    """
    Fit a classifier on the rows before a cutoff date and predict the remaining rows.

    Rows whose ``info_date`` is strictly earlier than ``cutoff_date`` form the
    training set; rows on or after it form the test set. Every column whose
    name starts with ``feat_`` is used as a predictor and ``target`` is the label.

    :param data: the dataframe to use; needs ``info_date``, ``target`` and the
        ``feat_*`` predictor columns. It is not modified.
    :param cutoff_date: the date to use for splitting the data into train and test sets
    :param model: an estimator exposing ``fit(X, y)`` and ``predict(X)``
        (e.g. a random forest classifier); it is fitted in place
    :return: combined: a dataframe containing the actual and predicted values
        (columns ``actual`` and ``prediction``), indexed like the test rows of
        ``data``. It is empty when no row falls on or after ``cutoff_date``.
    """
    # Pick the predictor columns.
    predictors = [c for c in data.columns if c.startswith('feat_')]

    # Boolean indexing yields new frames, so no defensive copy of `data` is needed.
    train_set = data[data['info_date'] < cutoff_date]
    test_set = data[data['info_date'] >= cutoff_date]

    # Fit (train) the model.
    model.fit(train_set[predictors], train_set['target'])

    # Make predictions on the test dataset. With nothing to predict, skip the
    # call: scikit-learn estimators raise on zero-sample input, and an empty
    # result frame is more useful to the caller than that error.
    predictions = model.predict(test_set[predictors]) if len(test_set) else []

    # Create a dataframe containing the actual and predicted values, aligned
    # on the test rows' index so it can be joined back onto `data`.
    combined = pd.DataFrame(
        dict(actual=test_set['target'], prediction=predictions),
        index=test_set.index
    )

    return combined

In [31]:
from sklearn.ensemble import RandomForestClassifier

# Build the random forest classifier with a fixed seed so that repeated runs
# give the same forest, then train it on matches before the cutoff date and
# predict the matches on or after it.
model = RandomForestClassifier(
    random_state=1,
    min_samples_leaf=10,
    max_depth=5,
    n_estimators=100,
)
combined_df = predict(data=df, cutoff_date='2023-12-07', model=model)

In [36]:
# Attach the actual/predicted values to their source rows through the shared
# row index (inner join, so only rows that received a prediction remain).
temp_df = pd.merge(df.copy(), combined_df, left_index=True, right_index=True)
# Self-join so that each row also carries the row where this team appears as
# the opponent on the same date: `_x` columns come from the left side, `_y`
# columns from the matched right side.
temp_df = pd.merge(
    temp_df,
    temp_df,
    left_on=['info_date', 'info_team'],
    right_on=['info_date', 'info_opponent'],
)
# Keep only the rows where the left side is predicted 1 and the right side 0.
precision_df = temp_df.loc[
    temp_df['prediction_x'].eq(1) & temp_df['prediction_y'].eq(0)
]
precision_df[['info_match_id_x', 'prediction_x', 'info_venue_x']]

Unnamed: 0,info_match_id_x,prediction_x,info_venue_x
5,2023-12-09_Dortmund_RB Leipzig,1,Away


In [49]:
from settings import ROOT_DIR
import codecs
import datetime
from jinja2 import Template

# Load the template. The encoding is given explicitly so the read does not
# depend on the platform default and matches the UTF-8 used for the report.
# NOTE(review): assumes template.md is stored as UTF-8 — confirm.
with open(Path(ROOT_DIR, 'notebooks/template.md'), encoding='utf-8') as f:
    template = Template(f.read())

# Render the report for today's date with one record per selected match.
rendered_file = template.render(
    date_today=datetime.datetime.now().strftime('%Y-%m-%d'),
    matches=precision_df[['info_team_x', 'info_opponent_x', 'info_venue_x', 'info_date']].to_dict('records')
)

# Write the report inside a context manager so the handle is closed even if
# the write fails. newline='' disables newline translation, which keeps the
# bytes on disk identical to what codecs.open(..., "w", "utf-8") produced.
with open("report.md", "w", encoding="utf-8", newline="") as output_file:
    output_file.write(rendered_file)