In [None]:
import sys
import os
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
from pathlib import Path

In [None]:
# Haversine Formula
from math import radians, cos, sin, asin, sqrt

def haversine(lat1, lon1, lat2, lon2, max_km=0.1):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.
    max_km : float or None, default 0.1
        Distances strictly greater than this many kilometers are reported
        as 0. Pass ``None`` to disable the cut-off and always get the real
        distance.
        NOTE(review): the cut-off presumably discards GPS jumps between
        consecutive samples -- confirm that 0.1 km suits the sampling rate.

    Returns
    -------
    float
        Distance in kilometers (Earth radius 6371 km), or 0 when the
        distance exceeds ``max_km``.
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1], which
    # would make sqrt()/asin() raise ValueError; clamp it to the valid range.
    a = min(1.0, max(0.0, a))
    c = 2 * asin(sqrt(a))

    # Radius of earth in kilometers is 6371
    km = 6371 * c

    if max_km is not None and km > max_km:
        km = 0
    return km


In [None]:
import math
def calculate_metabolic_power(df):
    """
    Mean metabolic power over the forward-accelerating samples of a session.

    Model implemented (energy cost of accelerated running on grass,
    in J * kg^-1 * m^-1):

        EC = fn(ES) * EM * KT
        fn(ES) = 155.4*ES^5 - 30.4*ES^4 - 43.3*ES^3 + 46.3*ES^2 + 19.5*ES + 3.6
        KT = 1.29   fixed terrain constant for the extra cost of a grass surface
        P  = EC * v metabolic power = energy cost times running speed

    where, for a forward acceleration ``af`` and g = 9.8:

        ES = tan(90deg - arctan(g / af)) = af / g    (equivalent slope)
        EM = sqrt(af^2 / g^2 + 1)                    (equivalent mass)

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``speed`` and ``inst_acc_impulse``.
        The frame is not modified.

    Returns
    -------
    float
        Mean of P over all rows with ``speed != 0`` and
        ``inst_acc_impulse > 0``. NaN when no row qualifies.
    """
    g = 9.8
    KT = 1.29

    # Stationary samples contribute no power; drop them first.
    moving = df[df['speed'] != 0]
    acc = moving['inst_acc_impulse'].to_numpy(dtype=float)
    speed = moving['speed'].to_numpy(dtype=float)

    # Only positive accelerations are modelled; NaN compares False and is skipped.
    forward = acc > 0
    if not forward.any():
        return float('nan')

    af = acc[forward]
    # tan(pi/2 - atan(g/af)) simplifies to af/g. Using the closed form avoids
    # mixing degrees and radians in the trigonometric calls.
    ES = af / g
    EM = np.sqrt(af ** 2 / g ** 2 + 1)
    EC = (155.4 * ES ** 5 - 30.4 * ES ** 4 - 43.3 * ES ** 3
          + 46.3 * ES ** 2 + 19.5 * ES + 3.6) * EM * KT
    P = EC * speed[forward]

    # Average over every qualifying sample, including the last row.
    return float(np.mean(P))


In [None]:
from datetime import datetime

def find_duration(time_start, time_end):
    """
    Return the elapsed time between two clock times as an ``HH:MM:SS`` string.

    Parameters
    ----------
    time_start, time_end : str or object whose ``str()`` is a clock time
        Accepted forms are ``HH:MM:SS.ffffff`` and ``HH:MM:SS``. Both values
        are converted with ``str()`` before parsing.

    Returns
    -------
    str
        Zero-padded ``HH:MM:SS``; fractional seconds are truncated. Only the
        time of day is parsed, so an end time earlier than the start time
        wraps around 24 hours.

    Raises
    ------
    ValueError
        If either value matches neither accepted format.

    Note: a later cell in this notebook defines another ``find_duration``
    that returns a ``timedelta``; whichever cell ran last wins.
    """
    def _parse(value):
        text = str(value)
        try:
            return datetime.strptime(text, '%H:%M:%S.%f')
        except ValueError:
            # Timestamps that fall on a whole second have no fractional part.
            return datetime.strptime(text, '%H:%M:%S')

    duration = _parse(time_end) - _parse(time_start)
    hours, remainder = divmod(duration.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02}:{minutes:02}:{seconds:02}"


In [None]:
def calc_sessions_opt(df, date, sessionId, playerName, teamName):
    """
    Summarise one tracking session.

    Parameters
    ----------
    df : pandas.DataFrame
        Session samples with the columns ``lat``, ``lon``, ``speed``,
        ``time`` and ``inst_acc_impulse`` (the last one is read by
        ``calculate_metabolic_power``). Rows are taken in frame order.
    date, sessionId, playerName, teamName
        Passed through unchanged into the result.

    Returns
    -------
    tuple
        ``(teamName, playerName, date, sessionId, duration, total_distance,
        average_speed, top_speed, metabolic_power)`` where

        * ``duration`` is the result of ``find_duration`` for the first and
          last ``time`` values, or 0 if it could not be computed;
        * ``total_distance`` is the sum of ``haversine`` over every pair of
          consecutive rows whose position changed;
        * ``average_speed`` / ``top_speed`` are the mean / maximum of
          ``speed`` capped at 9.5 (both 0 for an empty frame);
        * ``metabolic_power`` is the result of ``calculate_metabolic_power``.
    """
    speed_cap = 9.5

    # Positional arrays, so the result does not depend on the index labels.
    lats = df['lat'].to_numpy()
    lons = df['lon'].to_numpy()

    total_distance = 0
    for k in range(len(lats) - 1):
        lat1, lon1 = lats[k], lons[k]
        lat2, lon2 = lats[k + 1], lons[k + 1]
        # A change in either coordinate is movement (due north/south or
        # east/west travel changes only one of them).
        if lat1 != lat2 or lon1 != lon2:
            total_distance += haversine(lat1, lon1, lat2, lon2)

    capped_speed = df['speed'].clip(upper=speed_cap)
    if len(capped_speed) > 0:
        # Mean over every sample, so numerator and denominator use the same rows.
        average_speed = capped_speed.mean()
        top_speed = max(capped_speed.max(), 0)
    else:
        average_speed = 0
        top_speed = 0

    # Get mean metabolic power
    metabolic_power = calculate_metabolic_power(df)

    # Get session duration
    try:
        duration = find_duration(df.iloc[0]['time'], df.iloc[-1]['time'])
    except Exception as exc:
        # Best effort: an empty frame or an unparsable timestamp must not
        # abort the whole batch; report it and carry on with 0.
        duration = 0
        print('Error in duration calculation', playerName, date, repr(exc))

    return (teamName, playerName, date, sessionId, duration, total_distance,
            average_speed, top_speed, metabolic_power)




In [None]:
days = [f"{i:02}" for i in range(1, 32)]
months = [f"{i:02}" for i in range(1, 13)]
years = ['2020', '2021']
teams = ['TeamA', 'TeamB']

# Path to the SoccerMon Objective folder (set this before running the cell)
soccermon_path = ""

data_paths = [f"{soccermon_path}/{team}/{year}" for team in teams for year in years]

# Collect (day folder, parquet file) pairs. Expected layout:
#   <soccermon_path>/<team>/<year>/<year>-<month>/<year>-<month>-<day>/**/*.parquet
# Each data path already encodes its year, so every (team, year) is visited once.
session_files = []
for team in teams:
    for year in years:
        data_path = f"{soccermon_path}/{team}/{year}"
        for month in months:
            for day in days:
                day_folder = Path(os.path.join(data_path, f"{year}-{month}", f"{year}-{month}-{day}"))
                session_files.extend(
                    (day_folder, found) for found in day_folder.rglob("*.parquet")
                )
file_paths = [str(found) for _, found in session_files]


data_frames = []
for day_folder, parquet_file in session_files:
    parquet_path = str(parquet_file)
    print(parquet_path)

    df = pd.read_parquet(parquet_path, engine="auto", columns=["time", "lat", "lon", "speed", "inst_acc_impulse", "player_name", "hacc", "hdop", "signal_quality"])

    # Only keep every 10th row
    df = df.iloc[::10, :]

    # Filter out rows where lat = 0, lon = 0, hacc >= 3, hdop >= 10 or signal_quality <= 100
    df = df[(df['lat'] != 0) & (df['lon'] != 0) & (df['hacc'] < 3) & (df['hdop'] < 10) & (df['signal_quality'] > 100)]
    df = df.reset_index(drop=True)

    # Derive the identifiers from the path structure instead of fixed string
    # offsets, which only hold for one particular soccermon_path length.
    player_name = str(parquet_file.relative_to(day_folder)).replace(".parquet", "")
    # assumes file names start with the 5-character team name -- TODO confirm
    team_name = player_name[:5]
    date = day_folder.name
    session_id = os.path.join(date, player_name)

    vals = calc_sessions_opt(df, date, session_id, player_name, team_name)
    print(vals)

    data_frame = pd.DataFrame(data=[vals], columns=[ "Team_name", "Player_name", "Date", "Session_Id", "Duration", "Total_distance", "Average_running_speed", "Top_speed", "Metabolic_power"])

    data_frames.append(data_frame)

if data_frames:
    result = pd.concat(data_frames, ignore_index=True)
    session_csv = "Session.csv"
    # The file is appended to; write the header only when starting a new
    # file so re-runs do not inject header lines into the middle of the data.
    write_header = not os.path.exists(session_csv) or os.path.getsize(session_csv) == 0
    result.to_csv(session_csv, mode='a', index=False, header=write_header)




In [None]:
from datetime import datetime
def find_duration(time_start, time_end):
    """
    Return the elapsed time between two clock times as a ``timedelta``.

    This definition replaces the earlier ``find_duration`` in this notebook,
    which returns an ``HH:MM:SS`` string. Cells run after this one get the
    ``timedelta`` version.

    Parameters
    ----------
    time_start, time_end : str or object whose ``str()`` is a clock time
        Accepted forms are ``HH:MM:SS.ffffff`` and ``HH:MM:SS``. Both values
        are converted with ``str()`` before parsing.

    Returns
    -------
    datetime.timedelta
        ``time_end - time_start``. Only the time of day is parsed, so the
        result is negative when the end time is earlier than the start time.

    Raises
    ------
    ValueError
        If either value matches neither accepted format.
    """
    def _parse(value):
        text = str(value)
        try:
            return datetime.strptime(text, '%H:%M:%S.%f')
        except ValueError:
            # Timestamps that fall on a whole second have no fractional part.
            return datetime.strptime(text, '%H:%M:%S')

    return _parse(time_end) - _parse(time_start)

In [None]:
def calc_hir_opt(df, date, sessionId, playerName, teamName):
    """
    Extract high-intensity runs (HIR) from one session.

    A run is a maximal stretch of consecutive rows whose ``speed`` is above
    5.5. Every statistic of a run is computed from the rows of that stretch
    only, from its first row to its last row inclusive.

    Parameters
    ----------
    df : pandas.DataFrame
        Session samples with the columns ``speed``, ``time``, ``lat`` and
        ``lon``. Rows are taken in frame order.
    date, sessionId, playerName, teamName
        Passed through unchanged into every output row.

    Returns
    -------
    list of list
        One entry per run lasting at least one second:
        ``[date, sessionId, playerName, teamName, time_start, time_end,
        lat_start, lon_start, lat_end, lon_end, avg_speed, distance,
        top_speed, duration_seconds]``, where ``distance`` is the summed
        ``haversine`` result multiplied by 1000.

    Relies on ``haversine`` and on the ``find_duration`` variant that
    returns a ``timedelta``.
    """
    hir_threshold = 5.5

    # Positional arrays, so the result does not depend on the index labels.
    speeds = df['speed'].to_numpy()
    lats = df['lat'].to_numpy()
    lons = df['lon'].to_numpy()
    times = df['time']
    n_rows = len(speeds)

    hir_arr = []
    i = 0
    while i < n_rows:
        if not speeds[i] > hir_threshold:
            i += 1
            continue

        # Extend the run while the next sample is still above the threshold.
        start = i
        end = i
        while end + 1 < n_rows and speeds[end + 1] > hir_threshold:
            end += 1

        tot_dist = 0
        for k in range(start, end):
            tot_dist += haversine(lats[k + 1], lons[k + 1], lats[k], lons[k])

        run_speeds = speeds[start:end + 1]
        avg_speed = run_speeds.mean()
        top_speed = run_speeds.max()

        time_start = times.iloc[start]
        time_end = times.iloc[end]
        duration = find_duration(time_start, time_end)

        # total_seconds() rather than .seconds: the latter ignores the sign
        # and the day component of the timedelta.
        if duration.total_seconds() >= 1:
            temp = [date, sessionId, playerName, teamName, time_start, time_end,
                    lats[start], lons[start], lats[end], lons[end],
                    avg_speed, tot_dist*1000, top_speed, duration.total_seconds()]
            hir_arr.append(temp)

        # Resume scanning after this run so its rows are never revisited.
        i = end + 1

    return hir_arr

In [None]:
""" Store all HIR """

days = [f"{i:02}" for i in range(1, 32)]
months = [f"{i:02}" for i in range(1, 13)]
years = ['2020', '2021']
teams = ['TeamA', 'TeamB']

# Path to the SoccerMon Objective folder (set this before running the cell)
soccermon_path = ""

data_paths = [f"{soccermon_path}/{team}/{year}" for team in teams for year in years]

# Collect (day folder, parquet file) pairs. Expected layout:
#   <soccermon_path>/<team>/<year>/<year>-<month>/<year>-<month>-<day>/**/*.parquet
# Each data path already encodes its year, so every (team, year) is visited once.
session_files = []
for team in teams:
    for year in years:
        data_path = f"{soccermon_path}/{team}/{year}"
        for month in months:
            for day in days:
                day_folder = Path(os.path.join(data_path, f"{year}-{month}", f"{year}-{month}-{day}"))
                session_files.extend(
                    (day_folder, found) for found in day_folder.rglob("*.parquet")
                )
file_paths = [str(found) for _, found in session_files]


data_frames = []
for day_folder, parquet_file in session_files:
    parquet_path = str(parquet_file)
    print(parquet_path)

    df = pd.read_parquet(parquet_path, engine="auto", columns=["time", "lat", "lon", "speed", "inst_acc_impulse", "player_name", "hacc", "hdop", "signal_quality"])

    # Only keep every 10th row; copy so the column assignment below works on
    # an independent frame instead of a slice of the loaded one.
    df = df.iloc[::10, :].copy()

    # Cap speed at 9.5
    df['speed'] = df['speed'].clip(upper=9.5)

    # Keep only rows with lat != 0, lon != 0, hacc < 3, hdop < 3, speed > 5 and signal_quality > 200
    # NOTE(review): dropping rows with speed <= 5 before run detection makes
    # the remaining rows non-contiguous in time, so two runs separated only by
    # removed rows look like one run to calc_hir_opt -- confirm this is intended.
    df = df[(df['lat'] != 0) & (df['lon'] != 0) & (df['hacc'] < 3) & (df['hdop'] < 3) & (df['speed'] > 5) & (df['signal_quality'] > 200)]
    df = df.reset_index(drop=True)

    # Derive the identifiers from the path structure instead of fixed string
    # offsets, which only hold for one particular soccermon_path length.
    player_name = str(parquet_file.relative_to(day_folder)).replace(".parquet", "")
    # assumes file names start with the 5-character team name -- TODO confirm
    team_name = player_name[:5]
    date = day_folder.name
    session_id = os.path.join(date, player_name)

    data_frame = pd.DataFrame(data=calc_hir_opt(df, date, session_id, player_name, team_name), columns=['Date', 'Session_Id', 'Player_name', 'Team_name', 'Start_time', 'End_time', 'Lat_start', 'Lon_start', 'Lat_end', 'Lon_end', 'Average_speed', 'Total_distance', 'Top_speed', 'Duration'])
    print(data_frame)
    data_frames.append(data_frame)

if data_frames:
    result = pd.concat(data_frames, ignore_index=True)
    result.to_csv("HIR_test.csv", index=False)
