In [22]:
import pandas as pd
import re

# Read the CSV export at `file_path` into the DataFrame `df`
file_path = 'markstatsbot_lobstr - markrstats_data.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Function to extract relevant information from the 'content' column
def extract_game_data(content):
    """Parse one text entry into a flat dict of teams, score and match stats.

    Parameters
    ----------
    content : str
        Text expected to contain a score line of the form
        ``<home team> <home score> : <away score> <away team>`` and any number
        of ``<stat name>: <home value> - <away value>`` lines.
        Non-string values (e.g. the float NaN pandas uses for an empty CSV
        cell) are accepted and yield an empty dict.

    Returns
    -------
    dict
        ``'Home Team'``, ``'Home Score'``, ``'Away Score'``, ``'Away Team'``
        when the score line is found, plus ``'Home <stat>'`` / ``'Away <stat>'``
        float pairs for every stat pattern that matched. Anything that was not
        found is simply absent from the dict.
    """
    game_data = {}

    # re.search raises TypeError on non-strings; empty CSV cells arrive as NaN.
    if not isinstance(content, str):
        return game_data

    # Extract teams and score.
    # [^\W\d_] matches any Unicode letter, so accented team names are captured
    # whole instead of being cut at the first non-ASCII character.
    match = re.search(
        r'((?:[^\W\d_]| )+)\s(\d+)\s:\s(\d+)\s((?:[^\W\d_]| )+)',
        content,
    )
    if match:
        game_data['Home Team'] = match.group(1).strip()
        game_data['Home Score'] = int(match.group(2))
        game_data['Away Score'] = int(match.group(3))
        game_data['Away Team'] = match.group(4).strip()

    # Extract stats using regex
    stats_patterns = {
        'xG': r'xG:\s([\d\.]+)\s-\s([\d\.]+)',
        'Pass directness': r'Pass directness:\s([\d\.]+)\s-\s([\d\.]+)',
        'Penbox shots': r'Penbox shots:\s(\d+)\s-\s(\d+)',
        'xThreat': r'xThreat:\s([\d\.]+)\s-\s([\d\.]+)',
        'Possession': r'Possession:\s([\d\.]+)%\s-\s([\d\.]+)%',
        'Field Tilt': r'Field Tilt:\s([\d\.]+)%\s-\s([\d\.]+)%',
        'Def Action Height': r'Def Action Height:\s([\d\.]+)\s-\s([\d\.]+)',
        '10+ Pass Sequences': r'10\+\s+pass sequences:\s(\d+)\s-\s(\d+)',
        'Goal Kick Length': r'Goalkick length:\s([\d\.]+)\s-\s([\d\.]+)',
        'Goal Kick Progression': r'Goalkick progr\.:\s([\d\.]+)\s-\s([\d\.]+)'
    }

    for stat, pattern in stats_patterns.items():
        match = re.search(pattern, content)
        if not match:
            continue
        try:
            # [\d\.]+ also swallows a trailing sentence period ("0.8."), which
            # float() rejects; stripping it leaves every valid value unchanged.
            home_value = float(match.group(1).rstrip('.'))
            away_value = float(match.group(2).rstrip('.'))
        except ValueError:
            # Captured text is not a number (e.g. a lone "."): skip this stat
            # rather than abort the extraction for the whole entry.
            continue
        game_data[f'Home {stat}'] = home_value
        game_data[f'Away {stat}'] = away_value

    return game_data

# Apply the extraction function to each row
games_data = [extract_game_data(content) for content in df['content']]

# Convert the list of dictionaries to a DataFrame and drop rows for which
# nothing at all was extracted (chained instead of mutated in place, so the
# cell gives the same result when re-run).
games_df = pd.DataFrame(games_data).dropna(how='all')

# Columns that identify one game; rows sharing these values are merged below
key_columns = ['Home Team', 'Home Score', 'Away Score', 'Away Team']

# Stat columns that must exist before combining
required_columns = [
    'Home xG', 'Away xG', 'Home xThreat', 'Away xThreat',
    'Home Pass directness', 'Away Pass directness', 'Home Penbox shots',
    'Away Penbox shots', 'Home Possession', 'Away Possession',
    'Home Field Tilt', 'Away Field Tilt', 'Home Def Action Height',
    'Away Def Action Height', 'Home 10+ Pass Sequences', 'Away 10+ Pass Sequences',
    'Home Goal Kick Length', 'Away Goal Kick Length', 'Home Goal Kick Progression',
    'Away Goal Kick Progression'
]

# Ensure every key and stat column is present. reindex fills the new columns
# with NaN, whereas assigning None would create object-dtype columns that then
# go through a numeric 'max'. Including the key columns prevents a KeyError in
# groupby when no entry contained a score line.
missing_columns = [
    col for col in key_columns + required_columns
    if col not in games_df.columns
]
games_df = games_df.reindex(columns=[*games_df.columns, *missing_columns])

# Combine rows corresponding to the same game:
# group by the home and away teams and scores, then take the max of each stat
# (max skips NaN, so a value present in any row of the group is kept).
# Rows with a missing key value are excluded by groupby's default dropna=True.
combined_games_df = games_df.groupby(key_columns, as_index=False).agg(
    {col: 'max' for col in required_columns}
)

# Save the new DataFrame to a CSV file
output_path = 'reformatted_games_data.csv'
combined_games_df.to_csv(output_path, index=False)

print(f"Reformatted data saved to {output_path}")

Reformatted data saved to reformatted_games_data.csv


In [24]:
# Keep only the rows of combined_games_df that have no missing value in any column
df_complete = combined_games_df.dropna(axis=0, how='any')