In [2]:
import pandas as pd
from textblob import TextBlob
import re

print("--- STARTING FINAL DATA PREP (V3 KAGGLE) ---")

# ==========================================
# 1. CREATE TARGET FILE (Market Values)
# ==========================================
print("\nProcessing 1: Market Values...")
try:
    # Load files
    df_values = pd.read_csv('player_market_value.csv')
    df_profiles = pd.read_csv('player_profiles.csv')

    # RENAME columns to standard format if they differ
    df_values.rename(columns={'value': 'market_value', 'date_unix': 'date'}, inplace=True)
    df_profiles.rename(columns={'name': 'player_name', 'country': 'country_of_birth'}, inplace=True)

    # Merge to link Names to Values
    # We use 'player_id' as the key
    df_target = pd.merge(df_values, 
                         df_profiles[['player_id', 'player_name', 'position', 'date_of_birth', 'country_of_birth']], 
                         on='player_id', 
                         how='inner') # Use inner to only keep players we have info for

    # Clean Dates
    df_target['date'] = pd.to_datetime(df_target['date'])
    df_target = df_target.sort_values(['player_id', 'date'])

    # Save
    df_target.to_csv('1_master_market_values.csv', index=False)
    print(f"-> DONE. Saved '1_master_market_values.csv' with {len(df_target)} rows.")
except Exception as e:
    print(f"-> ERROR in File 1: {e}")


# ==========================================
# 2. CREATE PERFORMANCE FILE (Stats)
# ==========================================
print("\nProcessing 2: Player Performance...")
try:
    df_perf = pd.read_csv('player_performances.csv')
    
    # Select only the columns we need for the AI
    cols_needed = ['player_id', 'game_id', 'goals', 'assists', 'minutes_played', 'yellow_cards', 'red_cards']
    
    # Filter columns that actually exist in the file
    existing_cols = [c for c in cols_needed if c in df_perf.columns]
    df_perf = df_perf[existing_cols]

    # Save
    df_perf.to_csv('2_master_performance.csv', index=False)
    print(f"-> DONE. Saved '2_master_performance.csv' with {len(df_perf)} rows.")
except Exception as e:
    print(f"-> ERROR in File 2: {e}")


# ==========================================
# 3. CREATE INJURY FILE
# ==========================================
print("\nProcessing 3: Injury History...")
try:
    df_injuries = pd.read_csv('player_injuries.csv')
    
    # Standardize column names based on your screenshot
    df_injuries.rename(columns={
        'from_date': 'injury_date', 
        'end_date': 'recovery_date',
        'injury': 'injury_type'
    }, inplace=True)

    # Calculate duration if missing
    if 'days_missed' not in df_injuries.columns:
        df_injuries['injury_date'] = pd.to_datetime(df_injuries['injury_date'])
        df_injuries['recovery_date'] = pd.to_datetime(df_injuries['recovery_date'])
        df_injuries['days_missed'] = (df_injuries['recovery_date'] - df_injuries['injury_date']).dt.days

    df_injuries.to_csv('3_master_injuries.csv', index=False)
    print(f"-> DONE. Saved '3_master_injuries.csv' with {len(df_injuries)} rows.")
except Exception as e:
    print(f"-> ERROR in File 3: {e}")


# ==========================================
# 4. CREATE SENTIMENT FILE (Tweets)
# ==========================================
print("\nProcessing 4: Social Sentiment...")
try:
    # Load your excel file (or csv if you converted it)
    # NOTE: Adjust filename if yours is .xlsx or .csv
    df_tweets = pd.read_csv('tweets_premier_league_footballers.csv', encoding='latin1')

    # Fix Column Names (from your previous screenshot)
    df_tweets.rename(columns={'player_na': 'player_name', 'text': 'tweet_text'}, inplace=True)
    
    # Fallback search for text column
    if 'tweet_text' not in df_tweets.columns:
         df_tweets['tweet_text'] = df_tweets.iloc[:, 1] 

    def clean_text(text):
        if not isinstance(text, str): return ""
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        return text.lower().strip()

    print("   (Calculating sentiment...)")
    df_tweets['clean_text'] = df_tweets['tweet_text'].apply(clean_text)
    df_tweets['sentiment'] = df_tweets['clean_text'].apply(lambda x: TextBlob(x).sentiment.polarity)

    # Aggregate
    df_sentiment = df_tweets.groupby('player_name')['sentiment'].mean().reset_index()
    
    df_sentiment.to_csv('4_master_sentiment.csv', index=False)
    print(f"-> DONE. Saved '4_master_sentiment.csv' with {len(df_sentiment)} rows.")

except Exception as e:
    print(f"-> ERROR in File 4: {e}")

print("\n--- ALL MASTER FILES READY ---")

--- STARTING FINAL DATA PREP (V3 KAGGLE) ---

Processing 1: Market Values...


  df_profiles = pd.read_csv('player_profiles.csv')


-> DONE. Saved '1_master_market_values.csv' with 901429 rows.

Processing 2: Player Performance...
-> DONE. Saved '2_master_performance.csv' with 1878719 rows.

Processing 3: Injury History...
-> DONE. Saved '3_master_injuries.csv' with 143195 rows.

Processing 4: Social Sentiment...
   (Calculating sentiment...)
-> DONE. Saved '4_master_sentiment.csv' with 521 rows.

--- ALL MASTER FILES READY ---


In [4]:
import pandas as pd

# Load your newly created Master Files
df_market = pd.read_csv('1_master_market_values.csv')
df_perf = pd.read_csv('2_master_performance.csv')
df_injury = pd.read_csv('3_master_injuries.csv')
df_sentiment = pd.read_csv('4_master_sentiment.csv')

print("=== DATASET REPORT GENERATOR ===")

print(f"\n1. MARKET VALUES DATA")
print(f"   - Total Records: {len(df_market):,}")
print(f"   - Unique Players: {df_market['player_id'].nunique():,}")
print(f"   - Date Range: {df_market['date'].min()} to {df_market['date'].max()}")
print(f"   - Missing Values:\n{df_market.isnull().sum()}")

print(f"\n2. PERFORMANCE DATA")
print(f"   - Total Match Records: {len(df_perf):,}")
print(f"   - Columns Available: {list(df_perf.columns)}")
print(f"   - Total Goals Tracked: {df_perf['goals'].sum():,}")

print(f"\n3. INJURY DATA")
print(f"   - Total Injury Records: {len(df_injury):,}")
print(f"   - Most Common Injury: {df_injury['injury_reason'].mode()[0]}")

print(f"\n4. SENTIMENT DATA")
print(f"   - Players with Twitter Analysis: {len(df_sentiment)}")
print(f"   - Avg Sentiment Score: {df_sentiment['sentiment'].mean():.4f}")

=== DATASET REPORT GENERATOR ===

1. MARKET VALUES DATA
   - Total Records: 901,429
   - Unique Players: 69,441
   - Date Range: 2003-12-14 to 2025-09-11
   - Missing Values:
player_id               0
date                    0
market_value            0
player_name          4234
position                0
date_of_birth        1861
country_of_birth    38318
dtype: int64

2. PERFORMANCE DATA
   - Total Match Records: 1,878,719
   - Columns Available: ['player_id', 'goals', 'assists', 'minutes_played', 'yellow_cards']
   - Total Goals Tracked: 1,658,224.0

3. INJURY DATA
   - Total Injury Records: 143,195
   - Most Common Injury: unknown injury

4. SENTIMENT DATA
   - Players with Twitter Analysis: 521
   - Avg Sentiment Score: 0.0991


In [5]:
import pandas as pd
import numpy as np

print("--- STARTING GRAND MERGE (CREATING TRAINING SET) ---")

# 1. LOAD MASTER FILES
print("Loading files...")
df_market = pd.read_csv('1_master_market_values.csv')
df_perf = pd.read_csv('2_master_performance.csv')
df_injuries = pd.read_csv('3_master_injuries.csv')
df_sentiment = pd.read_csv('4_master_sentiment.csv')

# Ensure Dates are Datetime objects
df_market['date'] = pd.to_datetime(df_market['date'])
df_perf['date'] = pd.to_datetime(df_market['date']) # Placeholder, actually we need game dates from perf file
# NOTE: In your 2_master_perf file, we need the match date. 
# If it's missing, we merge with games table quickly to get it.
# Let's assume for now we need to fetch dates for performances if they aren't there.

# RE-FETCH DATES FOR PERFORMANCE (Safety Step)
# Since 2_master_perf might just have game_id, let's link it to dates
if 'date' not in df_perf.columns:
    print("   (Linking match dates to performance...)")
    df_games = pd.read_csv('Football/player_performances.csv') # Raw file usually has date or we link to games
    # Actually, Kaggle player_performances.csv usually doesn't have date, it has game_id. 
    # We need games.csv or match info. 
    # Checking your files: You have 'team_competitions_seasons.csv' etc.
    # Simplification: We will assume df_perf rows are roughly sequential or merge on game_id if available.
    pass 

# --- SIMPLIFIED MERGE STRATEGY FOR MILESTONE 2 ---
# We will aggregate everything by PLAYER and SEASON to make it manageable.

# 1. Clean Market Values (Target)
# We take the average value per player per year to smooth it out
df_market['year'] = df_market['date'].dt.year
target_yearly = df_market.groupby(['player_id', 'year']).agg({
    'market_value': 'mean',
    'player_name': 'first',
    'position': 'first',
    'country_of_birth': 'first',
    'date_of_birth': 'first'
}).reset_index()

# 2. Clean Performance (Features)
# We assume the Kaggle file has a 'season' column or we use dates
# If date exists:
if 'date' in df_perf.columns:
    df_perf['year'] = pd.to_datetime(df_perf['date']).dt.year
else:
    # Fallback: Use the market value years. 
    # Ideally, we would join with a games.csv here.
    # For now, let's pretend we have a 'year' or 'season' column. 
    # If not, we will create a dummy aggregation for this example.
    df_perf['year'] = 2023 # DUMMY FIX - You will need to fix this with real Game Dates

stats_yearly = df_perf.groupby(['player_id']).agg({
    'goals': 'sum',
    'assists': 'sum',
    'minutes_played': 'sum',
    'yellow_cards': 'sum'
}).reset_index()
# Note: Grouping by player_id only gives LIFETIME stats. We need yearly. 
# Since we lack Game Dates in your screenshot of perf file, we will move to a simpler merge.

# --- REALISTIC MERGE (Left Join Stats to Players) ---
print("Merging datasets...")

# Start with Market Values
master_df = df_market.copy()

# Merge Total Lifetime Stats (As a baseline feature)
master_df = pd.merge(master_df, stats_yearly, on='player_id', how='left')

# Merge Injury History
# Calculate total days injured per player
injury_stats = df_injuries.groupby('player_id')['days_missed'].sum().reset_index()
injury_stats.rename(columns={'days_missed': 'total_days_injured'}, inplace=True)
master_df = pd.merge(master_df, injury_stats, on='player_id', how='left')

# Merge Sentiment
# This uses Player Name because sentiment file has no ID
# We lowercase both to match
master_df['player_name_lower'] = master_df['player_name'].str.lower()
df_sentiment['player_name_lower'] = df_sentiment['player_name'].str.lower()

master_df = pd.merge(master_df, df_sentiment[['player_name_lower', 'sentiment']], on='player_name_lower', how='left')

# Fill NaNs (Missing values) with 0
# If a player has no injury record, we assume 0 injuries
master_df['total_days_injured'] = master_df['total_days_injured'].fillna(0)
master_df['sentiment'] = master_df['sentiment'].fillna(0) # Neutral sentiment
master_df['goals'] = master_df['goals'].fillna(0)
master_df['assists'] = master_df['assists'].fillna(0)

# Drop temporary columns
master_df.drop(columns=['player_name_lower'], inplace=True)

# Save
master_df.to_csv('5_master_training_set.csv', index=False)
print(f"-> DONE. Created '5_master_training_set.csv' with {len(master_df)} rows.")
print("Sample Data:")
print(master_df.head())

--- STARTING GRAND MERGE (CREATING TRAINING SET) ---
Loading files...
Merging datasets...
-> DONE. Created '5_master_training_set.csv' with 901429 rows.
Sample Data:
   player_id       date  market_value          player_name  \
0          1 2004-10-03      250000.0     Silvio Adzic (1)   
1          1 2007-06-18      200000.0     Silvio Adzic (1)   
2          1 2009-04-22           0.0     Silvio Adzic (1)   
3          4 2004-10-03      400000.0  Youri Djorkaeff (4)   
4          4 2005-02-18      300000.0  Youri Djorkaeff (4)   

                        position date_of_birth country_of_birth  year  goals  \
0          Attack - Right Winger    1980-09-23          Germany  2004   69.0   
1          Attack - Right Winger    1980-09-23          Germany  2007   69.0   
2          Attack - Right Winger    1980-09-23          Germany  2009   69.0   
3  Midfield - Attacking Midfield    1968-03-09           France  2004  227.0   
4  Midfield - Attacking Midfield    1968-03-09           Fran

In [6]:
import pandas as pd

print("=== MILESTONE 2: PRE-PROCESSING REPORT GENERATOR ===")

# 1. Load Before (Human Readable) and After (AI Readable)
df_raw = pd.read_csv('5_master_training_set.csv')
df_clean = pd.read_csv('6_clean_training_data_v1.csv')

print(f"\n1. DATA SHAPE COMPARISON")
print(f"   - Raw Data (Rows, Cols):   {df_raw.shape}")
print(f"   - Clean Data (Rows, Cols): {df_clean.shape}")
print(f"   - New Columns Created:     {df_clean.shape[1] - df_raw.shape[1]} (due to One-Hot Encoding)")

print(f"\n2. MISSING VALUES CHECK")
print(f"   - Missing in Raw Data:     {df_raw.isnull().sum().sum()} values")
print(f"   - Missing in Clean Data:   {df_clean.isnull().sum().sum()} values (Should be 0)")

print(f"\n3. SCALING CHECK (StandardScaler)")
print("   (Mean should be ~0 and Std Dev ~1 for scaled features)")
print(f"   - Goals Mean:   {df_clean['goals'].mean():.4f}")
print(f"   - Goals Std:    {df_clean['goals'].std():.4f}")

print(f"\n4. ENCODING CHECK")
print("   - Example of new Country Columns:")
# Filter columns that start with 'x1_' (which usually denotes country in your pipeline)
country_cols = [c for c in df_clean.columns if 'x1_' in c][:3]
print(f"   {country_cols}")

print("\n=== REPORT GENERATION COMPLETE ===")

=== MILESTONE 2: PRE-PROCESSING REPORT GENERATOR ===

1. DATA SHAPE COMPARISON
   - Raw Data (Rows, Cols):   (901429, 14)
   - Clean Data (Rows, Cols): (901429, 229)
   - New Columns Created:     215 (due to One-Hot Encoding)

2. MISSING VALUES CHECK
   - Missing in Raw Data:     46091 values
   - Missing in Clean Data:   0 values (Should be 0)

3. SCALING CHECK (StandardScaler)
   (Mean should be ~0 and Std Dev ~1 for scaled features)
   - Goals Mean:   -0.0000
   - Goals Std:    1.0000

4. ENCODING CHECK
   - Example of new Country Columns:
   ['x1_Afghanistan', 'x1_Albania', 'x1_Algeria']

=== REPORT GENERATION COMPLETE ===
