# ScoreSight - Part 2: Data Cleaning

**Author:** Prathamesh Fuke  
**Branch:** Prathamesh_Fuke  
**Date:** October 28, 2025

## Objective
Clean the datasets by:
- Handling missing values
- Removing duplicates
- Fixing data type issues *(not performed by the cells in this notebook)*
- Handling outliers *(not performed by the cells in this notebook)*
- Standardizing column names
- Correcting inconsistencies

## 1. Import Libraries and Load Data

In [None]:
# pandas/numpy for the data handling below; warnings to control warning output.
import pandas as pd
import numpy as np
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas FutureWarning/deprecation notices — consider narrowing it.
warnings.filterwarnings('ignore')

# Lift the column display limit so wide DataFrames are shown without truncation.
pd.set_option('display.max_columns', None)
print("✓ Libraries imported")

In [None]:
# Read the three raw CSV files written by the previous notebook, then report
# the (rows, columns) shape of each resulting DataFrame.
print("Loading raw datasets...")

match_csv = '../data/raw/data_raw_match.csv'
player_csv = '../data/raw/data_raw_player.csv'
league_csv = '../data/raw/data_raw_league.csv'

match_data = pd.read_csv(match_csv)
player_data = pd.read_csv(player_csv)
league_data = pd.read_csv(league_csv)

print(
    f"✓ Match data: {match_data.shape}",
    f"✓ Player data: {player_data.shape}",
    f"✓ League data: {league_data.shape}",
    sep="\n",
)

## 2. Clean Match Winner Dataset

In [None]:
# Print the section banner, then take a working copy so that the raw
# `match_data` frame is left unmodified by the cleaning steps.
section_rule = "=" * 80
print(section_rule, "CLEANING MATCH WINNER DATASET", section_rule, sep="\n")

match_clean = match_data.copy()
print(f"\nOriginal shape: {match_clean.shape}")

In [None]:
# Remove duplicates
# Flag rows that repeat an earlier row in every column, report how many there
# are, and keep only the unflagged rows when at least one duplicate exists.
dup_mask = match_clean.duplicated()
duplicates = dup_mask.sum()
print(f"\nDuplicates found: {duplicates}")
if duplicates > 0:
    match_clean = match_clean[~dup_mask]
    print(f"✓ Removed {duplicates} duplicates")
    print(f"New shape: {match_clean.shape}")

In [None]:
# Handle missing values
# Numeric gaps are imputed with the column median; text (object) gaps with the
# most frequent value, falling back to 'Unknown' when the column has no mode.
print("\nHandling missing values...")
missing_before = match_clean.isnull().sum().sum()
print(f"Total missing values: {missing_before}")

if missing_before > 0:
    # Identify numeric and categorical columns
    numeric_cols = match_clean.select_dtypes(include=[np.number]).columns
    categorical_cols = match_clean.select_dtypes(include=['object']).columns

    # Fill numeric columns with median.
    # The result is assigned back to the frame rather than using
    # `df[col].fillna(..., inplace=True)`: that chained in-place call operates
    # on a temporary Series, is deprecated in pandas 2.x and does not update
    # the DataFrame at all once Copy-on-Write is active.
    for col in numeric_cols:
        if match_clean[col].isnull().any():
            median_val = match_clean[col].median()
            match_clean[col] = match_clean[col].fillna(median_val)
            print(f"  ✓ Filled '{col}' with median: {median_val}")

    # Fill categorical columns with mode or 'Unknown'
    for col in categorical_cols:
        if match_clean[col].isnull().any():
            # mode() ignores nulls, so it is empty only for an all-null column.
            modes = match_clean[col].mode()
            if not modes.empty:
                mode_val = modes.iloc[0]
                match_clean[col] = match_clean[col].fillna(mode_val)
                print(f"  ✓ Filled '{col}' with mode: {mode_val}")
            else:
                match_clean[col] = match_clean[col].fillna('Unknown')
                print(f"  ✓ Filled '{col}' with 'Unknown'")

    missing_after = match_clean.isnull().sum().sum()
    print(f"\n✓ Missing values reduced from {missing_before} to {missing_after}")

In [None]:
# Standardize column names: lowercase everything, then turn both spaces and
# hyphens into underscores.
print("\nStandardizing column names...")
lowered_names = match_clean.columns.str.lower()
underscored_names = lowered_names.str.replace(' ', '_')
match_clean.columns = underscored_names.str.replace('-', '_')
print("✓ Column names standardized")
print(f"Columns: {list(match_clean.columns)}")

In [None]:
# Report the (rows, columns) shape of the match data after the cleaning steps.
print(f"\n✓ Match data cleaned! Final shape: {match_clean.shape}")

## 3. Clean Player Data (Goals & Assists)

In [None]:
# Print the section banner, then take a working copy so that the raw
# `player_data` frame is left unmodified by the cleaning steps.
section_rule = "=" * 80
print(section_rule, "CLEANING PLAYER DATA", section_rule, sep="\n")

player_clean = player_data.copy()
print(f"\nOriginal shape: {player_clean.shape}")

In [None]:
# Remove duplicates
# Flag rows that repeat an earlier row in every column, report how many there
# are, and keep only the unflagged rows when at least one duplicate exists.
dup_mask = player_clean.duplicated()
duplicates = dup_mask.sum()
print(f"\nDuplicates found: {duplicates}")
if duplicates > 0:
    player_clean = player_clean[~dup_mask]
    print(f"✓ Removed {duplicates} duplicates")
    print(f"New shape: {player_clean.shape}")

In [None]:
# Handle missing values
# Numeric gaps are set to 0; text (object) gaps get the most frequent value,
# falling back to 'Unknown' when the column has no mode.
print("\nHandling missing values...")
missing_before = player_clean.isnull().sum().sum()
print(f"Total missing values: {missing_before}")

if missing_before > 0:
    # For player stats, missing goals/assists likely means 0
    # NOTE(review): the loop below zero-fills EVERY numeric column, not only
    # goals/assists — confirm that is appropriate for all numeric fields.
    numeric_cols = player_clean.select_dtypes(include=[np.number]).columns
    categorical_cols = player_clean.select_dtypes(include=['object']).columns

    # Fill numeric columns (goals, assists) with 0.
    # The result is assigned back to the frame rather than using
    # `df[col].fillna(..., inplace=True)`: that chained in-place call operates
    # on a temporary Series, is deprecated in pandas 2.x and does not update
    # the DataFrame at all once Copy-on-Write is active.
    for col in numeric_cols:
        if player_clean[col].isnull().any():
            player_clean[col] = player_clean[col].fillna(0)
            print(f"  ✓ Filled '{col}' with 0")

    # Fill categorical columns
    for col in categorical_cols:
        if player_clean[col].isnull().any():
            # mode() ignores nulls, so it is empty only for an all-null column.
            modes = player_clean[col].mode()
            if not modes.empty:
                mode_val = modes.iloc[0]
                player_clean[col] = player_clean[col].fillna(mode_val)
                print(f"  ✓ Filled '{col}' with mode: {mode_val}")
            else:
                player_clean[col] = player_clean[col].fillna('Unknown')
                print(f"  ✓ Filled '{col}' with 'Unknown'")

    missing_after = player_clean.isnull().sum().sum()
    print(f"\n✓ Missing values reduced from {missing_before} to {missing_after}")

In [None]:
# Standardize column names: lowercase everything, then turn both spaces and
# hyphens into underscores.
print("\nStandardizing column names...")
lowered_names = player_clean.columns.str.lower()
underscored_names = lowered_names.str.replace(' ', '_')
player_clean.columns = underscored_names.str.replace('-', '_')
print("✓ Column names standardized")
print(f"Columns: {list(player_clean.columns)}")

In [None]:
# Report the (rows, columns) shape of the player data after the cleaning steps.
print(f"\n✓ Player data cleaned! Final shape: {player_clean.shape}")

## 4. Clean League Data

In [None]:
# Print the section banner, then take a working copy so that the raw
# `league_data` frame is left unmodified by the cleaning steps.
section_rule = "=" * 80
print(section_rule, "CLEANING LEAGUE DATA", section_rule, sep="\n")

league_clean = league_data.copy()
print(f"\nOriginal shape: {league_clean.shape}")

In [None]:
# Remove duplicates
# Flag rows that repeat an earlier row in every column, report how many there
# are, and keep only the unflagged rows when at least one duplicate exists.
dup_mask = league_clean.duplicated()
duplicates = dup_mask.sum()
print(f"\nDuplicates found: {duplicates}")
if duplicates > 0:
    league_clean = league_clean[~dup_mask]
    print(f"✓ Removed {duplicates} duplicates")
    print(f"New shape: {league_clean.shape}")

In [None]:
# Handle missing values
# Numeric gaps are imputed with the column median; text (object) gaps with the
# most frequent value, falling back to 'Unknown' when the column has no mode.
print("\nHandling missing values...")
missing_before = league_clean.isnull().sum().sum()
print(f"Total missing values: {missing_before}")

if missing_before > 0:
    numeric_cols = league_clean.select_dtypes(include=[np.number]).columns
    categorical_cols = league_clean.select_dtypes(include=['object']).columns

    # Fill numeric columns with median.
    # The result is assigned back to the frame rather than using
    # `df[col].fillna(..., inplace=True)`: that chained in-place call operates
    # on a temporary Series, is deprecated in pandas 2.x and does not update
    # the DataFrame at all once Copy-on-Write is active.
    for col in numeric_cols:
        if league_clean[col].isnull().any():
            median_val = league_clean[col].median()
            league_clean[col] = league_clean[col].fillna(median_val)
            print(f"  ✓ Filled '{col}' with median: {median_val}")

    # Fill categorical columns
    for col in categorical_cols:
        if league_clean[col].isnull().any():
            # mode() ignores nulls, so it is empty only for an all-null column.
            modes = league_clean[col].mode()
            if not modes.empty:
                mode_val = modes.iloc[0]
                league_clean[col] = league_clean[col].fillna(mode_val)
                print(f"  ✓ Filled '{col}' with mode: {mode_val}")
            else:
                league_clean[col] = league_clean[col].fillna('Unknown')
                print(f"  ✓ Filled '{col}' with 'Unknown'")

    missing_after = league_clean.isnull().sum().sum()
    print(f"\n✓ Missing values reduced from {missing_before} to {missing_after}")

In [None]:
# Standardize column names: lowercase everything, then turn both spaces and
# hyphens into underscores.
print("\nStandardizing column names...")
lowered_names = league_clean.columns.str.lower()
underscored_names = lowered_names.str.replace(' ', '_')
league_clean.columns = underscored_names.str.replace('-', '_')
print("✓ Column names standardized")
print(f"Columns: {list(league_clean.columns)}")

In [None]:
# Report the (rows, columns) shape of the league data after the cleaning steps.
print(f"\n✓ League data cleaned! Final shape: {league_clean.shape}")

## 5. Data Quality Verification

In [None]:
# Before/after summary: for each dataset, compare the raw frame with its
# cleaned counterpart on row count, total null cells and duplicate rows.
section_rule = "=" * 80
print(section_rule, "DATA QUALITY VERIFICATION", section_rule, sep="\n")

# Display label -> (raw frame, cleaned frame)
datasets = {
    'Match Data': (match_data, match_clean),
    'Player Data': (player_data, player_clean),
    'League Data': (league_data, league_clean),
}

for name, frames in datasets.items():
    original, cleaned = frames
    print(
        f"\n{name}:",
        f"  Original rows: {len(original):,} → Cleaned rows: {len(cleaned):,}",
        f"  Missing values: {original.isnull().sum().sum():,} → {cleaned.isnull().sum().sum():,}",
        f"  Duplicates: {original.duplicated().sum():,} → {cleaned.duplicated().sum():,}",
        sep="\n",
    )

## 6. Save Cleaned Data

In [None]:
# Write the three cleaned frames to CSV (index column omitted) and print the
# closing banner for this notebook.
import os

print("\nSaving cleaned datasets...")
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first; exist_ok makes this a no-op on re-runs.
os.makedirs('../data/cleaned', exist_ok=True)
match_clean.to_csv('../data/cleaned/data_cleaned_match.csv', index=False)
player_clean.to_csv('../data/cleaned/data_cleaned_player.csv', index=False)
league_clean.to_csv('../data/cleaned/data_cleaned_league.csv', index=False)
print("\n✓ All cleaned datasets saved!")
print("\n" + "="*80)
print("NOTEBOOK 02 COMPLETED - Ready for Feature Engineering")
print("="*80)