# ScoreSight - Part 5: Data Visualization

**Author:** Prathamesh Fuke  
**Branch:** Prathamesh_Fuke  
**Date:** October 28, 2025

## Objective
Visualize key patterns and insights:
- Match outcome distributions
- Team performance trends
- Player scoring patterns
- Feature correlations
- Historical trends

## 1. Import Libraries and Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Global plotting defaults: dark-grid theme, HUSL colour palette, and the
# default figure size / font size applied in a single rcParams update.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None
print("✓ Libraries imported")

In [None]:
# Read the three encoded CSV files (paths are relative to the working
# directory) and report the (rows, columns) shape of each resulting frame.
print("Loading datasets...")
match_data = pd.read_csv('data_encoded_match.csv')
player_data = pd.read_csv('data_encoded_player.csv')
league_data = pd.read_csv('data_encoded_league.csv')
print(
    f"✓ Match data: {match_data.shape}",
    f"✓ Player data: {player_data.shape}",
    f"✓ League data: {league_data.shape}",
    sep="\n",
)

## 2. Match Data Visualizations

In [None]:
# Section banner: an 80-character rule above and below the section title.
print("="*80, "MATCH DATA VISUALIZATIONS", "="*80, sep="\n")

# Show which columns the match dataset offers for plotting
print(f"\nAvailable columns: {list(match_data.columns)}")

In [None]:
# Correlation heatmap for match data.
# Only numeric columns are correlated, and the plot is skipped unless at
# least two of them exist.
numeric_cols = match_data.select_dtypes(include=[np.number]).columns
if len(numeric_cols) > 1:
    correlation_matrix = match_data[numeric_cols].corr()

    heatmap_fig, heatmap_ax = plt.subplots(figsize=(14, 10))
    sns.heatmap(
        correlation_matrix,
        ax=heatmap_ax,
        annot=False,
        cmap='coolwarm',
        center=0,
        linewidths=0.5,
        cbar_kws={'label': 'Correlation'},
    )
    heatmap_ax.set_title('Match Data - Feature Correlation Heatmap',
                         fontsize=14, fontweight='bold')
    heatmap_fig.tight_layout()

    # Write a 300 dpi PNG to the working directory, then display the figure
    heatmap_fig.savefig('viz_match_correlation.png', dpi=300, bbox_inches='tight')
    plt.show()
    print("✓ Correlation heatmap saved")

In [None]:
# Distribution plots for key numeric features.
# Draws one histogram per column for the first six numeric match columns on a
# 2x3 grid. Relies on `numeric_cols` computed in the correlation-heatmap cell.
# Slicing never runs past the end, so no explicit length check is required.
numeric_cols_sample = numeric_cols[:6]
if len(numeric_cols_sample) > 0:
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    axes = axes.flatten()

    # zip stops at the shorter sequence, pairing each column with one panel
    for ax, col in zip(axes, numeric_cols_sample):
        # NaNs are dropped so they do not interfere with the binning
        ax.hist(match_data[col].dropna(), bins=30, edgecolor='black', alpha=0.7)
        ax.set_title(f'Distribution: {col}', fontweight='bold')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')

    # With fewer than six numeric columns the remaining panels would be drawn
    # as empty frames; hide them so the saved figure shows only real plots.
    for unused_ax in axes[len(numeric_cols_sample):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.savefig('viz_match_distributions.png', dpi=300, bbox_inches='tight')
    plt.show()
    print("✓ Distribution plots saved")

## 3. Player Data Visualizations

In [None]:
# Section banner: an 80-character rule above and below the section title.
print("="*80, "PLAYER DATA VISUALIZATIONS", "="*80, sep="\n")

# Show which columns the player dataset offers for plotting
print(f"\nAvailable columns: {list(player_data.columns)}")

In [None]:
# Correlation heatmap for player data.
# Only numeric columns are correlated, and the plot is skipped unless at
# least two of them exist.
numeric_cols_player = player_data.select_dtypes(include=[np.number]).columns
if len(numeric_cols_player) > 1:
    plt.figure(figsize=(12, 8))
    correlation_matrix = player_data[numeric_cols_player].corr()
    # Correlations are signed values centred on zero, so a diverging colormap
    # is used (the same one as the match heatmap): positive and negative
    # relationships get visually distinct colours, which a sequential map
    # such as 'viridis' does not provide.
    sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', center=0,
                linewidths=0.5, cbar_kws={'label': 'Correlation'})
    plt.title('Player Data - Feature Correlation Heatmap', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig('viz_player_correlation.png', dpi=300, bbox_inches='tight')
    plt.show()
    print("✓ Player correlation heatmap saved")

In [None]:
# Top performers visualization (drawn only if a numeric goal column exists).
# Candidate columns are taken from select_dtypes(np.number) rather than an
# exact int64/float64 match, so goal columns stored with another numeric
# width (int32, float32, uint8, ...) are no longer skipped. str() guards
# against non-string column labels.
numeric_player_cols = player_data.select_dtypes(include=[np.number]).columns
goal_cols = [col for col in numeric_player_cols if 'goal' in str(col).lower()]
if goal_cols:
    # The first matching column (in dataset order) is used as the goal metric
    goal_col = goal_cols[0]
    top_scorers = player_data.nlargest(10, goal_col)
    bar_positions = range(len(top_scorers))

    plt.figure(figsize=(12, 6))
    plt.barh(bar_positions, top_scorers[goal_col], color='skyblue', edgecolor='navy')
    # Bars are labelled by rank ("Player 1" is the highest value), not by name
    plt.yticks(bar_positions, [f"Player {rank}" for rank in range(1, len(top_scorers) + 1)])
    plt.xlabel('Goals', fontweight='bold')
    plt.title('Top 10 Scorers', fontsize=14, fontweight='bold')
    # Flip the axis so rank 1 appears at the top of the chart
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.savefig('viz_top_scorers.png', dpi=300, bbox_inches='tight')
    plt.show()
    print("✓ Top scorers visualization saved")

## 4. League Data Visualizations

In [None]:
# Section banner: an 80-character rule above and below the section title.
print("="*80, "LEAGUE DATA VISUALIZATIONS", "="*80, sep="\n")

# Show which columns the league dataset offers for plotting
print(f"\nAvailable columns: {list(league_data.columns)}")

In [None]:
# League statistics visualization: box plots on the left, a table of
# descriptive statistics on the right. Skipped when the league dataset has
# no numeric columns (a single check covers both panels).
numeric_cols_league = league_data.select_dtypes(include=[np.number]).columns
if len(numeric_cols_league) > 0:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Box plots for at most the first five numeric features; slicing never
    # runs past the end, so no explicit length check is required.
    sample_cols = numeric_cols_league[:5]
    league_data[sample_cols].boxplot(ax=axes[0])
    axes[0].set_title('League Statistics - Box Plots', fontweight='bold')
    axes[0].set_ylabel('Values')
    axes[0].tick_params(axis='x', rotation=45)

    # Descriptive statistics for every numeric column, one row per feature
    stats = league_data[numeric_cols_league].describe().T
    axes[1].axis('tight')
    axes[1].axis('off')
    # Cell values are rounded to two decimals so full-precision floats do
    # not overflow the table cells at the small font size used below.
    table = axes[1].table(cellText=stats.round(2).values, colLabels=stats.columns,
                          rowLabels=stats.index, cellLoc='center', loc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(8)
    table.scale(1, 2)
    axes[1].set_title('Summary Statistics', fontweight='bold', pad=20)

    plt.tight_layout()
    plt.savefig('viz_league_statistics.png', dpi=300, bbox_inches='tight')
    plt.show()
    print("✓ League statistics visualization saved")

## 5. Summary Visualization

In [None]:
# Create summary dashboard.
# Layout (3x3 grid): record counts across the top row; feature share,
# completeness and memory usage in the middle row; a text recap at the bottom.


def _completeness_pct(frame):
    """Return the percentage of non-null cells in ``frame``.

    A frame with no cells has nothing missing, so it is reported as 100.0
    instead of dividing by zero.
    """
    if frame.size == 0:
        return 100.0
    return (1 - frame.isnull().sum().sum() / frame.size) * 100


fig = plt.figure(figsize=(16, 10))
gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

# The three datasets are always handled in this order, with matching colours
datasets = ['Match Data', 'Player Data', 'League Data']
frames = [match_data, player_data, league_data]
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']

# Dataset sizes
ax1 = fig.add_subplot(gs[0, :])
sizes = [frame.shape[0] for frame in frames]
ax1.bar(datasets, sizes, color=colors, edgecolor='black', linewidth=1.5)
ax1.set_ylabel('Number of Records', fontweight='bold')
ax1.set_title('ScoreSight - Dataset Overview', fontsize=16, fontweight='bold')
for i, v in enumerate(sizes):
    # Value label sits 2% of the tallest bar above each bar
    ax1.text(i, v + max(sizes)*0.02, f'{v:,}', ha='center', fontweight='bold')

# Feature counts
ax2 = fig.add_subplot(gs[1, 0])
feature_counts = [frame.shape[1] for frame in frames]
ax2.pie(feature_counts, labels=datasets, autopct='%1.1f%%', colors=colors, startangle=90)
ax2.set_title('Feature Distribution', fontweight='bold')

# Data completeness
ax3 = fig.add_subplot(gs[1, 1])
completeness = [_completeness_pct(frame) for frame in frames]
ax3.barh(datasets, completeness, color=colors, edgecolor='black')
ax3.set_xlabel('Completeness (%)', fontweight='bold')
ax3.set_title('Data Completeness', fontweight='bold')
# Labels are drawn just past the bar end; leave room beyond 100 so the label
# of a fully complete dataset stays inside the axes, while the ticks still
# stop at 100%.
ax3.set_xlim(0, 115)
ax3.set_xticks(range(0, 101, 20))
for i, v in enumerate(completeness):
    ax3.text(v + 1, i, f'{v:.1f}%', va='center', fontweight='bold')

# Memory usage
ax4 = fig.add_subplot(gs[1, 2])
# deep=True also counts the contents of object columns; bytes -> MiB
memory_usage = [frame.memory_usage(deep=True).sum() / 1024**2 for frame in frames]
ax4.bar(datasets, memory_usage, color=colors, edgecolor='black')
ax4.set_ylabel('Memory (MB)', fontweight='bold')
ax4.set_title('Memory Usage', fontweight='bold')
for i, v in enumerate(memory_usage):
    ax4.text(i, v + max(memory_usage)*0.02, f'{v:.1f}', ha='center', fontweight='bold')

# Summary text
# NOTE: "Overall Data Completeness" is the unweighted mean of the three
# per-dataset percentages, not a cell-weighted figure.
ax5 = fig.add_subplot(gs[2, :])
ax5.axis('off')
summary_text = f"""
DATA PREPROCESSING SUMMARY
{'='*80}

Total Records Processed: {sum(sizes):,}
Total Features Created: {sum(feature_counts)}
Overall Data Completeness: {np.mean(completeness):.2f}%

Ready for Model Training:
  ✓ Match Outcome Prediction
  ✓ Top Scorer Prediction  
  ✓ Points Tally Prediction
"""
ax5.text(0.5, 0.5, summary_text, ha='center', va='center', fontsize=11,
         family='monospace', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.3))

plt.savefig('viz_summary_dashboard.png', dpi=300, bbox_inches='tight')
plt.show()
print("\n✓ Summary dashboard saved")

## 6. Export Visualization Summary

In [None]:
# Final report of the image files this notebook is expected to produce.
# Several plots above are written only when their data condition holds (for
# example, the top-scorers chart needs a numeric goal column), so each file is
# checked on disk instead of being reported as saved unconditionally.
# NOTE: a file left over from an earlier run also counts as present.
from pathlib import Path  # local import: only this cell needs it

expected_outputs = [
    ("viz_match_correlation.png", "Match feature correlations"),
    ("viz_match_distributions.png", "Match feature distributions"),
    ("viz_player_correlation.png", "Player feature correlations"),
    ("viz_top_scorers.png", "Top 10 scorers"),
    ("viz_league_statistics.png", "League statistics overview"),
    ("viz_summary_dashboard.png", "Complete data summary"),
]

print("\n" + "="*80)
print("VISUALIZATION SUMMARY")
print("="*80)
print("\nGenerated visualizations:")
missing_outputs = []
for number, (filename, description) in enumerate(expected_outputs, start=1):
    if Path(filename).is_file():
        print(f"  {number}. {filename} - {description}")
    else:
        missing_outputs.append(filename)
        print(f"  {number}. {filename} - {description} [not generated]")

if missing_outputs:
    print(f"\n⚠ {len(missing_outputs)} visualization(s) not generated: "
          f"{', '.join(missing_outputs)}")
else:
    print("\n✓ All visualizations saved successfully!")
print("\n" + "="*80)
print("NOTEBOOK 05 COMPLETED - Data Visualization Finished")
print("="*80)