In [1]:
import os
import re
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Suppress every warning so library chatter does not clutter the cell output.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the plotting/data libraries; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Set style for better-looking plots
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 10
sns.set_style("whitegrid")

# Load the data (path is resolved relative to the current working directory)
df = pd.read_csv('complete_dataset.csv')

print(f"Dataset loaded with {len(df)} games and {len(df.columns)} columns")

# Create the output directory that every later cell saves figures into.
# exist_ok=True replaces the check-then-create pattern, so a directory that
# appears between the check and the call can no longer raise; the message is
# still printed only when the directory was actually missing.
plots_dir_is_new = not os.path.isdir('plots')
os.makedirs('plots', exist_ok=True)
if plots_dir_is_new:
    print("Created 'plots' directory for saving visualizations")

# Helper function for categorical analysis
def count_items(df, column_name):
    """Tally the individual entries of a comma-separated text column.

    Each non-missing cell of ``df[column_name]`` is converted to a string,
    split on commas, and every non-blank, whitespace-trimmed piece is counted.

    Returns a ``Counter`` mapping piece -> number of occurrences. If the
    column does not exist, a warning is printed and an empty ``Counter`` is
    returned.
    """
    if column_name not in df.columns:
        print(f"Warning: Column '{column_name}' not found")
        return Counter()

    tally = Counter()
    for raw_value in df[column_name].dropna():
        text = str(raw_value).strip()
        if not text:
            # Blank / whitespace-only cells contribute nothing.
            continue
        pieces = (piece.strip() for piece in text.split(','))
        tally.update(piece for piece in pieces if piece)
    return tally


# Check available columns
available_columns = df.columns.tolist()
print(f"Available columns: {available_columns}")

# --- Numerical Features Distribution ---
# Draws one histogram per key numeric column on a 3-wide grid and saves the
# figure to plots/numerical_distributions.png.
print("\n1. Creating numerical feature distributions...")

# Every column with a numeric dtype; the correlation cell reads this list too.
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print(f"Numerical columns found: {numerical_cols}")

if numerical_cols:
    # Filter to most important numerical columns (only those actually present)
    preferred_cols = ['average_rating', 'average_weight', 'users_rated', 'playing_time',
                      'min_players', 'max_players', 'min_age', 'year_published']
    key_numerical = [col for col in preferred_cols if col in numerical_cols]

    if key_numerical:
        n_cols = 3
        n_rows = (len(key_numerical) + n_cols - 1) // n_cols  # ceiling division

        # squeeze=False always returns a 2-D array of Axes, so a single
        # ravel() gives a flat sequence for any grid shape (no special-casing
        # of the one-row layout).
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows), squeeze=False)
        axes = axes.ravel()

        for ax, col in zip(axes, key_numerical):
            pretty_name = col.replace("_", " ").title()
            ax.set_title(f'Distribution of {pretty_name}')

            # Clean data for visualization
            data = df[col].dropna()
            if data.empty:
                # Make an all-missing column visible instead of leaving an
                # unexplained blank panel.
                ax.text(0.5, 0.5, 'No data', ha='center', va='center',
                        transform=ax.transAxes)
                continue

            # Keep only the central 90% (5th to 95th percentile) so extreme
            # outliers do not compress the histogram.
            lower_bound = data.quantile(0.05)
            upper_bound = data.quantile(0.95)
            filtered_data = data[(data >= lower_bound) & (data <= upper_bound)]

            ax.hist(filtered_data, bins=30, alpha=0.7, edgecolor='black')
            ax.set_xlabel(pretty_name)
            ax.set_ylabel('Frequency')
            ax.grid(True, alpha=0.3)

        # Remove the unused trailing cells of the grid
        for unused_ax in axes[len(key_numerical):]:
            fig.delaxes(unused_ax)

        fig.tight_layout()
        fig.savefig('plots/numerical_distributions.png', dpi=300, bbox_inches='tight')
        plt.close(fig)
        print("Saved numerical_distributions.png")


Dataset loaded with 7575 games and 23 columns
Available columns: ['id', 'name', 'year_published', 'min_players', 'max_players', 'playing_time', 'min_play_time', 'max_play_time', 'min_age', 'description', 'categories', 'mechanisms', 'designers', 'artists', 'publishers', 'users_rated', 'average_rating', 'bayes_average', 'num_comments', 'num_weights', 'average_weight', 'bgg_rank', 'category_ranks']

1. Creating numerical feature distributions...
Numerical columns found: ['id', 'year_published', 'min_players', 'max_players', 'playing_time', 'min_play_time', 'max_play_time', 'min_age', 'users_rated', 'average_rating', 'bayes_average', 'num_comments', 'num_weights', 'average_weight', 'bgg_rank']
Saved numerical_distributions.png


In [2]:
# --- Correlation Analysis ---
# Pairwise correlation heatmap of the numeric columns, saved to
# plots/correlation_heatmap.png.

if len(numerical_cols) >= 2:
    # numerical_cols was built with select_dtypes(np.number), so the frame
    # needs no second dtype filter.
    # NOTE(review): the list includes 'id', presumably a record identifier
    # rather than a feature; confirm whether it belongs in the matrix.
    corr_data = df[numerical_cols]

    # Remove columns with too many missing values: keep those that are at
    # least 50% populated. `thresh` is a count of required non-missing values,
    # so pass an integer (rounded up, which matches the previous float
    # comparison).
    min_non_null = int(np.ceil(len(corr_data) * 0.5))
    corr_data = corr_data.dropna(axis=1, thresh=min_non_null)

    if len(corr_data.columns) >= 2:
        correlation_matrix = corr_data.corr()

        # Grow the canvas with the number of features so the annotated cells
        # stay legible; never go below the original 10x8 size.
        n_features = len(correlation_matrix.columns)
        fig_width = max(10, 0.8 * n_features)
        plt.figure(figsize=(fig_width, 0.8 * fig_width))

        # Create mask for upper triangle (the diagonal is hidden as well)
        mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

        # Generate heatmap. vmin/vmax pin the colour scale to the full
        # correlation range so colours mean the same thing on every run.
        sns.heatmap(correlation_matrix,
                    annot=True,
                    cmap='RdBu_r',
                    vmin=-1,
                    vmax=1,
                    center=0,
                    fmt='.2f',
                    linewidths=0.5,
                    mask=mask,
                    square=True)

        plt.title('Correlation Matrix of Numerical Features')
        plt.tight_layout()
        plt.savefig('plots/correlation_heatmap.png', dpi=300, bbox_inches='tight')
        plt.close()
        print("Saved correlation_heatmap.png")

Saved correlation_heatmap.png


In [3]:
# --- Categorical Analysis ---
# For each comma-separated text column, draw a horizontal bar chart of its ten
# most frequent entries and save it as plots/top_10_<column>.png.

categorical_columns = ['categories', 'mechanisms', 'designers', 'artists', 'publishers']

for col in categorical_columns:
    if col not in df.columns:
        continue

    print(f"   Processing {col}...")

    item_counts = count_items(df, col)
    if len(item_counts) == 0:
        continue

    top_10 = item_counts.most_common(10)
    if not top_10:
        continue

    items, counts = zip(*top_10)
    pretty_name = col.replace('_', ' ').title()

    fig, ax = plt.subplots(figsize=(12, 8))

    # Create horizontal bar chart, one viridis shade per bar
    y_positions = range(len(items))
    bar_colors = plt.cm.viridis(np.linspace(0, 1, len(items)))
    ax.barh(y_positions, counts, color=bar_colors)

    ax.set_yticks(y_positions)
    ax.set_yticklabels(items)
    ax.set_xlabel('Number of Games')
    ax.set_ylabel(pretty_name)
    ax.set_title(f'Top 10 {pretty_name}')

    # Add value labels just past the end of each bar
    label_offset = max(counts)*0.01
    for row, count in enumerate(counts):
        ax.text(count + label_offset, row, str(count),
                va='center', fontweight='bold')

    # Invert y-axis to show highest values at top
    ax.invert_yaxis()
    fig.tight_layout()
    fig.savefig(f'plots/top_10_{col}.png', dpi=300, bbox_inches='tight')
    plt.close(fig)
    print(f"Saved top_10_{col}.png")



   Processing categories...
Saved top_10_categories.png
   Processing mechanisms...
Saved top_10_mechanisms.png
   Processing designers...
Saved top_10_designers.png
   Processing artists...
Saved top_10_artists.png
   Processing publishers...
Saved top_10_publishers.png


In [4]:
# --- Rating Analysis ---
# 2x2 figure built around 'average_rating': histogram, rating vs. number of
# raters, mean rating per publication year, and a pie of rating bands.
# Saved to plots/rating_analysis.png.

if 'average_rating' in df.columns:
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # Rating distribution, with the mean marked by a dashed line
    rating_data = df['average_rating'].dropna()
    mean_rating = rating_data.mean()
    axes[0, 0].hist(rating_data, bins=50, alpha=0.7, color='skyblue', edgecolor='black')
    axes[0, 0].set_title('Distribution of Average Ratings')
    axes[0, 0].set_xlabel('Average Rating')
    axes[0, 0].set_ylabel('Frequency')
    axes[0, 0].axvline(mean_rating, color='red', linestyle='--',
                       label=f'Mean: {mean_rating:.2f}')
    axes[0, 0].legend()
    axes[0, 0].grid(True, alpha=0.3)

    # Rating vs Users (if available)
    if 'users_rated' in df.columns:
        # Filter extreme outliers for better visualization: drop the top 5%
        # most-rated games. Rows with a missing users_rated fail the <=
        # comparison and are excluded too.
        users_95th = df['users_rated'].quantile(0.95)
        filtered_df = df[(df['users_rated'] <= users_95th) &
                         (df['average_rating'].notna())]

        axes[0, 1].scatter(filtered_df['users_rated'], filtered_df['average_rating'],
                           alpha=0.6, s=20, color='green')
        axes[0, 1].set_title('Rating vs Number of Users Rated')
        axes[0, 1].set_xlabel('Number of Users Rated')
        axes[0, 1].set_ylabel('Average Rating')
        axes[0, 1].grid(True, alpha=0.3)

    # Rating by year (if available), restricted to 1990-2023
    if 'year_published' in df.columns:
        year_data = df[(df['year_published'] >= 1990) & (df['year_published'] <= 2023)]
        if len(year_data) > 0:
            yearly_ratings = year_data.groupby('year_published')['average_rating'].mean()

            axes[1, 0].plot(yearly_ratings.index, yearly_ratings.values, marker='o', linewidth=2)
            axes[1, 0].set_title('Average Rating by Year Published')
            axes[1, 0].set_xlabel('Year Published')
            axes[1, 0].set_ylabel('Average Rating')
            axes[1, 0].grid(True, alpha=0.3)

    # Rating categories
    # Bins are left-closed ([0, 6), [6, 7), [7, 8), [8, inf)) so a rating of
    # exactly 8.0 lands in '8+' and exactly 6.0 in '6-7', as the labels say.
    rating_bins = [0, 6, 7, 8, np.inf]
    rating_labels = ['Below 6', '6-7', '7-8', '8+']
    rating_colors = dict(zip(rating_labels,
                             ['lightcoral', 'gold', 'lightgreen', 'darkgreen']))
    # Bin the column directly; no full DataFrame copy is needed for this.
    rating_category = pd.cut(df['average_rating'], bins=rating_bins,
                             labels=rating_labels, right=False)
    # value_counts() orders by frequency; reindex back to label order and look
    # colours up by label so each band always gets its intended colour.
    rating_counts = rating_category.value_counts().reindex(rating_labels, fill_value=0)
    # Empty bands would only add overlapping "0.0%" labels to the pie.
    rating_counts = rating_counts[rating_counts > 0]

    if len(rating_counts) > 0:
        axes[1, 1].pie(rating_counts.values,
                       labels=list(rating_counts.index),
                       autopct='%1.1f%%',
                       colors=[rating_colors[label] for label in rating_counts.index],
                       startangle=90)
    else:
        axes[1, 1].text(0.5, 0.5, 'No rating data', ha='center', va='center',
                        transform=axes[1, 1].transAxes)
    axes[1, 1].set_title('Games by Rating Category')

    plt.tight_layout()
    plt.savefig('plots/rating_analysis.png', dpi=300, bbox_inches='tight')
    plt.close()
    print("Saved rating_analysis.png")

Saved rating_analysis.png


In [5]:
# --- Temporal Analysis ---
# Two stacked panels over publication year (1950-2025 inclusive): number of
# games released, and mean rating. Saved to plots/temporal_analysis.png.

if 'year_published' in df.columns:
    fig = plt.figure(figsize=(14, 10))

    # Filter reasonable years (rows with a missing year fail both comparisons)
    in_window = (df['year_published'] >= 1950) & (df['year_published'] <= 2025)
    year_data = df[in_window]

    # Games published over time
    ax_counts = fig.add_subplot(2, 1, 1)
    year_counts = year_data.groupby('year_published').size()

    ax_counts.plot(year_counts.index, year_counts.values,
                   linewidth=2, color='steelblue', marker='o', markersize=3)
    ax_counts.fill_between(year_counts.index, year_counts.values,
                           alpha=0.3, color='steelblue')
    ax_counts.set_title('Board Games Published Over Time')
    ax_counts.set_xlabel('Year Published')
    ax_counts.set_ylabel('Number of Games')
    ax_counts.grid(True, alpha=0.3)

    # Average rating over time (if available)
    if 'average_rating' in df.columns:
        ax_rating = fig.add_subplot(2, 1, 2)
        rating_by_year = year_data.groupby('year_published')['average_rating'].mean()

        # Only plot if we have enough data points (more than ten distinct years)
        if len(rating_by_year) > 10:
            ax_rating.plot(rating_by_year.index, rating_by_year.values,
                           linewidth=2, color='red', marker='o', markersize=3)
            ax_rating.set_title('Average Game Rating Over Time')
            ax_rating.set_xlabel('Year Published')
            ax_rating.set_ylabel('Average Rating')
            ax_rating.grid(True, alpha=0.3)

    fig.tight_layout()
    fig.savefig('plots/temporal_analysis.png', dpi=300, bbox_inches='tight')
    plt.close(fig)
    print("Saved temporal_analysis.png")


Saved temporal_analysis.png


In [6]:
# --- Game Recommendations Dashboard ---
# 2x2 figure: popularity vs. rating, most-rated games, highest-rated games
# above a minimum number of ratings, and "hidden gems" (well rated but not
# widely rated). Saved to plots/recommendations_dashboard.png.

def _shorten_name(name, max_len=30):
    """Return ``name`` as tick-label text, cut to ``max_len`` characters plus '...'.

    A missing name (NaN/None) becomes '(unnamed)' instead of raising on len().
    """
    if pd.isna(name):
        return '(unnamed)'
    text = str(name)
    return text[:max_len] + '...' if len(text) > max_len else text


if 'average_rating' in df.columns and 'users_rated' in df.columns and 'name' in df.columns:
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    min_ratings = 50        # Minimum number of ratings for consideration
    gem_max_ratings = 500   # Upper bound on ratings for a "hidden gem"
    gem_min_rating = 7.5    # Lowest average rating for a "hidden gem"

    # Popularity vs Rating scatter
    # The x-axis is logarithmic, so only strictly positive counts can be
    # drawn; the > 0 test also drops rows with a missing users_rated.
    valid_data = df[(df['average_rating'].notna()) & (df['users_rated'] > 0)]

    axes[0, 0].scatter(valid_data['users_rated'], valid_data['average_rating'], alpha=0.6, s=30)
    axes[0, 0].set_xlabel('Number of Users Rated')
    axes[0, 0].set_ylabel('Average Rating')
    axes[0, 0].set_title('Game Popularity vs Rating')
    axes[0, 0].set_xscale('log')
    axes[0, 0].grid(True, alpha=0.3)

    # Most popular games
    popular_games = df.nlargest(10, 'users_rated')[['name', 'users_rated']]

    # Truncate long names for display
    display_names = [_shorten_name(name) for name in popular_games['name']]

    axes[0, 1].barh(range(len(popular_games)), popular_games['users_rated'], color='orange')
    axes[0, 1].set_yticks(range(len(popular_games)))
    axes[0, 1].set_yticklabels(display_names, fontsize=8)
    axes[0, 1].set_xlabel('Number of Users Rated')
    axes[0, 1].set_title('Most Popular Games')
    axes[0, 1].invert_yaxis()

    # Highest rated games (with minimum ratings threshold)
    highly_rated = df[df['users_rated'] >= min_ratings].nlargest(10, 'average_rating')[['name', 'average_rating']]

    display_names_rated = [_shorten_name(name) for name in highly_rated['name']]

    axes[1, 0].barh(range(len(highly_rated)), highly_rated['average_rating'], color='green')
    axes[1, 0].set_yticks(range(len(highly_rated)))
    axes[1, 0].set_yticklabels(display_names_rated, fontsize=8)
    axes[1, 0].set_xlabel('Average Rating')
    axes[1, 0].set_title(f'Highest Rated Games ({min_ratings}+ ratings)')
    axes[1, 0].invert_yaxis()

    # Hidden gems (high rating, moderate user count). The lower bound reuses
    # min_ratings so both panels share one definition of "enough ratings".
    hidden_gems = df[(df['average_rating'] >= gem_min_rating) &
                     (df['users_rated'] >= min_ratings) &
                     (df['users_rated'] <= gem_max_ratings)]

    if len(hidden_gems) > 0:
        axes[1, 1].scatter(hidden_gems['users_rated'], hidden_gems['average_rating'],
                           color='red', s=50, alpha=0.7)
        axes[1, 1].set_xlabel('Number of Users Rated')
        axes[1, 1].set_ylabel('Average Rating')
        axes[1, 1].set_title(f'Hidden Gems ({len(hidden_gems)} games)')
        axes[1, 1].grid(True, alpha=0.3)
    else:
        axes[1, 1].text(0.5, 0.5, 'No Hidden Gems\nFound', ha='center', va='center',
                        transform=axes[1, 1].transAxes, fontsize=14)
        axes[1, 1].set_title('Hidden Gems Analysis')

    plt.tight_layout()
    plt.savefig('plots/recommendations_dashboard.png', dpi=300, bbox_inches='tight')
    plt.close()
    print("Saved recommendations_dashboard.png")



Saved recommendations_dashboard.png


In [7]:
# --- Summary Statistics ---
# Headline figures for the dataset, printed to the cell output. Each figure
# is reported only if its source column exists.

print(f"Total games: {len(df)}")

if 'year_published' in df.columns:
    # NOTE(review): min/max are taken over every non-missing value, so any
    # placeholder year stored in the data (a minimum of 0 was printed by the
    # recorded run) shows up here; confirm whether such rows should be excluded.
    year_range = df['year_published'].dropna()
    if not year_range.empty:
        earliest, latest = int(year_range.min()), int(year_range.max())
        print(f"Year range: {earliest} - {latest}")

if 'average_rating' in df.columns:
    avg_rating = df['average_rating'].mean()
    print(f"Average rating: {avg_rating:.2f}")

if 'users_rated' in df.columns:
    users_rated = df['users_rated']
    total_ratings, avg_ratings_per_game = users_rated.sum(), users_rated.mean()
    print(f"Total ratings: {total_ratings:,.0f}")
    print(f"Average ratings per game: {avg_ratings_per_game:.0f}")

Total games: 7575
Year range: 0 - 2025
Average rating: 6.69
Total ratings: 16,134,960
Average ratings per game: 2130


In [8]:
# List every PNG currently in the plots directory, numbered in alphabetical
# order, followed by the total count.
plot_files = [entry for entry in os.listdir('plots') if entry.endswith('.png')]

for position, filename in enumerate(sorted(plot_files), start=1):
    print(f"  {position}. {filename}")

print(f"\nTotal plots generated: {len(plot_files)}")
print("All visualizations completed successfully!")

  1. correlation_heatmap.png
  2. numerical_distributions.png
  3. rating_analysis.png
  4. recommendations_dashboard.png
  5. temporal_analysis.png
  6. top_10_artists.png
  7. top_10_categories.png
  8. top_10_designers.png
  9. top_10_mechanisms.png
  10. top_10_publishers.png

Total plots generated: 10
All visualizations completed successfully!
