In [None]:
import pandas as pd
import numpy as np

# Load the raw sales table from the CSV export.
df = pd.read_csv('vgchartz-2024.csv')


def _join_platforms(consoles: pd.Series) -> str:
    """Return the distinct, non-missing consoles as a sorted ', '-joined string.

    Missing values are dropped first: a NaN mixed with strings makes
    ``sorted`` raise ``TypeError``, and ``str.join`` rejects non-string items.
    A group with no usable console yields an empty string.
    """
    return ', '.join(sorted(consoles.dropna().astype(str).unique()))


# Collapse the table to one row per title. groupby() drops rows whose title
# is missing (pandas default dropna=True).
df_aggregated = df.groupby('title').agg({
    # 'first' picks the first non-null entry in each group; games usually
    # keep the same genre/publisher/developer across platforms.
    'genre': 'first',
    'publisher': 'first',
    'developer': 'first',
    'critic_score': 'mean',  # Average of the non-missing critic scores
    # NOTE: 'sum' skips NaN, so a title whose sales are all missing gets 0,
    # not NaN -- "no data" and "zero sales" are indistinguishable afterwards.
    'total_sales': 'sum',
    'na_sales': 'sum',
    'jp_sales': 'sum',
    'pal_sales': 'sum',
    'other_sales': 'sum',
    'console': _join_platforms,  # All platforms, de-duplicated and sorted
    # Earliest release date. NOTE(review): assumes the column sorts
    # chronologically as loaded (e.g. ISO YYYY-MM-DD strings) -- confirm.
    'release_date': 'min',
}).reset_index()

# Add new features.

# Number of platforms per title, derived from the ', '-joined console string:
# separators + 1. An empty platform string counts as zero platforms.
_console_list = df_aggregated['console']
df_aggregated['platform_count'] = (
    (_console_list.str.count(',') + 1).where(_console_list != '', 0)
)

# Regional share of total sales. Zero totals are masked to NaN before
# dividing so the ratios come out as NaN ("undefined") rather than inf,
# which would otherwise make the describe() mean/std non-finite.
_nonzero_total = df_aggregated['total_sales'].where(
    df_aggregated['total_sales'] != 0
)
df_aggregated['sales_ratio_na'] = df_aggregated['na_sales'] / _nonzero_total
df_aggregated['sales_ratio_jp'] = df_aggregated['jp_sales'] / _nonzero_total
df_aggregated['sales_ratio_pal'] = df_aggregated['pal_sales'] / _nonzero_total
df_aggregated['sales_ratio_other'] = df_aggregated['other_sales'] / _nonzero_total

# Report the dataset dimensions before and after aggregation.
for _caption, _frame in (
    ("Original dataset shape:", df),
    ("\nAggregated dataset shape:", df_aggregated),
):
    print(_caption, _frame.shape)

# Preview the aggregated table, then print its describe() summary; each
# heading goes on its own line above the table it introduces.
print("\nFirst few rows of aggregated data:", df_aggregated.head(), sep="\n")
print("\nBasic statistics of numerical columns:", df_aggregated.describe(), sep="\n")