# Steam Games - Exploratory Data Analysis

Este notebook realiza un análisis exploratorio del dataset de videojuegos de Steam.

## Contenido
1. Carga de datos
2. Información del dataset
3. Estadísticas descriptivas
4. Visualizaciones
5. Análisis de correlaciones
6. Conclusiones del EDA

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Add src to path
sys.path.append('../src')
from utils import display_dataset_info, parse_owners_range

# Plot configuration: seaborn 'whitegrid' style and a 12x6 default figure size.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
%matplotlib inline

## 1. Carga de Datos

In [None]:
# Load the raw Steam games CSV (the path is relative to the notebook's directory).
raw_csv_path = '../data/raw/steam_games.csv'
df = pd.read_csv(raw_csv_path)

# Report the frame's dimensions, then let the notebook render the first rows.
print("Dataset shape: {}".format(df.shape))
print("\nFirst rows:")
df.head()

## 2. Información del Dataset

In [None]:
# General dataset information, produced by the project helper imported from
# `utils` (the exact output is defined by that helper).
display_dataset_info(df)

In [None]:
# Cardinality and the five most frequent values of each categorical column.
categorical_cols = ['genres', 'categories', 'platforms', 'developer', 'publisher']

# Columns that are absent from the dataset are skipped.
present_categoricals = [name for name in categorical_cols if name in df.columns]

for name in present_categoricals:
    distinct_count = df[name].nunique()
    top_five = df[name].value_counts().head(5).to_dict()
    print(f"{name}: {distinct_count} valores únicos")
    print(f"  Top 5: {top_five}\n")

## 3. Estadísticas Descriptivas

In [None]:
# Descriptive statistics via DataFrame.describe() with its default arguments.
df.describe()

In [None]:
# Derive 'owners_mid' by applying the project helper parse_owners_range to every
# 'owners' value (presumably the numeric midpoint of the range -- see utils),
# then print its summary statistics.
has_owners = 'owners' in df.columns
if has_owners:
    df['owners_mid'] = df['owners'].apply(parse_owners_range)
    owners_summary = df['owners_mid'].describe()
    print("Estadísticas de owners_mid:", owners_summary, sep="\n")

## 4. Visualizaciones

In [None]:
# Price distribution: every game (left panel) vs. paid games only (right panel).
# The figure is created inside the guard so that a dataset without a 'price'
# column does not render a pair of empty axes.
if 'price' in df.columns:
    prices = df['price'].dropna()
    # Mask the Series directly instead of chaining df[mask]['price'];
    # missing prices were already dropped above.
    paid_games = prices[prices > 0]

    fig, axes = plt.subplots(1, 2, figsize=(16, 5))

    # All prices
    axes[0].hist(prices, bins=50, edgecolor='black', alpha=0.7)
    axes[0].set_xlabel('Price')
    axes[0].set_ylabel('Frequency')
    axes[0].set_title('Price Distribution (All Games)')

    # Paid games only (price > 0)
    axes[1].hist(paid_games, bins=50, edgecolor='black', alpha=0.7, color='orange')
    axes[1].set_xlabel('Price')
    axes[1].set_ylabel('Frequency')
    axes[1].set_title('Price Distribution (Paid Games Only)')

    plt.tight_layout()
    plt.show()

In [None]:
# Owners distribution on a linear scale (left) and as log10(owners + 1) (right).
if 'owners_mid' in df.columns:
    owners_values = df['owners_mid'].dropna()

    fig, axes = plt.subplots(1, 2, figsize=(16, 5))
    linear_ax, log_ax = axes

    # Linear scale
    linear_ax.hist(owners_values, bins=50, edgecolor='black', alpha=0.7)
    linear_ax.set(
        xlabel='Owners (Mid Range)',
        ylabel='Frequency',
        title='Owners Distribution',
    )

    # Logarithmic scale; the +1 keeps zero-owner entries defined under log10.
    log_ax.hist(np.log10(owners_values + 1), bins=50, edgecolor='black', alpha=0.7, color='green')
    log_ax.set(
        xlabel='log10(Owners + 1)',
        ylabel='Frequency',
        title='Owners Distribution (Log Scale)',
    )

    plt.tight_layout()
    plt.show()

In [None]:
# Ratings: total review volume (log scale) and share of positive reviews.
if 'positive_ratings' in df.columns and 'negative_ratings' in df.columns:
    df['total_ratings'] = df['positive_ratings'] + df['negative_ratings']

    # A game without any rating has an undefined positive ratio. Keep it as NaN
    # instead of adding an epsilon to the denominator: the epsilon turned those
    # games into ratio == 0 ("100% negative"), which put a spurious spike at 0
    # in the histogram and biased every statistic computed from this column.
    rated_totals = df['total_ratings'].where(df['total_ratings'] > 0)
    df['positive_ratio'] = df['positive_ratings'] / rated_totals

    fig, axes = plt.subplots(1, 2, figsize=(16, 5))

    # Total ratings; the +1 keeps unrated games defined under log10. Missing
    # values are dropped explicitly, as the other histograms in this notebook do.
    axes[0].hist(np.log10(df['total_ratings'].dropna() + 1), bins=50,
                 edgecolor='black', alpha=0.7, color='purple')
    axes[0].set_xlabel('log10(Total Ratings + 1)')
    axes[0].set_ylabel('Frequency')
    axes[0].set_title('Total Ratings Distribution (Log Scale)')

    # Positive ratio, restricted to games that have at least one rating
    # (the NaN entries produced above are removed by dropna).
    axes[1].hist(df['positive_ratio'].dropna(), bins=50, edgecolor='black', alpha=0.7, color='teal')
    axes[1].set_xlabel('Positive Ratio')
    axes[1].set_ylabel('Frequency')
    axes[1].set_title('Positive Ratings Ratio Distribution')

    plt.tight_layout()
    plt.show()

In [None]:
# Top genres: a game may list several genres in one delimited string.
if 'genres' in df.columns:
    # Split multi-genre strings into individual genres.
    # NOTE(review): some Steam exports delimit genres with ';' rather than ',',
    # so both separators are accepted -- confirm against the raw CSV.
    # Non-string entries are ignored, and blank tokens (e.g. from a trailing
    # separator) are skipped so that '' is never counted as a genre.
    all_genres = [
        genre.strip()
        for genres in df['genres'].dropna()
        if isinstance(genres, str)
        for genre in genres.replace(';', ',').split(',')
        if genre.strip()
    ]

    # Explicit dtype keeps the Series well-defined when no genre was found.
    genre_counts = pd.Series(all_genres, dtype='object').value_counts().head(15)

    if genre_counts.empty:
        # Plotting an empty Series would fail; report it instead.
        print("No genre values found in the 'genres' column.")
    else:
        fig, ax = plt.subplots(figsize=(12, 6))
        genre_counts.plot(kind='barh', color='steelblue', ax=ax)
        ax.set_xlabel('Number of Games')
        ax.set_ylabel('Genre')
        ax.set_title('Top 15 Genres')
        # Most frequent genre at the top of the chart.
        ax.invert_yaxis()
        plt.tight_layout()
        plt.show()

## 5. Análisis de Correlaciones

In [None]:
# Correlation heatmap over the candidate features that exist in the dataset.
numeric_cols = list(filter(
    lambda name: name in df.columns,
    ['price', 'positive_ratings', 'negative_ratings', 'owners_mid', 'total_ratings', 'positive_ratio'],
))

# A correlation matrix needs at least two columns.
if len(numeric_cols) >= 2:
    corr_matrix = df[numeric_cols].corr()

    heatmap_options = dict(
        annot=True,
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=1,
        cbar_kws={"shrink": 0.8},
    )

    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, **heatmap_options)
    plt.title('Correlation Matrix')
    plt.tight_layout()
    plt.show()

In [None]:
# Scatter plots of log10(owners + 1) against price and against positive ratio.
if 'price' in df.columns and 'owners_mid' in df.columns:
    # Computed once and shared by both panels; the +1 keeps zero owners defined.
    log_owners = np.log10(df['owners_mid'] + 1)

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Price vs Owners
    axes[0].scatter(df['price'], log_owners, alpha=0.3)
    axes[0].set_xlabel('Price')
    axes[0].set_ylabel('log10(Owners + 1)')
    axes[0].set_title('Price vs Owners')
    axes[0].grid(alpha=0.3)

    # Positive ratio vs Owners
    if 'positive_ratio' in df.columns:
        axes[1].scatter(df['positive_ratio'], log_owners, alpha=0.3, color='orange')
        axes[1].set_xlabel('Positive Ratio')
        axes[1].set_ylabel('log10(Owners + 1)')
        axes[1].set_title('Positive Ratio vs Owners')
        axes[1].grid(alpha=0.3)
    else:
        # No ratio column available: hide the second panel instead of
        # rendering an empty, unlabeled set of axes.
        axes[1].set_visible(False)

    plt.tight_layout()
    plt.show()

## 6. Conclusiones del EDA

En esta sección resume tus hallazgos:
- ¿Cómo se distribuyen las variables principales?
- ¿Hay valores atípicos?
- ¿Qué correlaciones son significativas?
- ¿Qué necesidades de preprocesamiento identificaste?