# PT XYZ Data Warehouse Analysis

This notebook provides analysis capabilities for the PT XYZ mining data warehouse.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pyodbc
from sqlalchemy import create_engine
import warnings
# Install an 'ignore' filter with no category/module restriction, i.e. every
# warning raised after this line is suppressed.
# NOTE(review): presumably meant to keep cell output tidy; it also hides
# deprecation warnings from pandas / matplotlib / seaborn -- confirm intended.
warnings.filterwarnings('ignore')

# Set plotting style
# Apply matplotlib's 'default' style sheet, then make "husl" the seaborn
# color palette used by subsequent plots.
plt.style.use('default')
sns.set_palette("husl")

## Database Connection Setup

In [None]:
# Database connection parameters.
# Each value can be overridden through an environment variable so that real
# credentials do not have to be edited into the notebook; the literals are
# only the fallbacks used when the variable is unset.
# NOTE(review): the fallback is the 'sa' login with a password committed in
# source -- rotate it and drop the default once the environment is configured.
import os
from urllib.parse import quote

server = os.environ.get('DW_PTXYZ_SERVER', 'sqlserver,1433')
database = os.environ.get('DW_PTXYZ_DATABASE', 'DW_PTXYZ')
username = os.environ.get('DW_PTXYZ_USERNAME', 'sa')
password = os.environ.get('DW_PTXYZ_PASSWORD', 'PTXYZDataWarehouse2025!')

# Create connection string.
# The credentials are percent-encoded because characters such as '@', ':' or
# '/' inside a username or password would otherwise be read as URL delimiters
# and corrupt the connection string. safe="" makes quote() escape '/' as well.
conn_str = (
    f'mssql+pyodbc://{quote(username, safe="")}:{quote(password, safe="")}'
    f'@{server}/{database}?driver=ODBC+Driver+17+for+SQL+Server'
)

# NOTE: per the SQLAlchemy docs, create_engine() does not open a connection;
# the first real connection attempt happens when the engine is first used.
engine = create_engine(conn_str)

print("Database connection configured successfully!")

## Data Loading and Analysis

In [None]:
# Read the production and equipment CSV extracts, then report the shape and
# the column names of each. Any failure (while reading or reporting) is caught
# and printed instead of stopping the notebook.
try:
    production_df = pd.read_csv('/home/jovyan/work/data/dataset_production.csv')
    equipment_df = pd.read_csv('/home/jovyan/work/data/dataset_alat_berat_dw.csv')

    loaded_frames = (("Production", production_df), ("Equipment", equipment_df))

    # Shapes of both frames first ...
    for label, frame in loaded_frames:
        print(f"{label} Data Shape:", frame.shape)

    # ... followed by the column list of each frame
    for label, frame in loaded_frames:
        print(f"\n{label} Data Columns:")
        print(frame.columns.tolist())

except Exception as e:
    print(f"Error loading data: {e}")

In [None]:
# Preview the production dataset: its leading rows, then the DataFrame.info()
# report. Skipped entirely when production_df does not exist in this session.
if 'production_df' in locals():
    production_preview = production_df.head(n=5)
    display(production_preview)
    print("\nProduction Data Info:")
    production_df.info()

In [None]:
# Preview the equipment dataset: its leading rows, then the DataFrame.info()
# report. Skipped entirely when equipment_df does not exist in this session.
if 'equipment_df' in locals():
    equipment_preview = equipment_df.head(n=5)
    display(equipment_preview)
    print("\nEquipment Data Info:")
    equipment_df.info()

## Production Analysis

In [None]:
# Production analysis visualizations: a 2x2 grid of overview charts.
# Each panel is drawn only when the columns it needs are present, so a panel
# may stay empty for a dataset that lacks them.
if 'production_df' in locals() and not production_df.empty:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('PT XYZ Production Analysis', fontsize=16)

    # Example plots - adjust based on actual data columns
    # You'll need to modify these based on your actual data structure

    # Parse the date column first (in place, as this cell always did) so it is
    # typed as datetime before the numeric columns are selected below.
    if 'date' in production_df.columns:
        production_df['date'] = pd.to_datetime(production_df['date'])

    numeric_cols = production_df.select_dtypes(include=[np.number]).columns

    # Plot 1: Production over time (if date column exists)
    if 'date' in production_df.columns and len(numeric_cols) > 0:
        # Aggregate one explicitly numeric column. Summing the whole frame and
        # then taking column 0 can select a text column, whose "sum" on recent
        # pandas versions is a string concatenation (or a TypeError) rather
        # than a production figure.
        trend_col = numeric_cols[0]
        daily_production = production_df.groupby('date')[trend_col].sum()
        axes[0,0].plot(daily_production.index, daily_production.values)
        axes[0,0].set_ylabel(trend_col)
        axes[0,0].set_title('Daily Production Trend')
        axes[0,0].tick_params(axis='x', rotation=45)

    # Plot 2: Production by material type (if material column exists)
    if 'material' in production_df.columns:
        # The grouping key itself must not be used as the value being summed
        material_value_cols = [col for col in numeric_cols if col != 'material']
        if material_value_cols:
            material_col = material_value_cols[0]
            material_production = production_df.groupby('material')[material_col].sum()
            # Cast the categories to str so mixed-type labels cannot break bar()
            axes[0,1].bar(material_production.index.astype(str), material_production.values)
            axes[0,1].set_ylabel(material_col)
            axes[0,1].set_title('Production by Material Type')
            axes[0,1].tick_params(axis='x', rotation=45)

    # Plot 3: Production distribution
    if len(numeric_cols) > 0:
        # Drop missing values so NaNs never reach the histogram binning
        axes[1,0].hist(production_df[numeric_cols[0]].dropna(), bins=20, alpha=0.7)
        axes[1,0].set_title(f'Distribution of {numeric_cols[0]}')

    # Plot 4: Correlation heatmap
    if len(numeric_cols) > 1:
        corr_matrix = production_df[numeric_cols].corr()
        sns.heatmap(corr_matrix, annot=True, ax=axes[1,1], cmap='coolwarm')
        axes[1,1].set_title('Production Data Correlation')

    plt.tight_layout()
    plt.show()

## Equipment Analysis

In [None]:
# Equipment analysis visualizations: a 2x2 grid built from the numeric columns
# of equipment_df. Nothing is drawn unless at least two numeric columns exist;
# the correlation heatmap additionally needs more than two.
if 'equipment_df' in locals() and not equipment_df.empty:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('PT XYZ Equipment Analysis', fontsize=16)

    numeric_cols = equipment_df.select_dtypes(include=[np.number]).columns

    if len(numeric_cols) >= 2:
        # Equipment efficiency scatter plot (first numeric column vs. second)
        axes[0,0].scatter(equipment_df[numeric_cols[0]], equipment_df[numeric_cols[1]], alpha=0.6)
        axes[0,0].set_xlabel(numeric_cols[0])
        axes[0,0].set_ylabel(numeric_cols[1])
        axes[0,0].set_title('Equipment Performance Scatter')

        # Equipment utilization distribution
        # Missing values are dropped here as well, matching the box plot below
        axes[0,1].hist(equipment_df[numeric_cols[0]].dropna(), bins=15, alpha=0.7, color='orange')
        axes[0,1].set_title(f'Distribution of {numeric_cols[0]}')

        # Box plot for equipment performance (up to the first three columns).
        # The tick labels are set separately instead of through boxplot()'s
        # `labels=` keyword, which Matplotlib 3.9 deprecated in favour of
        # `tick_labels=`; this form works on both older and newer versions.
        box_cols = list(numeric_cols[:3])
        axes[1,0].boxplot([equipment_df[col].dropna() for col in box_cols])
        # boxplot() places the boxes at positions 1..N by default
        axes[1,0].set_xticks(list(range(1, len(box_cols) + 1)))
        axes[1,0].set_xticklabels([str(col) for col in box_cols])
        axes[1,0].set_title('Equipment Performance Box Plot')
        axes[1,0].tick_params(axis='x', rotation=45)

        # Correlation heatmap
        if len(numeric_cols) > 2:
            corr_matrix = equipment_df[numeric_cols].corr()
            sns.heatmap(corr_matrix, annot=True, ax=axes[1,1], cmap='viridis')
            axes[1,1].set_title('Equipment Data Correlation')

    plt.tight_layout()
    plt.show()

## SQL Queries for Data Warehouse Analysis

In [None]:
# Example SQL queries for data warehouse analysis.
# Maps a short query name to the SQL text; nothing is executed in this cell
# unless the block at the bottom is uncommented.
queries = {
    # Total and average produced volume per material
    'production_summary': """
        SELECT 
            material_name,
            SUM(volume_produced) as total_production,
            AVG(volume_produced) as avg_production
        FROM fact_production fp
        JOIN dim_material dm ON fp.material_key = dm.material_key
        GROUP BY material_name
        ORDER BY total_production DESC
    """,
    
    # Operating vs. idle hours per equipment, with the operating share in percent.
    # - NULLIF(..., 0) yields NULL instead of a divide-by-zero error for
    #   equipment whose operating and idle hours both sum to 0.
    # - Multiplying by 100.0 first keeps the division out of integer arithmetic
    #   (relevant if the hour columns are integer-typed -- schema not shown here).
    'equipment_utilization': """
        SELECT 
            equipment_name,
            equipment_type,
            SUM(operating_hours) as total_operating_hours,
            SUM(idle_hours) as total_idle_hours,
            (100.0 * SUM(operating_hours)
                / NULLIF(SUM(operating_hours) + SUM(idle_hours), 0)) as utilization_percent
        FROM fact_equipment_usage feu
        JOIN dim_equipment de ON feu.equipment_key = de.equipment_key
        GROUP BY equipment_name, equipment_type
        ORDER BY utilization_percent DESC
    """,
    
    # Produced volume per calendar month, in chronological order
    'monthly_production': """
        SELECT 
            dt.year,
            dt.month,
            SUM(fp.volume_produced) as monthly_production
        FROM fact_production fp
        JOIN dim_time dt ON fp.time_key = dt.time_key
        GROUP BY dt.year, dt.month
        ORDER BY dt.year, dt.month
    """
}

# Execute queries (uncomment when database is ready)
# for query_name, query in queries.items():
#     try:
#         result = pd.read_sql(query, engine)
#         print(f"\n{query_name.upper()} RESULTS:")
#         display(result)
#     except Exception as e:
#         print(f"Error executing {query_name}: {e}")

print("SQL queries defined. Uncomment the execution block when database is ready.")

## Summary Statistics

In [None]:
# Print a per-dataset overview -- record count, column count and describe()
# of the numeric columns -- for whichever of the two DataFrames exist in the
# current session.
print("=== PT XYZ DATA WAREHOUSE SUMMARY ===")

for dataset_label, frame_name in (("Production", "production_df"),
                                  ("Equipment", "equipment_df")):
    if frame_name not in locals():
        continue  # this frame was never created in the session; skip it
    summary_frame = locals()[frame_name]

    print(f"\n{dataset_label} Data:")
    print(f"- Total records: {len(summary_frame)}")
    print(f"- Columns: {len(summary_frame.columns)}")

    numeric_cols = summary_frame.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) == 0:
        continue  # nothing numeric to describe
    print("- Numeric columns summary:")
    display(summary_frame[numeric_cols].describe())