# Data Exploration: Alibaba Cloud Trace

This notebook explores the Alibaba cluster trace dataset to understand microservice interactions.

In [None]:
# Data handling, plotting, and interpreter path setup for the whole notebook.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
# Extend the module search path with ../src (resolved relative to the kernel's
# working directory). Presumably data_loader and preprocessor live there — verify.
sys.path.append('../src')

from data_loader import TraceDataLoader
# NOTE(review): TracePreprocessor is not referenced in the cells visible here.
from preprocessor import TracePreprocessor

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's 'darkgrid' style for subsequent plots.
sns.set_style('darkgrid')

## 1. Load Data

In [None]:
# Read the raw trace files through the project loader.
# NOTE(review): the calls below (len, .columns, .head) assume load_trace_files()
# returns a pandas DataFrame — confirm against data_loader.
loader = TraceDataLoader('../data/raw/')
traces = loader.load_trace_files()

# Report the record count and the column names, separated by a blank line.
print(
    f"Loaded {len(traces)} trace records",
    f"\nColumns: {traces.columns.tolist()}",
    sep="\n",
)

# Last expression of the cell: preview the first rows.
traces.head()

## 2. Basic Statistics

In [None]:
# Basic statistics
# Each optional column is checked BEFORE it is accessed, so a missing column
# produces 'N/A' instead of a KeyError. (Previously the 'timestamp' guard only
# covered .max(); .min() was evaluated unconditionally.)
print("Dataset Statistics:")
print(f"Total traces: {len(traces)}")

unique_services = traces['service'].nunique() if 'service' in traces.columns else 'N/A'
print(f"Unique services: {unique_services}")

if 'timestamp' in traces.columns:
    time_range = f"{traces['timestamp'].min()} to {traces['timestamp'].max()}"
else:
    time_range = 'N/A'
print(f"Time range: {time_range}")

# Last expression of the cell: summary statistics of the frame.
traces.describe()

## 3. Latency Analysis

In [None]:
# Latency distribution
# Plots a histogram and a box plot of the 'latency' column and prints its
# P50/P90/P95/P99 values. Skipped entirely when the column is absent.
if 'latency' in traces.columns:
    # Drop missing values once up front: NaNs make np.percentile return nan
    # and leave the box plot without usable statistics.
    latency = traces['latency'].dropna()

    if latency.empty:
        # Nothing to plot or summarize; say so instead of drawing empty axes.
        print("No non-null latency values to analyze.")
    else:
        fig, axes = plt.subplots(1, 2, figsize=(15, 5))

        # Histogram
        axes[0].hist(latency, bins=50, edgecolor='black')
        axes[0].set_xlabel('Latency (ms)')
        axes[0].set_ylabel('Frequency')
        axes[0].set_title('Latency Distribution')

        # Box plot
        axes[1].boxplot(latency)
        axes[1].set_ylabel('Latency (ms)')
        axes[1].set_title('Latency Box Plot')

        plt.tight_layout()
        plt.show()

        # Percentiles — computed in a single vectorized call.
        print("\nLatency Percentiles:")
        percentiles = [50, 90, 95, 99]
        for p, value in zip(percentiles, np.percentile(latency, percentiles)):
            print(f"P{p}: {value:.2f} ms")

## 4. Service Analysis

In [None]:
# Service call frequency
# Bar chart of the ten most frequent values in the 'service' column;
# skipped when the column is absent.
if 'service' in traces.columns:
    # value_counts() sorts by descending count, so the first ten rows are the top ten.
    service_counts = traces['service'].value_counts().iloc[:10]

    # Explicit figure/axes handles instead of the implicit pyplot state.
    service_fig, service_ax = plt.subplots(figsize=(12, 6))
    service_counts.plot(kind='bar', ax=service_ax)
    service_ax.set_xlabel('Service')
    service_ax.set_ylabel('Call Count')
    service_ax.set_title('Top 10 Services by Call Frequency')

    # Tilt the tick labels so long service names do not overlap.
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

## 5. Data Quality Check

In [None]:
# Per-column count of missing values.
print("Missing Values:", traces.isna().sum(), sep="\n")

# Number of rows that are exact duplicates of an earlier row.
duplicates = traces.duplicated().sum()
print(f"\nDuplicate records: {duplicates}")

# Column dtypes as currently held by the frame.
print("\nData Types:", traces.dtypes, sep="\n")