
# Google Cluster Data — Starter EDA

This notebook loads a small sample of the **Google Cluster Data (2019)** `task_usage` table and performs first-pass EDA suitable for cloud autoscaling research.

**Expected file (created by your setup script):**
```
google_cluster_data/data_sample/part-00000-of-00500.csv
```
If the path differs, update the `DATA_PATH` variable below.


In [None]:

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Inline figures
%matplotlib inline

# Display options
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 120)

DATA_PATH = "google_cluster_data/data_sample/part-00000-of-00500.csv"
assert os.path.exists(DATA_PATH), f"Data file not found at {DATA_PATH}. Please run the setup script or correct the path."


In [None]:

# Load a subset first for speed; set nrows = None to load the entire file.
# NOTE(review): read_csv treats the first row as the header here. If the shard
# has no header row, pass header=None and names=[...] instead - confirm against
# the setup script that produced the file.
nrows = 500000
df = pd.read_csv(DATA_PATH, nrows=nrows)

# Print the shape as text and leave the preview as the cell's last expression so
# it renders as a rich table (inside a tuple it falls back to the plain-text repr).
print("Loaded shape:", df.shape)
df.head()


In [None]:

# Structural overview: column names, frame size, and the columns with the most
# missing values, followed by summary statistics for the first 20 columns.
column_names = list(df.columns)
worst_nulls = df.isna().sum().sort_values(ascending=False).head(20)

print("Columns:", column_names)
print("\nDataFrame shape:", df.shape)
print("\nNull counts:\n", worst_nulls)

# include='all' covers non-numeric columns too; transpose so each row is a column of df.
summary = df.describe(include='all').T
summary.head(20)


In [None]:

# Usage metrics we would like to analyse; keep only those present in this file
# so the rest of the notebook can adapt to the columns that actually exist.
candidate_numeric = ['cpu_rate', 'mem_usage', 'disk_io_time', 'disk_space', 'assigned_memory', 'page_cache']
numeric_cols = [c for c in candidate_numeric if c in df.columns]
print("Numeric columns found:", numeric_cols)

# describe() raises ValueError on a frame with no columns, so only summarise
# when at least one candidate column was found.
if numeric_cols:
    display(df[numeric_cols].describe())
else:
    print("None of the candidate numeric columns are present; skipping summary statistics.")


In [None]:

# Pairwise correlation between the usage metrics found above.
if len(numeric_cols) >= 2:
    corr = df[numeric_cols].corr()
    # A bare expression inside an if-block is not auto-displayed by the notebook
    # (only a trailing top-level expression is), so render the matrix explicitly.
    display(corr)
else:
    print("Not enough numeric columns found for correlation matrix.")


In [None]:

# One 50-bin histogram per usage metric, each on its own figure.
for column in numeric_cols:
    fig, ax = plt.subplots()
    values = df[column].dropna()
    values.hist(bins=50, ax=ax)
    ax.set_title(f"Distribution of {column}")
    ax.set_xlabel(column)
    ax.set_ylabel("Count")
    plt.show()


In [None]:

# Convert raw numeric timestamps to pandas datetimes in place so that later
# cells can index and resample on them.
# NOTE(review): the unit is guessed purely from magnitude (mean > 1e12 -> ns,
# otherwise seconds). Traces that record microseconds or milliseconds would be
# misread by this heuristic - confirm the unit against the trace schema.
for tcol in ['start_time', 'end_time']:
    if tcol not in df.columns:
        continue
    if pd.api.types.is_datetime64_any_dtype(df[tcol]):
        # Already converted by an earlier run of this cell. Skipping keeps the
        # cell idempotent: casting datetimes through astype(float) would
        # otherwise fail or reinterpret the converted values.
        continue
    # Heuristic: if values are very large, assume nanoseconds; else seconds.
    unit = 'ns' if df[tcol].dropna().astype(float).mean() > 1e12 else 's'
    # errors='coerce' turns unparseable entries into NaT instead of raising.
    df[tcol] = pd.to_datetime(df[tcol], unit=unit, origin='unix', errors='coerce')

time_cols = [c for c in ['start_time', 'end_time'] if c in df.columns]
time_cols


In [None]:

# Hourly mean CPU rate over time.
# Requires 'start_time' to already be a datetime column (see the timestamp
# conversion cell); resample() only works on a datetime-like index.
if 'start_time' in df.columns and 'cpu_rate' in df.columns:
    ts = df[['start_time', 'cpu_rate']].dropna()
    ts = ts.set_index('start_time').sort_index()
    # Use a Timedelta rather than the '1H' string alias: the uppercase 'H'
    # alias is deprecated in recent pandas releases and emits a FutureWarning.
    hourly = ts['cpu_rate'].resample(pd.Timedelta(hours=1)).mean()
    display(hourly.head())

    fig, ax = plt.subplots()
    hourly.plot(ax=ax)
    ax.set_title("Mean CPU rate by hour")
    ax.set_xlabel("Time")
    ax.set_ylabel("Mean CPU rate")
    plt.show()
else:
    print("Missing 'start_time' and/or 'cpu_rate' for time-based aggregation.")


In [None]:

# Build simple lag features from a 5-minute mean CPU series and inspect how
# strongly the current value correlates with its recent past.
# Requires 'start_time' to already be a datetime column (see the timestamp
# conversion cell); resample() only works on a datetime-like index.
if 'start_time' in df.columns and 'cpu_rate' in df.columns:
    cpu_by_time = df[['start_time', 'cpu_rate']].dropna().set_index('start_time').sort_index()['cpu_rate']
    # Use a Timedelta rather than the '5T' string alias: the 'T' alias is
    # deprecated in recent pandas releases and emits a FutureWarning.
    ts5 = cpu_by_time.resample(pd.Timedelta(minutes=5)).mean()
    # Empty 5-minute bins come back as NaN; fill them in both directions so the
    # shifted copies below line up on a gap-free series.
    ts5 = ts5.interpolate(limit_direction='both')
    lagged = pd.DataFrame({
        'cpu_t': ts5,
        'cpu_t_1': ts5.shift(1),    # 5 minutes back
        'cpu_t_2': ts5.shift(2),    # 10 minutes back
        'cpu_t_12': ts5.shift(12)   # one hour back at 5-minute frequency
    }).dropna()  # drops the leading rows whose lags are undefined
    display(lagged.head())
    print("\nLag correlations:")
    display(lagged.corr())
else:
    print("Cannot compute lag features without 'start_time' and 'cpu_rate'.")


In [None]:

# Rank entities by average CPU rate, grouping on the first identifier column
# that is present (machine, then job, then task index).
group_keys = [name for name in ['machine_id', 'job_id', 'task_index'] if name in df.columns]
can_group = 'cpu_rate' in df.columns and len(group_keys) > 0

if not can_group:
    print("No grouping keys ('machine_id', 'job_id', 'task_index') found for grouping demo, or 'cpu_rate' missing.")
else:
    key = group_keys[0]
    # Drop rows missing either the key or the metric before averaging.
    mean_cpu = df[[key, 'cpu_rate']].dropna().groupby(key)['cpu_rate'].mean()
    g = mean_cpu.sort_values(ascending=False).head(10)
    print(f"Top 10 {key} by mean CPU_rate:")
    display(g)


In [None]:

# Persist a reproducible random subset (at most 200,000 rows, fixed seed) as
# Parquet so follow-up notebooks can reload it quickly with dtypes intact.
out_parquet = "google_cluster_data/data_sample/sample_eda.parquet"

# The output folder is hard-coded, so it may not exist when DATA_PATH was
# pointed somewhere else; create it rather than failing on the write.
os.makedirs(os.path.dirname(out_parquet), exist_ok=True)

small = df.sample(min(len(df), 200000), random_state=17)
# to_parquet needs a Parquet engine (pyarrow or fastparquet) installed.
small.to_parquet(out_parquet, index=False)
print(f"Saved sample to {out_parquet}")



## Next Steps

- Expand the sample: load additional `task_usage` parts and concatenate.
- Join with `task_events` and `machine_events` (if available) to enrich context (scheduling, capacity changes).
- Engineer autoscaling features (rolling means, percentiles, sustained high-CPU durations).
- Prototype policies:
  - Threshold-based: scale up when CPU > X% for Y minutes; scale down when < Z% for Y minutes.
  - Predictive: forecast CPU 5–15 minutes ahead and make proactive decisions.
  - RL: define state (recent utilization window), actions (scale up/down/hold), reward (SLO adherence minus cost).
