
# Google Cluster Data — Colab EDA (Auto-Download)

This notebook is designed for **Google Colab**. It will:
- Create a workspace under `/content/google_cluster_data`
- **Download** the 2019 Google Cluster Data schema and a small sample of `task_usage`
- **Decompress** the sample
- Run first-pass EDA geared toward **cloud autoscaling**

> Tip: If you want files to persist, mount Google Drive in the next cell.


In [None]:

# (Optional) Mount Google Drive to persist files between sessions.
# Uncomment the next two lines if you'd like to save under /content/drive/MyDrive
# from google.colab import drive
# drive.mount('/content/drive')


In [None]:

import os, sys, gzip, shutil, urllib.request, subprocess, io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Matplotlib inline in Colab
%matplotlib inline

# Choose a root directory; change to a Drive path if mounted, e.g. '/content/drive/MyDrive/google_cluster_data'
ROOT_DIR = "/content/google_cluster_data"
DOCS_DIR = os.path.join(ROOT_DIR, "docs")         # schema / documentation files
DATA_DIR = os.path.join(ROOT_DIR, "data_sample")  # downloaded trace sample

SCHEMA_URL = "https://storage.googleapis.com/google-clusterdata-2019/schema.csv"
SAMPLE_URL = "https://storage.googleapis.com/google-clusterdata-2019/task_usage/part-00000-of-00500.csv.gz"

# Local file names are derived from the URLs so that pointing SAMPLE_URL at a
# different part file cannot leave the output paths naming the old part.
_sample_gz_name = os.path.basename(SAMPLE_URL)
if _sample_gz_name.endswith(".gz"):
    # Decompressed copy: same name without the ".gz" suffix.
    _sample_csv_name = _sample_gz_name[:-len(".gz")]
else:
    # Keep the two paths distinct even if the URL is not gzip-named.
    _sample_csv_name = _sample_gz_name + ".csv"

SCHEMA_OUT = os.path.join(DOCS_DIR, os.path.basename(SCHEMA_URL))
SAMPLE_GZ_OUT = os.path.join(DATA_DIR, _sample_gz_name)
SAMPLE_CSV_OUT = os.path.join(DATA_DIR, _sample_csv_name)

# Make sure both workspace folders exist; already-existing folders are not an error.
for _workspace_dir in (DOCS_DIR, DATA_DIR):
    os.makedirs(_workspace_dir, exist_ok=True)

def _have(cmd):
    try:
        subprocess.run([cmd, "--version"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=False)
        return True
    except Exception:
        return False

def download(url, out_path):
    """Download `url` to `out_path` using wget/curl if available; otherwise urllib.

    The transfer is written to a sibling temporary file and only moved onto
    `out_path` once a fetcher reports success, so a failed or interrupted
    attempt never leaves a truncated or empty file at the destination.

    Args:
        url: Address of the resource to fetch.
        out_path: Destination file path; replaced only on success.

    Raises:
        Whatever `urllib.request.urlopen` or the file write raises when the
        final urllib fallback fails (for example `urllib.error.URLError`).
    """
    tmp_path = out_path + ".part"
    fetchers = (
        ("wget", ["wget", "-q", "-O", tmp_path, url]),
        # -f makes curl exit non-zero on HTTP errors instead of saving the
        # server's error page as if it were the requested file.
        ("curl", ["curl", "-f", "-L", "-sS", "-o", tmp_path, url]),
    )
    try:
        for tool, argv in fetchers:
            if not _have(tool):
                continue
            code = subprocess.call(argv)
            if code == 0 and os.path.exists(tmp_path):
                os.replace(tmp_path, out_path)
                return
        # Fallback: urllib
        with urllib.request.urlopen(url) as r, open(tmp_path, "wb") as f:
            shutil.copyfileobj(r, f)
        os.replace(tmp_path, out_path)
    finally:
        # Drop whatever a failed attempt left behind; after a successful
        # os.replace the temporary file no longer exists.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

# Fetch each remote artifact in turn and report its size on disk.
for _banner, _src_url, _dest_path in (
    ("==> Downloading schema...", SCHEMA_URL, SCHEMA_OUT),
    ("==> Downloading sample .gz...", SAMPLE_URL, SAMPLE_GZ_OUT),
):
    print(_banner)
    download(_src_url, _dest_path)
    print("    Wrote:", _dest_path, os.path.getsize(_dest_path), "bytes")

print("==> Decompressing sample .gz to CSV...")
# Copy the decompressed stream file-to-file rather than reading it into memory first.
with gzip.open(SAMPLE_GZ_OUT, "rb") as _packed:
    with open(SAMPLE_CSV_OUT, "wb") as _plain:
        shutil.copyfileobj(_packed, _plain)

print("    Wrote:", SAMPLE_CSV_OUT, os.path.getsize(SAMPLE_CSV_OUT), "bytes")

print("\nAll downloads complete. Paths ready:")
for _label, _ready_path in (("  Schema:", SCHEMA_OUT), ("  Sample CSV:", SAMPLE_CSV_OUT)):
    print(_label, _ready_path)



## Load Data
Reads a subset first for speed. Set `nrows=None` to load the entire sample file.


In [None]:

# Reuse the path produced by the download cell so that a changed ROOT_DIR
# (for example a mounted Drive folder) is honoured here as well.
DATA_PATH = SAMPLE_CSV_OUT
assert os.path.exists(DATA_PATH), f"Data file not found at {DATA_PATH}"
nrows = 500_000  # set to None to read the full file
# NOTE(review): read_csv treats the first row as the header. If the part file
# ships without a header row, pass header=None and names=... — confirm against schema.csv.
df = pd.read_csv(DATA_PATH, nrows=nrows)
print("Shape:", df.shape)
# display() renders the preview as a rich table; a (shape, frame) tuple would
# fall back to the plain-text repr of the frame.
display(df.head())



## Basic Info & Nulls


In [None]:

# Structural overview: column names, dimensions, and the columns with the most missing values.
column_names = list(df.columns)
null_counts = df.isna().sum().sort_values(ascending=False)

print("Columns:", column_names)
print("\nShape:", df.shape)
print("\nNull counts (top 20):\n", null_counts.head(20))

# Transposed summary (one row per column), limited to the first 20 columns.
df.describe(include='all').T.head(20)



## Numeric Columns (Common Autoscaling Signals)


In [None]:

# Resource-usage columns of interest; keep only the ones present in this sample.
candidate_numeric = ['cpu_rate', 'mem_usage', 'disk_io_time', 'disk_space', 'assigned_memory', 'page_cache']
available_columns = set(df.columns)
numeric_cols = [name for name in candidate_numeric if name in available_columns]
print("Numeric columns found:", numeric_cols)

if not numeric_cols:
    print("No expected numeric columns found; check schema and column names.")
else:
    display(df[numeric_cols].describe())



## Correlations


In [None]:

# Pairwise correlation between the numeric columns found above.
if len(numeric_cols) >= 2:
    corr = df[numeric_cols].corr()
    # Jupyter only echoes the final top-level expression of a cell; a bare
    # `corr` nested inside this `if` would be evaluated and then discarded.
    display(corr)
else:
    print("Not enough numeric columns to compute correlations.")



## Distributions


In [None]:

# One histogram per numeric column, each drawn on its own figure.
for column_name in numeric_cols:
    fig, ax = plt.subplots()
    df[column_name].dropna().hist(bins=50, ax=ax)
    ax.set_title(f"Distribution of {column_name}")
    ax.set_xlabel(column_name)
    ax.set_ylabel("Count")
    plt.show()



## Time Conversion
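Attempts to convert `start_time` / `end_time` to datetimes, guessing the time unit from the magnitude of the values. The trace often uses **nanoseconds since epoch**, but check the converted values against the schema documentation: a wrong unit guess silently shifts every timestamp.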


In [None]:

# Lower bounds on the mean raw value for each epoch unit. Present-day epoch
# timestamps are roughly 1.7e9 s, 1.7e12 ms, 1.7e15 us and 1.7e18 ns, so
# these cut-offs sit well between neighbouring units.
_EPOCH_UNIT_TIERS = ((1e17, 'ns'), (1e14, 'us'), (1e11, 'ms'))

# NOTE(review): this guess assumes absolute epoch timestamps. If the trace
# stores offsets relative to the trace start, magnitude cannot identify the
# unit — confirm against the schema and set the unit explicitly.
for tcol in ['start_time', 'end_time']:
    if tcol not in df.columns:
        continue
    if pd.api.types.is_datetime64_any_dtype(df[tcol]):
        # Already converted by an earlier run of this cell; converting again
        # would only fail on the float cast and print a misleading warning.
        continue
    try:
        mean_val = df[tcol].dropna().astype(float).mean()
        # First tier whose bound the mean exceeds wins; anything smaller
        # (including NaN from an all-null column) is treated as seconds.
        unit = next((u for bound, u in _EPOCH_UNIT_TIERS if mean_val > bound), 's')
        df[tcol] = pd.to_datetime(df[tcol], unit=unit, origin='unix', errors='coerce')
    except Exception as e:
        # Best effort: leave the column untouched and report why.
        print(f"Warning: could not convert {tcol} -> datetime:", e)

[c for c in ['start_time','end_time'] if c in df.columns]



## Mean CPU by Hour


In [None]:

# Hourly mean of cpu_rate, indexed by start_time.
if not ('start_time' in df.columns and 'cpu_rate' in df.columns):
    print("Missing 'start_time' and/or 'cpu_rate'.")
elif not pd.api.types.is_datetime64_any_dtype(df['start_time']):
    # resample() needs a datetime index; without this check it raises TypeError.
    print("'start_time' is not datetime-typed; run the Time Conversion cell first.")
else:
    ts = df[['start_time','cpu_rate']].dropna().set_index('start_time').sort_index()
    # '1h' instead of '1H': the upper-case hour alias is deprecated in pandas 2.2+.
    hourly = ts['cpu_rate'].resample('1h').mean()
    display(hourly.head())
    plt.figure()
    hourly.plot()
    plt.title("Mean CPU rate by hour")
    plt.xlabel("Time")
    plt.ylabel("Mean CPU rate")
    plt.show()



## Lag Features (Predictive Signals)
Downsample to 5-minute intervals, then create lagged features.


In [None]:

# Build lagged copies of the 5-minute mean cpu_rate series and correlate them.
if not ('start_time' in df.columns and 'cpu_rate' in df.columns):
    print("Cannot compute lag features without 'start_time' and 'cpu_rate'.")
elif not pd.api.types.is_datetime64_any_dtype(df['start_time']):
    # resample() needs a datetime index; without this check it raises TypeError.
    print("'start_time' is not datetime-typed; run the Time Conversion cell first.")
else:
    # '5min' instead of '5T': the 'T' minute alias is deprecated in pandas 2.2+.
    ts5 = df[['start_time','cpu_rate']].dropna().set_index('start_time').sort_index()['cpu_rate'].resample('5min').mean()
    # Fill empty 5-minute bins (in both directions) so the shifted series line up.
    ts5 = ts5.interpolate(limit_direction='both')
    lagged = pd.DataFrame({
        'cpu_t': ts5,
        'cpu_t_1': ts5.shift(1),
        'cpu_t_2': ts5.shift(2),
        'cpu_t_12': ts5.shift(12),  # ~1 hour lag at 5-min res
    }).dropna()
    if lagged.empty:
        # The 12-step lag needs at least 13 bins; say so instead of showing an all-NaN table.
        print(f"Only {len(ts5)} five-minute bins available; need at least 13 for the 12-step lag.")
    display(lagged.head())
    print("\nLag correlations:")
    display(lagged.corr())



## Grouping (Machine/Job/Task)


In [None]:

# Rank the first available grouping key by its mean cpu_rate and show the ten highest.
group_keys = [name for name in ('machine_id', 'job_id', 'task_index') if name in df.columns]
if not ('cpu_rate' in df.columns and group_keys):
    print("No grouping keys found or 'cpu_rate' missing.")
else:
    key = group_keys[0]
    mean_cpu_by_key = df[[key, 'cpu_rate']].dropna().groupby(key)['cpu_rate'].mean()
    g = mean_cpu_by_key.sort_values(ascending=False).head(10)
    print(f"Top 10 {key} by mean CPU_rate:")
    display(g)



## Save Small Parquet Sample (optional)
Saves a random subset for faster reloads. Will attempt to install `pyarrow` if needed.


In [None]:

# Write next to the downloaded sample so a changed ROOT_DIR is honoured.
out_parquet = os.path.join(DATA_DIR, "sample_eda.parquet")
# Fixed seed so the exported subset is the same on every run.
parquet_sample = df.sample(min(len(df), 200_000), random_state=17)

try:
    try:
        parquet_sample.to_parquet(out_parquet, index=False)
    except ImportError:
        # pandas raises ImportError when no parquet engine is installed; that
        # is the only failure a pip install can fix, so only then retry.
        print("Parquet save failed; attempting to install pyarrow...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "pyarrow"])
        parquet_sample.to_parquet(out_parquet, index=False)
    print(f"Saved sample to {out_parquet}")
except Exception as e:
    # The export is optional: report the actual reason and carry on.
    print("Parquet export failed; skipping.", e)



### Next Steps
- Concatenate additional `task_usage` parts for larger windows of time.
- Join with `task_events` / `machine_events` to add scheduling and capacity context.
- Engineer autoscaling features (rolling means/percentiles, sustained high-CPU windows).
- Prototype scaling policies: threshold-based vs. predictive vs. RL.
