In [13]:
!pip -q install pandas matplotlib pyarrow fastparquet

import os, io, json, re
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Let wide frames render up to 100 columns, then report the pandas version in use.
pd.options.display.max_columns = 100
print("Pandas:", pd.__version__)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hPandas: 2.2.2


In [14]:
from google.colab import files
# Prompt for the dataset and open the Colab upload dialog.
print("Select your dataset file (e.g., ACNdata.json or ACNdata.csv)")
uploaded = files.upload()  # pick your ACNdata file in the dialog

# Use the first uploaded filename as the dataset path; None when nothing was uploaded.
DS_PATH = next(iter(uploaded or ()), None)
print("DS_PATH =", DS_PATH)


Select your dataset file (e.g., ACNdata.json or ACNdata.csv)


Saving ACNdata.json to ACNdata (1).json
DS_PATH = ACNdata (1).json


In [15]:
# Fail early with a readable message: Path(None) would otherwise raise an
# opaque TypeError when the upload dialog was cancelled.
if DS_PATH is None:
    raise RuntimeError("`DS_PATH` is not set. Re-run the upload cell and select a dataset file.")

ext = Path(DS_PATH).suffix.lower()
print("Detected extension:", ext)

# raw_text stays None for non-text formats; later cells check `ext` before using it.
raw_text = None
if ext in [".json", ".ndjson", ".txt"]:
    # "utf-8-sig" decodes plain UTF-8 as before but also strips a leading BOM,
    # which json.loads would otherwise reject. errors="ignore" drops undecodable bytes.
    with open(DS_PATH, "r", encoding="utf-8-sig", errors="ignore") as f:
        raw_text = f.read()
    # Flatten newlines so the preview prints on a single line.
    print("First 300 chars:\n", raw_text[:300].replace("\n", " "))
else:
    print("Non-text format (likely CSV/Excel) – skipping raw preview.")


Detected extension: .json
First 300 chars:
 {   "_meta": {     "end": "Sat, 01 Apr 2023 23:59:00 GMT",     "min_kWh": 1,     "site": "caltech",     "start": "Fri, 01 Apr 2022 00:00:00 GMT"   },   "_items": [] }


In [16]:
def try_load_json(text):
    """Parse ``text`` as JSON without raising.

    Returns a ``(parsed, error)`` pair: ``(object, None)`` on success, or
    ``(None, message)`` where ``message`` is the string form of whatever
    exception the parse raised.
    """
    parsed, error = None, None
    try:
        parsed = json.loads(text)
    except Exception as exc:
        error = str(exc)
    return parsed, error

def attempt_safe_repair(original_text, out_path):
    """Rebuild a JSON export that was truncated right after ``"_items": [``.

    Minimal repair for this pattern:
      { "_meta": {...}, "_items": [
    It preserves _meta and sets _items to [].

    Parameters:
        original_text: raw (possibly truncated) file contents.
        out_path: where the repaired JSON document is written.

    Returns:
        ``(repaired_obj, out_path, note)`` on success, or
        ``(None, None, reason)`` when no ``_meta`` object can be recovered.
        Nothing is written to disk in the failure case.
    """
    # Locate the opening brace of the _meta value; the lookahead leaves the
    # match ending exactly on that brace.
    key_match = re.search(r'"_meta"\s*:\s*(?=\{)', original_text)
    if not key_match:
        return None, None, "No _meta block found; cannot safely repair."

    # raw_decode parses one complete JSON value from the given offset and
    # ignores trailing text, so nested objects and braces inside strings are
    # handled correctly (a "first closing brace" regex would cut them short).
    try:
        meta_obj, _ = json.JSONDecoder().raw_decode(original_text, key_match.end())
    except json.JSONDecodeError as exc:
        return None, None, f"Could not isolate _meta object: {exc}"

    repaired = {"_meta": meta_obj, "_items": []}

    with open(out_path, "w", encoding="utf-8") as g:
        json.dump(repaired, g, indent=2)
    return repaired, out_path, "Repaired by preserving _meta and setting _items to []."

# Validate the raw text as JSON; if it fails with the known `_items`
# truncation pattern, write a repaired copy and switch DS_PATH to it.
loaded_obj = None
repaired_note = None

if ext not in [".json", ".ndjson", ".txt"]:
    print("Skipping JSON validation; non-JSON format detected.")
else:
    loaded_obj, err = try_load_json(raw_text)
    if not err:
        print("JSON parsed successfully without repair.")
    else:
        print("JSON parse error ->", err)
        # Truncation signature: the text stops right after `"_items": [`
        ends_at_items = bool(raw_text) and re.search(r'"_items"\s*:\s*\[\s*$', raw_text.strip())
        if not ends_at_items:
            print("No safe auto-repair pattern recognized. Please re-export the JSON cleanly.")
        else:
            print("Detected truncation at _items... attempting safe repair...")
            source_path = Path(DS_PATH)
            REPAIRED_PATH = str(source_path.with_name(source_path.stem + "_repaired.json"))
            loaded_obj, new_path, repaired_note = attempt_safe_repair(raw_text, REPAIRED_PATH)
            if loaded_obj is None:
                print("Repair failed:", repaired_note)
            else:
                DS_PATH = new_path  # use repaired file going forward
                print("Repair successful. New file ->", DS_PATH)
                print("Note:", repaired_note)


JSON parsed successfully without repair.


In [17]:
def load_to_dataframe(path):
    """Load a CSV or JSON-family file into a DataFrame.

    Supported inputs, chosen by file extension:
      * ``.csv`` - read with ``pd.read_csv``.
      * ``.json`` / ``.ndjson`` / ``.txt`` - parsed as one JSON document; if
        that fails, parsed as newline-delimited JSON (one value per
        non-blank line).

    JSON shapes:
      * dict with a list under ``"_items"`` -> one row per item
        (a non-empty ``"_meta"`` entry is printed for reference);
      * any other dict -> flattened with ``pd.json_normalize``;
      * list -> passed straight to ``pd.DataFrame``.

    Raises:
        ValueError: unsupported extension or unsupported top-level JSON type.
        json.JSONDecodeError: the text is neither a JSON document nor
            newline-delimited JSON (the whole-document error is reported).
    """
    ext = Path(path).suffix.lower()
    if ext == ".csv":
        return pd.read_csv(path)

    if ext not in (".json", ".ndjson", ".txt"):
        raise ValueError(f"Unsupported extension: {ext}")

    # "utf-8-sig" reads plain UTF-8 unchanged and strips a leading BOM, which
    # json.loads would otherwise reject.
    with open(path, "r", encoding="utf-8-sig", errors="ignore") as f:
        txt = f.read()

    try:
        obj = json.loads(txt)
    except json.JSONDecodeError as whole_doc_error:
        # Newline-delimited JSON is not a single document; retry line by line.
        record_lines = [line for line in txt.splitlines() if line.strip()]
        try:
            obj = [json.loads(line) for line in record_lines]
        except json.JSONDecodeError:
            # Not NDJSON either: surface the original, more informative error.
            raise whole_doc_error from None
        if not obj:
            raise  # empty file: keep the original parse error

    if isinstance(obj, dict):
        # Capture meta if present
        meta = obj.get("_meta", {})
        if meta:
            print("META:", meta)

        items = obj.get("_items")
        if isinstance(items, list):
            return pd.DataFrame(items)
        # Fallback: flatten dict
        return pd.json_normalize(obj)

    if isinstance(obj, list):
        return pd.DataFrame(obj)

    raise ValueError("Unsupported JSON structure.")

# Load the dataset, report its dimensions, and preview the first ten rows.
df = load_to_dataframe(path=DS_PATH)
row_count, col_count = df.shape
print("DataFrame shape:", (row_count, col_count))
df.iloc[:10]


META: {'end': 'Sat, 01 Apr 2023 23:59:00 GMT', 'min_kWh': 1, 'site': 'caltech', 'start': 'Fri, 01 Apr 2022 00:00:00 GMT'}
DataFrame shape: (0, 0)


In [20]:
# ===== Step 7 (robust) — Inspect schema & basic info =====
from IPython.display import display
import numpy as np

# 0) Guard: nothing below is meaningful until the loading step has produced `df`
try:
    _ = df
except NameError:
    raise RuntimeError("`df` is not defined. Run Step 6 to load the dataset first.")

# Dimensions and column names
print("Shape:", df.shape)
print("\nColumns:", list(df.columns))

# 1) Column dtypes, then null counts ordered from most to least missing
#    (both work on an empty frame)
print("\nDtypes:")
print(df.dtypes)

null_counts = df.isnull().sum()
print("\nNulls per column:")
print(null_counts.sort_values(ascending=False))

# 2) Summary statistics, split by numeric vs. object columns; skipped when empty
if not df.empty:
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    obj_cols = df.select_dtypes(include=["object"]).columns.tolist()

    if not num_cols:
        print("\n(No numeric columns found to describe.)")
    else:
        print("\nDescriptive stats (numeric):")
        display(df[num_cols].describe())

    if not obj_cols:
        print("\n(No object columns found to describe.)")
    else:
        print("\nDescriptive stats (object):")
        display(df[obj_cols].describe())
else:
    print("\n⚠️ DataFrame is EMPTY — cannot compute describe().")
    print("Go back to Step 6 and confirm DS_PATH and the file contents.")


Shape: (0, 0)

Columns: []

Dtypes:
Series([], dtype: object)

Nulls per column:
Series([], dtype: float64)

⚠️ DataFrame is EMPTY — cannot compute describe().
Go back to Step 6 and confirm DS_PATH and the file contents.


In [22]:
# 8.1 Duplicates
# NOTE(review): duplicated() hashes cell values; if any column holds lists or
# dicts this is expected to raise TypeError — confirm against the real schema.
dup_count = df.duplicated().sum()
print(f"Duplicate rows: {dup_count}")

# 8.2 Negative values in numeric columns
num_cols = df.select_dtypes(include=[np.number]).columns
neg_summary = {col: int((df[col] < 0).sum()) for col in num_cols}
print("Negative values per numeric column:", neg_summary)

# 8.3 Parse likely datetime columns.
# The name match is only a heuristic (e.g. "timezone" contains "time"), and
# errors="coerce" turns every unparseable value into NaT. A column is therefore
# replaced only when most of its non-null values really parse as datetimes;
# otherwise the original data is kept and the reason is reported.
DATE_KEYWORDS = ("time", "date", "start", "end", "timestamp")
MIN_DATETIME_PARSE_RATE = 0.5
date_like_cols = [c for c in df.columns if any(k in str(c).lower() for k in DATE_KEYWORDS)]
parsed_date_cols = []
for c in date_like_cols:
    if pd.api.types.is_datetime64_any_dtype(df[c]):
        parsed_date_cols.append(c)  # already converted (e.g. the cell was re-run)
        continue
    if not (pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_string_dtype(df[c])):
        # Numbers would be read as epoch offsets; do not guess.
        print(f"Skipping {c!r}: dtype {df[c].dtype} is not text.")
        continue
    try:
        candidate = pd.to_datetime(df[c], errors="coerce")
    except Exception as exc:  # best-effort step: report instead of hiding the failure
        print(f"Skipping {c!r}: datetime parsing failed ({exc}).")
        continue
    n_present = int(df[c].notna().sum())
    n_parsed = int(candidate.notna().sum())
    if n_present and n_parsed / n_present >= MIN_DATETIME_PARSE_RATE:
        df[c] = candidate
        parsed_date_cols.append(c)
    else:
        print(f"Skipping {c!r}: only {n_parsed}/{n_present} values look like datetimes.")
print("Datetime-parsed columns:", parsed_date_cols)

# 8.4 Example domain rule: energy must be >= 0 and not absurdly large
MAX_PLAUSIBLE_KWH = 1e6
energy_cols = [c for c in df.columns if str(c).lower() in ("kwh", "energy_kwh", "energy")]
for c in energy_cols:
    # Coerce first so a text-typed energy column cannot crash the comparison;
    # non-numeric entries become NaN and are not counted as out of range.
    energy_values = pd.to_numeric(df[c], errors="coerce")
    invalid = df[(energy_values < 0) | (energy_values > MAX_PLAUSIBLE_KWH)]
    print(f"Out-of-range rows in {c}: {len(invalid)}")


Duplicate rows: 0
Negative values per numeric column: {}
Datetime-parsed columns: []


In [23]:
# Histograms for the leading numeric columns, one figure per column
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print("Numeric columns:", num_cols)

for col in num_cols[:8]:  # capped at 8 columns to keep the cell quick
    fig, ax = plt.subplots()
    df[col].plot(kind="hist", bins=50, title=f"Distribution of {col}", ax=ax)
    ax.set_xlabel(col)
    ax.set_ylabel("Count")
    plt.show()


Numeric columns: []


In [24]:
# Histograms for first few numeric columns
# NOTE(review): this cell draws the same plots as the histogram cell directly
# above it; one of the two can probably be removed.
numeric_frame = df.select_dtypes(include=[np.number])
num_cols = numeric_frame.columns.tolist()
print("Numeric columns:", num_cols)

for col in num_cols[:8]:  # limit to first 8 to keep it quick
    plt.figure()
    numeric_frame[col].plot.hist(bins=50, title=f"Distribution of {col}")
    plt.xlabel(col)
    plt.ylabel("Count")
    plt.show()


Numeric columns: []


In [25]:
# Build the cleaned frame from `df` without modifying it:
# exact duplicate rows are removed and the index is renumbered from 0.
df_clean = df.drop_duplicates().reset_index(drop=True)

# Numeric gaps -> column median (swap in mean or a fixed value if preferred)
numeric_with_gaps = [
    name
    for name in df_clean.select_dtypes(include=[np.number]).columns
    if df_clean[name].isna().any()
]
for col in numeric_with_gaps:
    df_clean[col] = df_clean[col].fillna(df_clean[col].median())

# Text gaps -> the placeholder 'Unknown' (or drop the rows if that fits better)
text_with_gaps = [
    name
    for name in df_clean.select_dtypes(include=[object]).columns
    if df_clean[name].isna().any()
]
for col in text_with_gaps:
    df_clean[col] = df_clean[col].fillna("Unknown")

print("Cleaned shape:", df_clean.shape)
df_clean.head()


Cleaned shape: (0, 0)


In [26]:
# Derive both output names from the dataset path with its extension removed.
out_base = Path(DS_PATH).with_suffix("")
csv_path = f"{out_base}_clean.csv"
parquet_path = f"{out_base}_clean.parquet"

# Write the cleaned frame in both formats, without the index column.
df_clean.to_csv(csv_path, index=False)
df_clean.to_parquet(parquet_path, index=False)

print("Saved:")
for saved_path in (csv_path, parquet_path):
    print(" -", saved_path)


Saved:
 - ACNdata (1)_clean.csv
 - ACNdata (1)_clean.parquet
