In [4]:
import zipfile
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec


In [2]:
# Archive to unpack and the directory that receives its contents
zip_path = "data/timeseries_data.zip"
extract_to = "data/timeseries_data_dump"

# Read mode is ZipFile's default; the context manager closes the archive afterwards
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(path=extract_to)


### Summary

In [5]:

# Folder holding the extracted CSV files, and the file the summary is written to
csv_dir = "data/timeseries_data_dump/timeseries_data/"
output_csv = "data_summary.csv"

# Column layout of the summary file
columns = [
    "CSV File Name",
    "Shape",
    "Num Columns",
    "Num Numerical",
    "Num Textual",
    "NULLS %",
    "Zeros",
]

# Start the summary file afresh with only a header row; data rows are appended later
header_frame = pd.DataFrame(columns=columns)
header_frame.to_csv(output_csv, index=False)

def Nulls(df):
    """Return the share of missing values per column as formatted strings.

    Args:
        df: DataFrame to inspect.

    Returns:
        dict mapping each column label to its missing-value percentage,
        formatted with one decimal place and a trailing "%" (e.g. "25.0%").
    """
    null_pct = {}
    for name in df.columns:
        missing_fraction = df[name].isna().mean()
        null_pct[name] = f"{missing_fraction * 100:.1f}%"
    return null_pct

def Zeros(df):
    """Return the share of zero values per column as formatted strings.

    Args:
        df: DataFrame to inspect.

    Returns:
        dict mapping each column label to the percentage of entries equal
        to 0, formatted with one decimal place and a trailing "%". Columns
        that are not of a numeric dtype are always reported as "0.0%".
    """
    zero_pct = {}
    for name in df.columns:
        series = df[name]
        if pd.api.types.is_numeric_dtype(series):
            zero_pct[name] = f"{(series == 0).mean() * 100:.1f}%"
        else:
            # Zero counting is only applied to numeric dtypes
            zero_pct[name] = "0.0%"
    return zero_pct

# Summarise every CSV in csv_dir: for each file, print its statistics and append
# one row to output_csv (rows are written with header=False, so the header row
# must already be present in that file).
#
# os.listdir returns names in arbitrary order; iterating in sorted order keeps
# the printed output and the row order of the summary file identical across runs.
for file in sorted(os.listdir(csv_dir)):
    if not file.endswith(".csv"):
        continue

    file_path = os.path.join(csv_dir, file)
    df = pd.read_csv(file_path)

    num_columns = df.shape[1]
    num_rows = df.shape[0]

    # Positional aliases (col1, col2, ...) used in place of the real column
    # names in the per-column NULL / zero strings.
    generic_col_map = {col: f"col{i+1}" for i, col in enumerate(df.columns)}

    numerical_cols = df.select_dtypes(include=[np.number])
    textual_cols = df.select_dtypes(include=["object", "string", "category"])

    num_numerical = len(numerical_cols.columns)
    num_textual = len(textual_cols.columns)

    # Per-column percentages, already formatted as strings such as "12.5%"
    nulls_dict = Nulls(df)
    zeros_dict = Zeros(df)

    nulls_str = ", ".join([f"{generic_col_map[k]}: {v}" for k, v in nulls_dict.items()])
    zeros_str = ", ".join([f"{generic_col_map[k]}: {v}" for k, v in zeros_dict.items()])

    row = {
        "CSV File Name": file,
        "Shape": f"{num_rows} x {num_columns}",
        "Num Columns": num_columns,
        "Num Numerical": num_numerical,
        "Num Textual": num_textual,
        "NULLS %": nulls_str,
        "Zeros": zeros_str
    }

    print(f"Shape: {row['Shape']}")
    print(f"Num Columns: {row['Num Columns']}, Num Numerical: {row['Num Numerical']}, Num Textual: {row['Num Textual']}")
    print("NULLs %:", row["NULLS %"])
    print("Zeros %:", row["Zeros"])

    # Append this file's row to the summary CSV
    pd.DataFrame([row]).to_csv(output_csv, mode='a', header=False, index=False)


Shape: 177 x 4
Num Columns: 4, Num Numerical: 3, Num Textual: 1
NULLs %: col1: 0.0%, col2: 0.0%, col3: 0.0%, col4: 0.0%
Zeros %: col1: 0.0%, col2: 70.1%, col3: 0.0%, col4: 0.0%
Shape: 252 x 4
Num Columns: 4, Num Numerical: 3, Num Textual: 1
NULLs %: col1: 0.0%, col2: 0.0%, col3: 0.0%, col4: 0.0%
Zeros %: col1: 0.0%, col2: 72.2%, col3: 0.0%, col4: 0.0%
Shape: 228 x 4
Num Columns: 4, Num Numerical: 3, Num Textual: 1
NULLs %: col1: 0.0%, col2: 0.0%, col3: 0.0%, col4: 0.0%
Zeros %: col1: 0.0%, col2: 86.0%, col3: 0.0%, col4: 0.0%
Shape: 252 x 4
Num Columns: 4, Num Numerical: 3, Num Textual: 1
NULLs %: col1: 0.0%, col2: 0.0%, col3: 0.0%, col4: 0.0%
Zeros %: col1: 0.0%, col2: 73.0%, col3: 0.0%, col4: 0.0%
Shape: 252 x 4
Num Columns: 4, Num Numerical: 3, Num Textual: 1
NULLs %: col1: 0.0%, col2: 0.0%, col3: 0.0%, col4: 0.0%
Zeros %: col1: 0.0%, col2: 92.1%, col3: 0.8%, col4: 0.0%
Shape: 252 x 4
Num Columns: 4, Num Numerical: 3, Num Textual: 1
NULLs %: col1: 0.0%, col2: 0.0%, col3: 0.0%, col4: 

In [7]:
# Tallies of files in which at least one column exceeds the 20% mark
total_files = 0
files_with_high_nulls = 0
files_with_high_zeros = 0
files_with_both = 0

for file in os.listdir(csv_dir):
    if not file.endswith(".csv"):
        continue

    total_files += 1
    file_path = os.path.join(csv_dir, file)
    df = pd.read_csv(file_path)

    # Per-column percentages: missing values over all columns,
    # zeros over the numeric columns only
    nulls = df.isna().mean() * 100
    zeros = (df.select_dtypes(include=np.number) == 0).mean() * 100

    # A file is flagged as soon as any single column is above 20%
    has_high_null = bool((nulls > 20).any())
    has_high_zero = bool((zeros > 20).any())

    # Booleans add as 0 / 1, so each tally grows only when its flag is set
    files_with_high_nulls += has_high_null
    files_with_high_zeros += has_high_zero
    files_with_both += has_high_null and has_high_zero

print(f"Files with >20% NULLs in any column     : {files_with_high_nulls}")
print(f"Files with >20% ZEROs in any numeric col: {files_with_high_zeros}")
print(f"Files with BOTH >20% NULLs and ZEROs    : {files_with_both}")


Files with >20% NULLs in any column     : 1196
Files with >20% ZEROs in any numeric col: 3790
Files with BOTH >20% NULLs and ZEROs    : 1081


In [8]:
# Number of .csv files visited by the threshold-counting loop (total_files is
# incremented once per matching file there)
print(total_files)

4110


In [None]:
import pandas as pd
import os
import numpy as np

csv_dir = "data/timeseries_data_dump/timeseries_data/"

# One record per CSV file: which columns exceed 20% NULLs / 20% zeros
summary_stats = []

# os.listdir returns names in arbitrary order; sorting keeps the report below
# (and the row order of stats_df) the same on every run.
for file in sorted(os.listdir(csv_dir)):
    if not file.endswith(".csv"):
        continue

    file_path = os.path.join(csv_dir, file)
    df = pd.read_csv(file_path)

    # Per-column percentages: missing values over all columns,
    # zeros over the numeric columns only
    nulls_pct = df.isnull().mean() * 100
    zeros_pct = df.select_dtypes(include=np.number).eq(0).mean() * 100

    high_null_cols = nulls_pct[nulls_pct > 20].index.tolist()
    high_zero_cols = zeros_pct[zeros_pct > 20].index.tolist()

    has_high_nulls = len(high_null_cols) > 0
    has_high_zeros = len(high_zero_cols) > 0

    summary_stats.append({
        "File": file,
        "Has >20% NULLs?": has_high_nulls,
        "NULL Columns >20%": ", ".join(high_null_cols) if has_high_nulls else "None",
        "Has >20% ZEROs?": has_high_zeros,
        "ZERO Columns >20%": ", ".join(high_zero_cols) if has_high_zeros else "None",
        "Has Both?": has_high_nulls and has_high_zeros
    })

# Tabular view of the same records
stats_df = pd.DataFrame(summary_stats)

# Print the per-file report straight from the records; this yields the same
# values as iterating stats_df row by row, without building a Series per row.
for row in summary_stats:
    print(f"{row['File']}")
    if row["Has >20% NULLs?"]:
        print(f"NULL >20% columns : {row['NULL Columns >20%']}")
    if row["Has >20% ZEROs?"]:
        print(f"ZEROs >20% columns: {row['ZERO Columns >20%']}")
    if row["Has Both?"]:
        print("File has BOTH high NULLs and ZEROs")
    print("-" * 60)


### Plots

In [None]:
folder_path = "data/timeseries_data_dump/timeseries_data/"

# Plots are written to a "plots" sub-folder inside the data folder
output_dir = os.path.join(folder_path, "plots")
os.makedirs(output_dir, exist_ok=True)

# Only the first 10 CSV files are plotted. os.listdir returns names in arbitrary
# order, so sort first to make the selection the same on every run.
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))[:10]

for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)

    # Use the first non-numeric column that parses cleanly as datetimes
    date_col = None
    for col in df.columns:
        # pd.to_datetime accepts plain numbers (read as epoch offsets), so a
        # numeric feature column would otherwise be mistaken for the date column.
        if pd.api.types.is_numeric_dtype(df[col]):
            continue
        try:
            parsed = pd.to_datetime(df[col], errors='raise')
        except (ValueError, TypeError, OverflowError):
            # Not a date column; a bare except here would also swallow
            # KeyboardInterrupt / SystemExit.
            continue
        df[col] = parsed
        date_col = col
        break

    if date_col is None:
        print(f"{file} - No valid date column.")
        continue

    df = df.rename(columns={date_col: 'DATE'})
    df = df.sort_values(by='DATE')

    # Keep numeric columns that contain at least one non-missing value
    numeric_cols = df.select_dtypes(include=['number']).columns
    valid_cols = [col for col in numeric_cols if df[col].notna().any()]

    if not valid_cols:
        print(f"{file} - No valid numeric features.")
        continue

    # One consolidated panel on top, then one panel per feature
    n_features = len(valid_cols)
    total_rows = n_features + 1

    fig = plt.figure(figsize=(12, 3 * total_rows))
    gs = gridspec.GridSpec(total_rows, 1, hspace=0.5)

    ax_main = fig.add_subplot(gs[0])
    for col in valid_cols:
        ax_main.plot(df['DATE'], df[col], label=col)
    ax_main.set_title(f"{file} - Consolidated Plot")
    ax_main.set_xlabel("Date")
    ax_main.set_ylabel("Value")
    ax_main.legend()
    ax_main.grid(True)

    for i, col in enumerate(valid_cols):
        ax = fig.add_subplot(gs[i + 1])
        ax.plot(df['DATE'], df[col], color='tab:blue')
        ax.set_title(f"{col} vs Date")
        ax.set_xlabel("Date")
        ax.set_ylabel(col)
        ax.grid(True)

    plot_filename = os.path.join(output_dir, f"{os.path.splitext(file)[0]}_combined.png")
    # Act on this figure explicitly rather than on pyplot's "current figure",
    # and close it so figures do not accumulate in memory across iterations.
    fig.tight_layout()
    fig.savefig(plot_filename)
    plt.close(fig)

    print(f"Saved combined plot: {plot_filename}")

### Unwindowed dataset conversion