In [38]:
import os
import pandas as pd
from shutil import move
from datetime import datetime

# Folder layout: raw per-ticker CSVs sit directly in STOCK_BATCHES_FOLDER;
# files flagged by the classifier are moved into one of its two sub-folders.
STOCK_BATCHES_FOLDER = "stock_batches_processing"
MISALIGNED_FOLDER = os.path.join(STOCK_BATCHES_FOLDER, "misaligned")
CONTIGUOUS_INCOMPLETE_FOLDER = os.path.join(STOCK_BATCHES_FOLDER, "contiguous_incomplete")

# Make sure both destination folders exist (this also creates the parent).
for _folder in (MISALIGNED_FOLDER, CONTIGUOUS_INCOMPLETE_FOLDER):
    os.makedirs(_folder, exist_ok=True)

def classify_stock_data(batches_folder=None, misaligned_folder=None,
                        contiguous_incomplete_folder=None):
    """Sort the stock CSVs in a folder into problem buckets by data quality.

    Every ``*.csv`` file directly inside ``batches_folder`` is read with
    pandas and checked in this order:

    1. Unreadable file          -> reported, left in place.
    2. No rows                  -> moved to ``misaligned_folder``.
    3. More than 90% of the non-date cells missing (or no non-date
       columns at all)          -> moved to ``misaligned_folder``.
    4. No date column           -> reported, left in place.
    5. No parseable date values -> moved to ``misaligned_folder``.
    6. Row with the most recent date has no data
                                -> moved to ``misaligned_folder``.
    7. No non-null data on or before 2004-12-31
                                -> moved to ``contiguous_incomplete_folder``.

    Files that pass every check stay where they are. One line is printed for
    each file that is flagged.

    Args:
        batches_folder: Folder to scan. Defaults to ``STOCK_BATCHES_FOLDER``.
        misaligned_folder: Destination for unusable files. Defaults to
            ``MISALIGNED_FOLDER`` when ``batches_folder`` is also omitted,
            otherwise to ``<batches_folder>/misaligned``.
        contiguous_incomplete_folder: Destination for files without
            pre-2005 data. Defaults to ``CONTIGUOUS_INCOMPLETE_FOLDER`` when
            ``batches_folder`` is also omitted, otherwise to
            ``<batches_folder>/contiguous_incomplete``.

    Returns:
        None. The effect is the file moves and the printed report.
    """
    # Resolve folders. With no arguments the notebook-level constants are
    # used, so the existing zero-argument call behaves as before.
    if batches_folder is None:
        batches_folder = STOCK_BATCHES_FOLDER
        fallback_misaligned = MISALIGNED_FOLDER
        fallback_incomplete = CONTIGUOUS_INCOMPLETE_FOLDER
    else:
        fallback_misaligned = os.path.join(batches_folder, "misaligned")
        fallback_incomplete = os.path.join(batches_folder, "contiguous_incomplete")
    if misaligned_folder is None:
        misaligned_folder = fallback_misaligned
    if contiguous_incomplete_folder is None:
        contiguous_incomplete_folder = fallback_incomplete

    # shutil.move fails if the destination directory is missing, so do not
    # rely on an earlier cell having created it.
    os.makedirs(misaligned_folder, exist_ok=True)
    os.makedirs(contiguous_incomplete_folder, exist_ok=True)

    # Loop-invariant thresholds.
    max_missing_ratio = 0.9
    cutoff = datetime(2004, 12, 31)

    for filename in os.listdir(batches_folder):
        file_path = os.path.join(batches_folder, filename)
        if not filename.endswith(".csv") or os.path.isdir(file_path):
            continue

        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            print(f"[ERROR] {filename} => Failed to read: {e}")
            continue

        if df.empty:
            move(file_path, os.path.join(misaligned_folder, filename))
            print(f"[ERROR] {filename} => Empty file")
            continue

        # Sparsity is measured on everything except the date column(s).
        data_cols = [col for col in df.columns if col.lower() != "date"]
        total_cells = df[data_cols].size
        missing_cells = df[data_cols].isna().sum().sum()
        if total_cells == 0 or (missing_cells / total_cells) > max_missing_ratio:
            move(file_path, os.path.join(misaligned_folder, filename))
            print(f"[WARNING] {filename} => >90% data missing")
            continue

        # Locate the date column with the same case-insensitive rule used to
        # build data_cols, preferring an exact "Date" header when present.
        if "Date" in df.columns:
            date_col = "Date"
        else:
            date_col = next(
                (col for col in df.columns if col.lower() == "date"), None
            )
        if date_col is None:
            print(f"[WARNING] {filename} => No 'Date' column found")
            continue

        try:
            # NOTE(review): if the date strings carry UTC offsets the parsed
            # column may be tz-aware, and comparing it with the naive cutoff
            # would raise and land in the except below - confirm input format.
            parsed_dates = pd.to_datetime(df[date_col], errors="coerce")
            dated = df.assign(**{date_col: parsed_dates}).dropna(subset=[date_col])

            # Without at least one valid date there is no "latest row" to
            # inspect; treat the file as unusable instead of letting .iloc[0]
            # raise an IndexError that would be reported as a generic failure.
            if dated.empty:
                move(file_path, os.path.join(misaligned_folder, filename))
                print(f"[WARNING] {filename} => No parseable dates")
                continue

            # Sanity check: the most recent date must carry some data.
            last_row = dated.sort_values(date_col, ascending=False).iloc[0]
            if last_row[data_cols].isna().all():
                move(file_path, os.path.join(misaligned_folder, filename))
                print(f"[WARNING] {filename} => Last row has no data")
                continue

            # History check: need at least one non-null value on or before
            # the cutoff date.
            pre2005_data = dated[dated[date_col] <= cutoff]
            if not pre2005_data[data_cols].notna().any().any():
                move(file_path, os.path.join(contiguous_incomplete_folder, filename))
                print(f"[WARNING] {filename} => No valid data ≤ 2004")
        except Exception as e:
            # Best effort: report and carry on with the next file.
            print(f"[ERROR] {filename} => Date handling failed: {e}")

In [40]:
# Run the classifier over every CSV in STOCK_BATCHES_FOLDER.
# This moves flagged files on disk into the misaligned / contiguous_incomplete
# sub-folders and prints one line per flagged file, so re-running the cell
# only sees the files that were left behind by the previous run.
classify_stock_data()

[ERROR] LINE.csv => Empty file
[ERROR] TYRA.csv => Empty file
[ERROR] WTO.csv => Empty file
[ERROR] HLNE.csv => Empty file
[ERROR] NVCT.csv => Empty file
[ERROR] SBGI.csv => Empty file
[ERROR] UFCS.csv => Empty file
[ERROR] SNRE.csv => Empty file
[ERROR] YXT.csv => Empty file
[ERROR] SBFM.csv => Empty file
[ERROR] RADX.csv => Empty file
[ERROR] ISSC.csv => Empty file
[ERROR] KNDI.csv => Empty file
[ERROR] TRNS.csv => Empty file
[ERROR] SKBL.csv => Empty file
[ERROR] UAL.csv => Empty file
[ERROR] TROW.csv => Empty file
[ERROR] RMSGW.csv => Empty file
[ERROR] ISRG.csv => Empty file
[ERROR] ISPO.csv => Empty file
[ERROR] MVST.csv => Empty file
[ERROR] SDHIU.csv => Empty file
[ERROR] TELA.csv => Empty file
[ERROR] LFMDP.csv => Empty file
[ERROR] TLRY.csv => Empty file
[ERROR] LECO.csv => Empty file
[ERROR] VABK.csv => Empty file
[ERROR] VCYT.csv => Empty file
[ERROR] IMNM.csv => Empty file
[ERROR] QETAU.csv => Empty file
[ERROR] SIMA.csv => Empty file
[ERROR] ORRF.csv => Empty file
[ERROR]