In [None]:
import pandas as pd
import os

# Maps each standard column name (key) to the CSV header spellings that are
# accepted for it (value). Aliases are tried in list order.
COLUMN_MAP = {
    "date": ["date", "Date", "timestamp", "Datetime"],
    "open": ["open", "Open", "OPEN", "Open Price"],
    "high": ["high", "High", "HIGH"],
    "low": ["low", "Low", "LOW"],
    "close": ["Close", "close", "CLOSE","Close/Last","CLOSE/LAST","close/last"],
    "adj_close": ["Adj Close", "adj_close", "adj close"],
    "volume": ["volume", "Volume", "VOL", "vol"]
}


def _normalize_header(name):
    """Return *name* as a stripped, case-folded string for loose comparison."""
    return str(name).strip().casefold()


def standardize_columns(df):
    """Rename CSV columns to the standard names defined in COLUMN_MAP.

    For every standard name, the first alias that appears verbatim among the
    columns is renamed. If no alias matches verbatim, the first column whose
    header equals an alias after stripping surrounding whitespace and
    ignoring letter case is renamed instead (e.g. " Date " or "VOLUME").

    A standard name that is already present as a column is left alone, so a
    second column is never renamed onto it (which would create duplicate
    column labels).

    Args:
        df: DataFrame whose column labels should be standardized.

    Returns:
        A renamed DataFrame (the result of ``df.rename``); ``df`` itself is
        not modified. Columns that match no alias keep their original label.
    """
    rename_map = {}
    for std_col, aliases in COLUMN_MAP.items():
        if std_col in df.columns:
            # Already standardized; renaming an alias onto it would
            # produce two columns with the same label.
            continue

        # Verbatim alias match, in alias priority order.
        match = next((alias for alias in aliases if alias in df.columns), None)

        if match is None:
            # Fall back to a whitespace- and case-insensitive comparison.
            wanted = {_normalize_header(alias) for alias in aliases}
            match = next(
                (
                    col
                    for col in df.columns
                    if col not in rename_map and _normalize_header(col) in wanted
                ),
                None,
            )

        if match is not None:
            rename_map[match] = std_col

    return df.rename(columns=rename_map)

def load_and_clean_csv(file_path):
    """Load a single CSV file and return it as a cleaned DataFrame.

    Processing steps:
      1. Read the file with ``pd.read_csv`` and rename its headers with
         ``standardize_columns``.
      2. Add any of the required columns (date, open, high, low, close,
         volume) that are absent, filled with missing values.
      3. Strip "$" and "," from the price/volume columns and convert them
         to numbers; values that cannot be converted become NaN.
      4. Parse the date column; values that cannot be parsed become NaT.
      5. Drop rows whose date or close is missing, including values that
         failed conversion in steps 3-4.
      6. Sort by date and keep the first row for each date.

    Args:
        file_path: Path of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The cleaned DataFrame, sorted by date with one row per date. Row
        index labels from the original file are kept (not reset).
    """
    df = pd.read_csv(file_path)
    df = standardize_columns(df)

    required = ["date", "open", "high", "low", "close", "volume"]
    for col in required:
        if col not in df.columns:
            df[col] = None  # create missing column with NaN values

    # Convert price/volume columns to numbers.
    numeric_cols = ["open", "high", "low", "close", "adj_close", "volume"]
    for col in numeric_cols:
        if col in df.columns:
            # Raw string: "\$" is not a valid escape in a normal literal.
            df[col] = df[col].replace(r"[\$,]", "", regex=True)  # remove $ and ,
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # Convert the date column to datetimes.
    df["date"] = pd.to_datetime(df["date"], errors="coerce")

    # Drop rows lacking a critical value. This runs after the conversions so
    # that unparseable dates *and* unparseable close prices are removed.
    df = df.dropna(subset=["date", "close"])

    # Mergesort is stable, so among rows sharing a date the one that came
    # first in the file is the one drop_duplicates keeps.
    df = df.sort_values(by="date", kind="mergesort").drop_duplicates(subset=["date"])

    return df


if __name__ == "__main__":
    # Input CSV to clean; point this at your own file.
    file_path = "ms-hd.csv"
    cleaned_df = load_and_clean_csv(file_path)

    # Show the first ten cleaned rows.
    print("CLEANED DATASET PREVIEW:")
    print(cleaned_df.head(10))

    # Cleaned files go into a "clean" directory, created on demand.
    output_folder = "clean"
    os.makedirs(output_folder, exist_ok=True)

    # Output file is the input's base name with a "clean_" prefix.
    original_name = os.path.basename(file_path)
    clean_name = f"clean_{original_name}"
    output_path = os.path.join(output_folder, clean_name)

    # An existing output file is never overwritten.
    if os.path.exists(output_path):
        print(f"File already exists: {output_path}")
    else:
        cleaned_df.to_csv(output_path, index=False)
        print(f"Saved cleaned file to: {output_path}")



In [None]:
# Notebook cell: evaluating the bare name renders the cleaned DataFrame
# (assigned in the __main__ block above) as the cell's output.
cleaned_df