In [8]:
import pandas as pd
import os

# Maps each standard column name (key) to the header spellings it may appear
# under in a source CSV (value). Aliases are tried in list order, so earlier
# entries take precedence when several are present in the same file.
COLUMN_MAP = {
    "Date": ["date", "Date", "timestamp", "Datetime"],
    "Open": ["open", "Open", "OPEN", "Open Price"],
    "High": ["high", "High", "HIGH"],
    "Low": ["low", "Low", "LOW"],
    "Close": ["Close", "close", "CLOSE","Close/Last","CLOSE/LAST","close/last"],
    "Adj close": ["Adj Close", "adj_close", "adj close"],
    "Volume": ["volume", "Volume", "VOL", "vol"]
}

def standardize_columns(df):
    """Rename CSV columns to the standard names defined in ``COLUMN_MAP``.

    For each standard name, the first alias found among ``df.columns`` is
    renamed to it. Exact matches are tried first (in alias order); if none
    is found, a case-insensitive, whitespace-stripped comparison is used so
    headers such as ``" VOLUME "`` are still recognised.

    If the standard name is already a column of ``df``, no alias is renamed
    onto it: doing so would produce two columns with the same label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels should be standardized. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A renamed copy of ``df``; columns with no matching alias are kept as is.
    """
    # Normalized (stripped, lower-cased) label -> first original label with
    # that normalized form. Non-string labels cannot match an alias.
    by_normalized = {}
    for label in df.columns:
        if isinstance(label, str):
            by_normalized.setdefault(label.strip().lower(), label)

    new_cols = {}
    for std_col, aliases in COLUMN_MAP.items():
        if std_col in df.columns:
            # Already standardized; renaming another alias would duplicate it.
            continue

        # Exact match first, preserving the alias precedence order.
        source = next((alias for alias in aliases if alias in df.columns), None)
        if source is None:
            # Fallback: tolerate case differences and surrounding whitespace.
            source = next(
                (
                    by_normalized[alias.lower()]
                    for alias in aliases
                    if alias.lower() in by_normalized
                ),
                None,
            )
        if source is not None:
            new_cols[source] = std_col

    return df.rename(columns=new_cols)

def load_and_clean_csv(file_path):
    """Load a single price-history CSV and return a cleaned DataFrame.

    Processing steps, in order:

    1. Read the file with ``pandas.read_csv`` and standardize its column
       names via ``standardize_columns``.
    2. Add any missing required column (Date, Open, High, Low, Close,
       Volume) filled with missing values.
    3. Drop rows with a missing Date or Close.
    4. Strip ``$`` and ``,`` from the price/volume columns and coerce them
       to numbers; unparseable values become NaN.
    5. Parse Date; unparseable values become NaT.
    6. Drop rows whose Date or Close could not be parsed.
    7. Sort by Date and keep the first row (in file order) for each date.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, sorted by Date. The original row index labels
        are retained (the index is not reset).
    """
    df = pd.read_csv(file_path)
    df = standardize_columns(df)

    required = ["Date", "Open", "High", "Low", "Close", "Volume"]
    for col in required:
        if col not in df.columns:
            df[col] = None  # create missing column filled with missing values

    # Drop rows where a critical column is missing in the raw data.
    df = df.dropna(subset=["Date", "Close"])

    # Normalize price/volume columns to numeric dtype.
    numeric_cols = ["Open", "High", "Low", "Close", "Adj close", "Volume"]
    for col in numeric_cols:
        if col in df.columns:
            # Raw string: "\$" in a normal string literal is an invalid escape
            # sequence and triggers a warning at compile time.
            df[col] = df[col].replace(r"[\$,]", "", regex=True)  # remove $ and ,
            df[col] = pd.to_numeric(df[col], errors="coerce")  # convert to number

    # Normalize dates.
    df["Date"] = pd.to_datetime(df["Date"], errors="coerce")

    # Coercion can introduce new missing values, so re-check both critical
    # columns (not only Date) after conversion.
    df = df.dropna(subset=["Date", "Close"])

    # A stable sort keeps file order among equal dates, which makes the row
    # kept by drop_duplicates (the first one) deterministic.
    df = df.sort_values(by="Date", kind="mergesort").drop_duplicates(subset=["Date"])

    return df


if __name__ == "__main__":
    # Source CSV to clean; edit this path to process a different file.
    file_path = "nvidia-stocks.csv"
    cleaned_df = load_and_clean_csv(file_path)

    # Show the first ten cleaned rows as a quick sanity check.
    print("CLEANED DATASET PREVIEW:")
    print(cleaned_df.head(10))

    # Cleaned files are written to a "clean" directory, created on demand.
    output_folder = "clean"
    os.makedirs(output_folder, exist_ok=True)

    # Output file name is the input's base name with a "clean_" prefix.
    original_name = os.path.basename(file_path)
    clean_name = f"clean_{original_name}"
    output_path = os.path.join(output_folder, clean_name)

    # An existing output file is never overwritten: re-running this cell
    # leaves a previously saved file untouched and only reports it.
    if os.path.exists(output_path):
        print(f"File already exists: {output_path}")
    else:
        cleaned_df.to_csv(output_path, index=False)
        print(f"Saved cleaned file to: {output_path}")


CLEANED DATASET PREVIEW:
        Date  Open  High   Low  Close  Adj close  Change     Volume
0 2015-08-13  0.59  0.60  0.59   0.59       0.57  -0.84%  276885840
1 2015-08-14  0.59  0.59  0.58   0.59       0.57   0.09%  214650440
2 2015-08-17  0.59  0.59  0.58   0.58       0.57  -0.64%  273878920
3 2015-08-18  0.58  0.58  0.57   0.58       0.56  -1.33%  287082200
4 2015-08-19  0.58  0.58  0.57   0.57       0.56  -0.39%  249273680
5 2015-08-20  0.57  0.57  0.55   0.55       0.54  -3.61%  426933160
6 2015-08-21  0.55  0.56  0.54   0.54       0.52  -3.11%  382665560
7 2015-08-24  0.51  0.55  0.50   0.52       0.50  -3.54%  682282920
8 2015-08-25  0.54  0.54  0.51   0.51       0.49  -2.08%  622684400
9 2015-08-26  0.52  0.55  0.52   0.55       0.53   7.54%  614977560
Saved cleaned file to: clean\clean_nvidia-stocks.csv


  df[col] = df[col].replace('[\$,]', '', regex=True)  # remove $ and ,


In [4]:
# NOTE(review): this cell was executed as In [4], i.e. before the cleaning
# cell above (In [8]), and its saved output has different columns and values
# than the preview printed above. Re-run it after the cleaning cell so the
# displayed frame reflects the current `cleaned_df`.
cleaned_df

Unnamed: 0,Date,Close,Volume,Open,High,Low
0,2015-08-10,47.33,23045530,46.95,47.49,46.8400
1,2015-08-11,46.41,28757610,46.82,46.94,45.9000
2,2015-08-12,46.74,30184710,46.19,46.90,45.7050
3,2015-08-13,46.73,22612650,47.06,47.10,46.4900
4,2015-08-14,47.00,21460830,46.53,47.10,46.5200
...,...,...,...,...,...,...
2510,2025-08-04,535.64,25349000,528.27,538.25,528.1300
2511,2025-08-05,527.75,19171570,537.18,537.30,527.2400
2512,2025-08-06,524.94,21355700,530.90,531.70,524.0300
2513,2025-08-07,520.84,16079140,526.80,528.09,517.5511
