In [1]:
# Preprocessing: strip the extra second row from each raw CSV, then clean the data into data_cleaned/

In [34]:
import os
import pandas as pd

# Directory holding the raw CSV files (e.g. AMZN.csv). This is a relative path,
# so it is resolved against the notebook's current working directory.
DATA_DIR = "data"

def remove_second_row(file_path):
    """Load a CSV and return it without its second data row.

    ``pd.read_csv`` consumes the first line of the file as the column names,
    so the "second row" removed here is the row at index 1 of the resulting
    DataFrame, i.e. the third physical line of the file.

    Args:
        file_path: Path of the CSV file to read. The file itself is not modified.

    Returns:
        pandas.DataFrame: the file's contents with the index-1 row removed and
        the index renumbered from 0. A file with fewer than two data rows is
        returned unchanged rather than raising ``KeyError``.
    """
    df = pd.read_csv(file_path)

    # errors="ignore": an empty or single-row file has no label 1 to drop.
    # reset_index(drop=True) closes the gap left by the removed row.
    return df.drop(index=1, errors="ignore").reset_index(drop=True)

# Strip the second row from every CSV in DATA_DIR, overwriting each file.
# NOTE(review): the source files are rewritten in place, so every re-run of this
# cell removes one more row from each file. Run it exactly once per raw dataset.
for file in os.listdir(DATA_DIR):
    # Anything that is not a CSV is left untouched
    if not file.endswith(".csv"):
        continue

    file_path = os.path.join(DATA_DIR, file)
    print(f"Processing: {file}")

    # Load without the second row, then write straight back to the same path
    df_cleaned = remove_second_row(file_path)
    df_cleaned.to_csv(file_path, index=False)
    print(f"✅ Saved cleaned data back to original: {file_path}")

print("✅ All datasets have been processed and saved!")


Processing: AMZN.csv
✅ Saved cleaned data back to original: data\AMZN.csv
Processing: GOOGL.csv
✅ Saved cleaned data back to original: data\GOOGL.csv
Processing: JPM.csv
✅ Saved cleaned data back to original: data\JPM.csv
Processing: META.csv
✅ Saved cleaned data back to original: data\META.csv
Processing: MSFT.csv
✅ Saved cleaned data back to original: data\MSFT.csv
Processing: NFLX.csv
✅ Saved cleaned data back to original: data\NFLX.csv
Processing: NVDA.csv
✅ Saved cleaned data back to original: data\NVDA.csv
Processing: TSLA.csv
✅ Saved cleaned data back to original: data\TSLA.csv
Processing: V.csv
✅ Saved cleaned data back to original: data\V.csv
✅ All datasets have been processed and saved!


In [37]:
import os
import pandas as pd

# Input and output locations, both relative to the current working directory:
# CSVs are read from DATA_DIR and the preprocessed copies are written to
# CLEANED_DATA_DIR.
DATA_DIR = "data"
CLEANED_DATA_DIR = "data_cleaned"

# Make sure the output directory exists; exist_ok=True keeps re-runs from failing
os.makedirs(CLEANED_DATA_DIR, exist_ok=True)

def preprocess_stock_data(file_path):
    """Load one stock CSV and return a typed, date-sorted DataFrame.

    Steps, in order:
      1. Rename the 'Price' column to 'Date'.
      2. Coerce Close/High/Low/Open/Volume to numeric; unparseable cells become NaN.
      3. Drop rows in which *all five* of those columns are NaN (rows with only
         some values missing are kept).
      4. Parse 'Date' as datetime and drop rows whose date cannot be parsed.
      5. Sort ascending by 'Date' and renumber the index from 0.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The cleaned pandas.DataFrame, or None (after printing a warning) when
        the 'Price' column is absent, a required column is missing, or the
        date conversion raises.
    """
    frame = pd.read_csv(file_path)

    # Without a 'Price' column there is nothing to turn into 'Date'
    if "Price" not in frame.columns:
        print(f"⚠ Warning: 'Price' column not found in {file_path}. Skipping file.")
        return None
    frame = frame.rename(columns={"Price": "Date"})

    # Every column the rest of the pipeline touches must be present
    value_cols = ["Close", "High", "Low", "Open", "Volume"]
    if not {"Date", *value_cols} <= set(frame.columns):
        print(f"⚠ Warning: Missing required columns in {file_path}")
        return None

    # Force the value columns to numbers, then discard rows that carry no
    # numeric data at all (how="all": partially filled rows survive)
    frame[value_cols] = frame[value_cols].apply(pd.to_numeric, errors="coerce")
    frame = frame.dropna(subset=value_cols, how="all")

    # Unparseable dates become NaT; any other conversion failure aborts the file
    try:
        frame = frame.assign(Date=pd.to_datetime(frame["Date"], errors="coerce"))
    except Exception as e:
        print(f"⚠ Error converting 'Date' column in {file_path}: {e}")
        return None

    # Remove undated rows, order chronologically, and renumber from 0
    return (
        frame.dropna(subset=["Date"])
        .sort_values(by="Date")
        .reset_index(drop=True)
    )

# Preprocess every CSV in DATA_DIR and write the result to CLEANED_DATA_DIR
# under the same file name; the input files are left as they are.
for file in os.listdir(DATA_DIR):
    # Anything that is not a CSV is left untouched
    if not file.endswith(".csv"):
        continue

    file_path = os.path.join(DATA_DIR, file)
    print(f"Processing: {file}")

    df_cleaned = preprocess_stock_data(file_path)

    # None means the file was rejected (a warning has already been printed)
    if df_cleaned is None:
        continue

    cleaned_file_path = os.path.join(CLEANED_DATA_DIR, file)
    df_cleaned.to_csv(cleaned_file_path, index=False)
    print(f"✅ Saved cleaned data: {cleaned_file_path}")

print("✅ All datasets have been preprocessed successfully!")

Processing: AMZN.csv
✅ Saved cleaned data: data_cleaned\AMZN.csv
Processing: GOOGL.csv
✅ Saved cleaned data: data_cleaned\GOOGL.csv
Processing: JPM.csv
✅ Saved cleaned data: data_cleaned\JPM.csv
Processing: META.csv
✅ Saved cleaned data: data_cleaned\META.csv
Processing: MSFT.csv
✅ Saved cleaned data: data_cleaned\MSFT.csv
Processing: NFLX.csv
✅ Saved cleaned data: data_cleaned\NFLX.csv
Processing: NVDA.csv
✅ Saved cleaned data: data_cleaned\NVDA.csv
Processing: TSLA.csv
✅ Saved cleaned data: data_cleaned\TSLA.csv
Processing: V.csv
✅ Saved cleaned data: data_cleaned\V.csv
✅ All datasets have been preprocessed successfully!
