In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tensorflow
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Function to read CSV files
def read_csv_files(data_dir=r"D:\JupyterNotebooks"):
    """Load the features, sales and stores CSV files from one directory.

    Parameters
    ----------
    data_dir : str or os.PathLike, optional
        Directory that contains the three CSV files. Defaults to the folder
        that was previously hard-coded, so existing calls with no arguments
        read exactly the same files as before.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_features, df_sales, df_stores)`` in that order.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when a file is missing.
    """
    # Local import so the function works without touching the notebook's import cell.
    from pathlib import Path

    base_dir = Path(data_dir)
    df_features = pd.read_csv(base_dir / "Features data set.csv")
    df_sales = pd.read_csv(base_dir / "sales data-set.csv")
    df_stores = pd.read_csv(base_dir / "stores data-set.csv")
    return df_features, df_sales, df_stores

# Function to process date columns
def process_dates(df_features, df_sales):
    """Convert the 'Date' column of both frames to datetimes, in place.

    Values that cannot be parsed become NaT (``errors='coerce'``). Both input
    frames are modified and then returned as ``(df_features, df_sales)``.
    """
    # NOTE(review): format='mixed' infers the format of every value separately;
    # ambiguous day/month strings are read month-first unless dayfirst=True is
    # passed — confirm against the date layout used in the CSV files.
    for frame in (df_features, df_sales):
        frame['Date'] = pd.to_datetime(frame['Date'], format='mixed', errors='coerce')
    return df_features, df_sales

# Function to merge datasets
def merge_datasets(df_sales, df_features, df_stores):
    """Left-join features and store metadata onto the sales rows.

    Sales are joined to features on ('Store', 'Date', 'IsHoliday'), and the
    result is joined to the stores table on 'Store'. Every sales row is kept;
    unmatched rows get missing values in the joined columns.
    """
    sales_with_features = df_sales.merge(
        df_features,
        how='left',
        on=['Store', 'Date', 'IsHoliday'],
    )
    return sales_with_features.merge(df_stores, how='left', on='Store')

# Function to fill missing values
def fill_missing_values(df):
    """Return a copy of ``df`` in which every missing value is replaced by 0.

    The input frame itself is left unchanged.
    """
    return df.fillna(value=0)

# Function to extract date components
def extract_date_components(df):
    """Add 'Year', 'Month', 'Day' and 'WeekOfYear' columns derived from 'Date'.

    The columns are written onto ``df`` itself (in that order) and the same
    frame is returned. 'WeekOfYear' is the ISO calendar week.
    """
    date_parts = df['Date'].dt
    derived = {
        'Year': date_parts.year,
        'Month': date_parts.month,
        'Day': date_parts.day,
        'WeekOfYear': date_parts.isocalendar().week,
    }
    for column_name, values in derived.items():
        df[column_name] = values
    return df

# Function to drop the Date column
def drop_date_column(df):
    """Remove the 'Date' column from ``df`` in place and return the same frame.

    Raises KeyError if the frame has no 'Date' column.
    """
    df.drop('Date', axis=1, inplace=True)
    return df

# Function to separate numerical and categorical variables
def separate_variables(df):
    """Split the column names of ``df`` into categorical and numerical groups.

    Returns
    -------
    tuple
        ``(cat_cols, num_cols)``: the fixed list of categorical column names,
        and an Index of every other column of ``df`` in its original order.
        A KeyError is raised if any categorical column is absent from ``df``.
    """
    categorical_names = ['Store', 'Dept', 'IsHoliday', 'Type']
    remaining_names = df.columns.drop(categorical_names)
    return categorical_names, remaining_names

# Function to encode categorical features
def encode_categorical_features(df, cat_cols):
    """Replace each column in ``cat_cols`` with integer labels, in place.

    Every column is cast to ``str`` before encoding, so numeric identifiers
    are encoded through their string form. One encoder object is re-fitted
    for each column. ``df`` is modified and returned.
    """
    encoder = LabelEncoder()

    def _to_codes(column):
        # Re-fit on this column only; labels are independent between columns.
        return encoder.fit_transform(column.astype(str))

    df[cat_cols] = df[cat_cols].apply(_to_codes)
    return df

# Function to standardize numerical features
def standardize_numerical_features(df, num_cols):
    """Standardize the ``num_cols`` columns of ``df`` in place.

    A fresh StandardScaler is fitted on those columns and its transformed
    values overwrite them. ``df`` is modified and returned.
    """
    scaled_values = StandardScaler().fit_transform(df[num_cols])
    df[num_cols] = scaled_values
    return df

# Function to drop outliers using z-score method
def drop_outliers_zscore(df, columns, threshold=3):
    """Keep the rows whose absolute z-score is within ``threshold`` in every column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    columns : list-like of column labels
        Columns whose z-scores are checked.
    threshold : float, optional
        Largest absolute z-score that is still kept (default 3, the value
        that used to be hard-coded).

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` (all columns) that pass the check in every
        inspected column.

    Notes
    -----
    Columns whose standard deviation is zero or undefined cannot contain
    outliers and are skipped. Dividing by such a spread would produce NaN
    z-scores, and because ``NaN <= threshold`` is False that single column
    would otherwise discard every row. Rows with a missing value in a varying
    column are still dropped, as before.
    """
    subset = df[columns]
    spread = subset.std()

    # Only columns with a strictly positive spread take part in the test.
    varying = spread[spread > 0].index
    centered = subset[varying] - subset[varying].mean()
    z_scores = (centered / spread[varying]).abs()

    within_limit = (z_scores <= threshold).all(axis=1)
    return df[within_limit]

# Function to save columns before dropping
def save_dropped_columns(df, cols_to_drop):
    """Return an independent copy of the ``cols_to_drop`` columns of ``df``.

    Later changes to the returned frame do not affect ``df``.
    """
    selected = df[cols_to_drop]
    return selected.copy(deep=True)

# Main preprocessing function
def preprocessing():
    """Run the full preprocessing pipeline on the three CSV data sets.

    Steps, in order: read the files, parse dates, merge, fill missing values
    with 0, derive date components, drop 'Date', label-encode the categorical
    columns, standardize the remaining columns, remove z-score outliers, and
    finally drop a fixed set of columns.

    Returns
    -------
    tuple
        ``(df_final, dropped_columns)``: the processed frame without the
        dropped columns, and a copy of those columns taken after outlier
        removal.
    """
    features_raw, sales_raw, stores_raw = read_csv_files()
    features_dated, sales_dated = process_dates(features_raw, sales_raw)

    merged = merge_datasets(sales_dated, features_dated, stores_raw)
    filled = fill_missing_values(merged)
    with_date_parts = extract_date_components(filled)
    without_date = drop_date_column(with_date_parts)

    cat_cols, num_cols = separate_variables(without_date)
    encoded = encode_categorical_features(without_date, cat_cols)
    standardized = standardize_numerical_features(encoded, num_cols)
    inliers = drop_outliers_zscore(standardized, num_cols)

    # Keep a copy of the columns that are about to be removed.
    cols_to_drop = ['IsHoliday', 'Temperature', 'Fuel_Price', 'Day', 'Year']
    dropped_columns = save_dropped_columns(inliers, cols_to_drop)

    df_final = inliers.drop(columns=cols_to_drop)
    return df_final, dropped_columns

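# Usage: df_final, dropped_columns = preprocessing()
# The commented-out lines below mirror individual steps inside preprocessing().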

# df_extracted = extract_date_components(df_filled)
# df_no_date = drop_date_column(df_extracted)
# cat_cols, num_cols = separate_variables(df_no_date)
# df_encoded = encode_categorical_features(df_no_date, cat_cols)
# df_standardized = standardize_numerical_features(df_encoded, num_cols)
# df_cleaned = drop_outliers_zscore(df_standardized, num_cols)


In [3]:
def add_id_column(df):
    """Add an 'ID' column numbering the rows 1..len(df), in place.

    The numbering is positional and does not depend on the index of ``df``.
    The ids are generated directly rather than through ``df.reset_index()``,
    which copies the whole frame and raises ValueError when the index name
    matches an existing column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that receives (or has overwritten) the 'ID' column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now with the 'ID' column.
    """
    df['ID'] = np.arange(1, len(df) + 1, dtype='int64')
    return df




In [4]:
# Marker printed when this cell runs.
# NOTE(review): the message is unconditional and no call to preprocessing() is
# visible in this notebook section, so it appears to confirm only that the
# cells above executed (functions defined) — confirm.
print('Preprocessing file executed successfully.')

Preprocessing file executed successfully.
