In [1]:
import os
import pandas as pd
import warnings

# Silence every warning category for the remainder of the session
warnings.filterwarnings('ignore')

# Names of all entries (of any type) found in the "data" directory
files = os.listdir("data")

# Load each CSV into a DataFrame, keyed by its file name with the extension removed.
# Entries that do not end in '.csv' are skipped.
data = dict(
    (os.path.splitext(file)[0], pd.read_csv(os.path.join("data", file)))
    for file in files
    if file.endswith('.csv')
)


In [2]:
# Show every entry found in the "data" directory (the listing is not filtered to CSV files)
files

['ameriplex_fuel_sales_updated.csv',
 'ameriplex_sales_regenerated.csv',
 'cumulative_sales_regenerated.csv',
 'fail_road_sales_regenerated.csv',
 'rolling_prairie_sales_regenerated.csv',
 'winona_sales_regenerated.csv']

In [3]:
# Function to clean and prepare the data
def clean_and_concatenate_dataframes(*dataframes):
    """Clean each sales table, save it to disk, and stack all of them into one frame.

    For every ``(location_name, df)`` pair the function works on a copy of
    ``df`` and:

    1. parses the 'Month' column with ``pd.to_datetime``;
    2. adds a 'Location' column holding ``location_name`` (skipped when the
       name is empty);
    3. strips surrounding whitespace from 'Product Category' when that
       column is present;
    4. drops fully duplicated rows;
    5. fills missing 'Sales' values from the preceding row (forward fill),
       falling back to the following row for gaps at the very top, and only
       then casts 'Sales' to ``int``;
    6. writes the result to ``CleanedData/<location_name>_cleaned.csv``.

    A short report (duplicates removed, per-column missing counts before and
    after imputation) is printed for each table.

    Parameters
    ----------
    *dataframes : tuple of (str, pandas.DataFrame)
        One pair per table. Each frame must contain 'Month' and 'Sales'
        columns. The frames passed in are not modified.

    Returns
    -------
    pandas.DataFrame
        All cleaned tables concatenated row-wise with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no pairs are given, or if a 'Sales' column still contains missing
        values after filling (i.e. the column has no usable value at all).

    Notes
    -----
    The fill follows row order, so a gap takes its value from the neighbouring
    row regardless of that row's product category.
    """
    if not dataframes:
        raise ValueError(
            "clean_and_concatenate_dataframes needs at least one "
            "(location_name, DataFrame) pair"
        )

    # Make sure the output directory exists so the first save cannot fail
    os.makedirs("CleanedData", exist_ok=True)

    cleaned_dfs = []

    for location_name, df in dataframes:
        print(f"Cleaning dataframe for {location_name} sales data...")

        # Work on a private copy so the caller's dataframe is left untouched
        df = df.copy()

        # Convert 'Month' column to datetime if not already
        df['Month'] = pd.to_datetime(df['Month'])

        # Add the location name as a new column
        if location_name:
            df['Location'] = location_name

        # Remove leading/trailing spaces from text columns (if applicable)
        if 'Product Category' in df.columns:
            df['Product Category'] = df['Product Category'].str.strip()

        # Check for and remove duplicate rows
        original_length = len(df)
        df = df.drop_duplicates()
        duplicates_removed = original_length - len(df)
        print(f"Removed {duplicates_removed} duplicate rows in {location_name}.")

        # Count missing values before imputation
        missing_before_impute = df.isnull().sum()
        print(f"Missing values before imputation for {location_name}:\n{missing_before_impute}")

        # Impute first, cast second: an integer column cannot hold NaN, so
        # casting before filling would raise on the very values we want to fix.
        # Assigning the filled Series back (instead of a chained inplace call)
        # is what actually updates the frame under pandas Copy-on-Write.
        filled_sales = df['Sales'].ffill().bfill()
        df = df.assign(Sales=filled_sales.astype(int))

        # Count missing values after imputation
        missing_after_impute = df.isnull().sum()
        print(f"Missing values after imputation for {location_name}:\n{missing_after_impute}\n\n")

        # Saving the cleaned dataframes into a directory.
        # The row index is written as the first column, as before.
        df.to_csv(os.path.join("CleanedData", location_name + "_cleaned.csv"))

        cleaned_dfs.append(df)

    # Concatenate all dataframes into a single dataframe
    product_sales_data = pd.concat(cleaned_dfs, ignore_index=True)

    return product_sales_data


In [4]:
# Run the cleaning pipeline over the four store-level sales tables and stack the results.
# Each dict item unpacks into one (location_name, dataframe) pair, in the order written.
product_sales_data = clean_and_concatenate_dataframes(
    *{
        "ameriplex": data["ameriplex_sales_regenerated"],
        "fail_road": data["fail_road_sales_regenerated"],
        "rolling_prairie": data["rolling_prairie_sales_regenerated"],
        "winona": data["winona_sales_regenerated"],
    }.items()
)

Cleaning dataframe for ameriplex sales data...
Removed 0 duplicate rows in ameriplex.
Missing values before imputation for ameriplex:
Month               0
Product Category    0
Sales               0
Location            0
dtype: int64
Missing values after imputation for ameriplex:
Month               0
Product Category    0
Sales               0
Location            0
dtype: int64


Cleaning dataframe for fail_road sales data...
Removed 0 duplicate rows in fail_road.
Missing values before imputation for fail_road:
Month               0
Product Category    0
Sales               0
Location            0
dtype: int64
Missing values after imputation for fail_road:
Month               0
Product Category    0
Sales               0
Location            0
dtype: int64


Cleaning dataframe for rolling_prairie sales data...
Removed 0 duplicate rows in rolling_prairie.
Missing values before imputation for rolling_prairie:
Month               0
Product Category    0
Sales               0
Location    

In [5]:
# Save the combined dataframe to CleanedData/product_sales_data_with_locations.csv.
# Nothing is displayed here; to_csv is called without index=False, so the row
# index is written as the first column of the file.
product_sales_data.to_csv(os.path.join("CleanedData", "product_sales_data_with_locations.csv"))

In [6]:
# Clean the ameriplex fuel sales data and the cumulative sales data.
# Each table goes through its own call, so the two are cleaned and saved
# separately rather than being concatenated with each other.

# The return value of this first call is discarded; only the printed report
# and the CSV written by the function remain.
clean_and_concatenate_dataframes(
    ("ameriplex_fuel", data["ameriplex_fuel_sales_updated"]))
# An empty location name means no 'Location' column is added and the saved
# file is named "_cleaned.csv". Being the last expression in the cell, the
# dataframe returned by this call is what the notebook displays.
clean_and_concatenate_dataframes(
    ("", data["cumulative_sales_regenerated"]))


Cleaning dataframe for ameriplex_fuel sales data...
Removed 0 duplicate rows in ameriplex_fuel.
Missing values before imputation for ameriplex_fuel:
Month        0
Fuel Type    0
Sales        0
Location     0
dtype: int64
Missing values after imputation for ameriplex_fuel:
Month        0
Fuel Type    0
Sales        0
Location     0
dtype: int64


Cleaning dataframe for  sales data...
Removed 0 duplicate rows in .
Missing values before imputation for :
Month               0
Product Category    0
Sales               0
dtype: int64
Missing values after imputation for :
Month               0
Product Category    0
Sales               0
dtype: int64




Unnamed: 0,Month,Product Category,Sales
0,2022-01-01,Cigarettes,1016
1,2022-01-01,Other Tobacco,971
2,2022-01-01,Beer,1072
3,2022-01-01,Wine,806
4,2022-01-01,Packaged Beverages-nonalcoh,1230
...,...,...,...
1171,2023-12-01,Money Transfers,1138
1172,2023-12-01,No Scan Merch Radiant,1067
1173,2023-12-01,Novelty,1015
1174,2023-12-01,Phone Card Fee,1330
