In [15]:
import os
import shutil

import pandas as pd

In [16]:

# Source CSV files that the extraction step depends on.
required_files = [
    'customers_large.csv',
    'orders_large.csv',
    'returns_large.csv',
]

# Collects every required file that turns out to be missing or empty.
# It is re-created here, so re-running the cell starts from a clean list.
missing_or_empty_files = list()

# Verify that every required file can be loaded and holds data; stop the run
# and name the offending files otherwise.
def check_missing_or_empty(file_path, name=None):
    """Record a loaded file as unusable unless it is a non-empty DataFrame.

    Parameters
    ----------
    file_path : object
        The object produced by loading the file (normally a DataFrame).
        Despite its name this is the loaded data, not a path; the parameter
        name is kept so existing callers keep working.
    name : str, optional
        Label appended to ``missing_or_empty_files`` when the check fails.
        When omitted, the object itself is appended.
    """
    # Test the type first: `.empty` is only looked up once the object is known
    # to be a DataFrame, so other objects are recorded instead of raising
    # AttributeError.
    if not isinstance(file_path, pd.DataFrame) or file_path.empty:
        missing_or_empty_files.append(file_path if name is None else name)


for file in required_files:
    try:
        df = pd.read_csv(f"../{file}")
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # EmptyDataError is what read_csv raises for a file with no data or
        # header (e.g. a zero-byte file); treat it like a missing file.
        missing_or_empty_files.append(file)
    else:
        # Pass the file name so the report lists names, not dataframes.
        check_missing_or_empty(df, name=file)

if missing_or_empty_files:
    print(f"The following required files are missing or empty: {missing_or_empty_files}")
    print("Exiting program....")
    # The exception must be raised: merely constructing SystemExit(1) has no
    # effect and would let the following cells run on incomplete data.
    raise SystemExit(1)
else:
    print("All required files are present and loaded successfully.... proceeding to next step................")

All required files are present and loaded successfully.... proceeding to next step................


In [17]:
# Load each source CSV into its own dataframe; these frames feed the column
# checks and the export to the extract directory further down.
customers_data = pd.read_csv(filepath_or_buffer='../customers_large.csv')
orders_data = pd.read_csv(filepath_or_buffer='../orders_large.csv')
returns_data = pd.read_csv(filepath_or_buffer='../returns_large.csv')

# Columns that must be present in each dataframe before the run may continue.
required_customers_columns = [
    'customer_id',
    'name',
    'signup_date',
    'region',
]
required_orders_columns = [
    'order_id',
    'customer_id',
    'order_date',
    'amount',
    'product_category',
]
required_returns_columns = [
    'order_id',
    'return_date',
    'return_reason',
]

# Required columns found to be absent; starts empty each time the cell runs.
missing_columns_error = list()

# Check each dataframe for its required columns and stop the run if any are absent.
def check_required_columns(data, required_columns, source=None):
    """Record every required column that is absent from ``data``.

    Parameters
    ----------
    data : object exposing ``.columns``
        The dataframe to inspect.
    required_columns : iterable of str
        Column names that must appear in ``data.columns``.
    source : str, optional
        Label of the dataset being checked. When given, each entry appended to
        ``missing_columns_error`` reads ``"<column> (in <source>)"`` so that a
        column required by several datasets can be traced to the right one.
        When omitted, the bare column name is appended.
    """
    for column in required_columns:
        if column not in data.columns:
            missing_columns_error.append(
                column if source is None else f"{column} (in {source})"
            )


check_required_columns(customers_data, required_customers_columns, source='customers_large.csv')
check_required_columns(orders_data, required_orders_columns, source='orders_large.csv')
check_required_columns(returns_data, required_returns_columns, source='returns_large.csv')

if missing_columns_error:
    print(f"The following required columns are missing: {', '.join(missing_columns_error)}")
    print("Exiting program....")
    # The exception must be raised: merely constructing SystemExit(1) has no
    # effect and would let the export run on frames with missing columns.
    raise SystemExit(1)
else:
    print("All required columns are present in the dataframes.... proceeding to next step................")

All required columns are present in the dataframes.... proceeding to next step................


In [18]:
# Re-save the data as CSV files (without the index) for the transformation step.

# Directory that receives the extracted CSV files.
extract_dir = '../extract'

# Create the directory when it does not exist yet. exist_ok=True replaces the
# separate os.path.exists() test, so there is no gap between checking and
# creating; an already existing directory is left untouched.
os.makedirs(extract_dir, exist_ok=True)

# Remove whatever is already inside the extract directory.
for filename in os.listdir(extract_dir):
    file_path = os.path.join(extract_dir, filename)  # full path of this entry
    try:
        # Files and symlinks are unlinked; a symlink is handled here even when
        # it points at a directory, so only real directories reach rmtree.
        if os.path.islink(file_path) or os.path.isfile(file_path):
            os.unlink(file_path)
            continue
        if os.path.isdir(file_path):
            shutil.rmtree(file_path)  # removes the directory and its contents
    except Exception as err:
        # Best effort: report the failure and move on to the next entry.
        print(f'Failed to delete {file_path}. Reason: {err}')
        
# Save each dataframe as a CSV file (no index column) in the extract directory.
# One mapping drives both the export and the row report, and the target paths
# are built from extract_dir, so the files always land in the directory that
# was prepared above even if extract_dir is changed.
extracted_frames = {
    'customers_large.csv': customers_data,
    'orders_large.csv': orders_data,
    'returns_large.csv': returns_data,
}

rows_extracted = {}
for file, frame in extracted_frames.items():
    frame.to_csv(os.path.join(extract_dir, file), index=False)
    rows_extracted[file] = len(frame)

# Report how many rows were written for each file.
for file, rows in rows_extracted.items():
    print(f"Extracted {rows} rows from {file}")

print("Dataframes saved as CSV files in the extract directory successfully.... proceeding to next step................")
print("Extraction step completed successfully.")

Extracted 1000 rows from customers_large.csv
Extracted 5000 rows from orders_large.csv
Extracted 500 rows from returns_large.csv
Dataframes saved as CSV files in the extract directory successfully.... proceeding to next step................
Extraction step completed successfully.
