In [1]:
import pandas as pd
import os
import datetime as dt

In [2]:
# Relative path to the sales CSV; this value is passed to load_data() in a later cell.
filepath = '../data/online_retail_sales_dataset.csv'

In [3]:
def load_data(filepath):
    """Read the CSV at ``filepath`` into a DataFrame, or return None.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame or None
        The parsed file. None is returned when ``filepath`` is not a string,
        is empty or whitespace-only, does not exist on disk, or when
        ``pd.read_csv`` raises (the error is printed before returning).
    """
    # Short-circuit order matters: the type check guards the str-only calls.
    path_is_usable = (
        isinstance(filepath, str)
        and filepath.strip() != ""
        and os.path.exists(filepath)
    )
    if not path_is_usable:
        return None

    try:
        return pd.read_csv(filepath)
    except Exception as read_error:
        print(f"Error loading CSV: {read_error}")
        return None

In [4]:
# Bare last expression of the cell: the notebook displays whatever load_data
# returns; the result is not bound to a name here.
load_data(filepath)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,536366,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:28:00,2.75,17850,United Kingdom
3,536367,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:34:00,3.39,13047,United Kingdom
4,536368,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:35:00,3.39,13047,United Kingdom
5,536369,22752,SET 7 BABUSHKA NESTING BOXES,2,2010-12-01 08:45:00,7.65,12583,France
6,536370,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,2010-12-01 08:50:00,4.25,12583,France


In [5]:
def clean_data(df):
    """Return a cleaned copy of the sales frame; the input is left untouched.

    Steps, applied in this order:

    1. drop rows with a missing ``CustomerID``
    2. coerce ``Quantity`` to numeric and keep rows with ``Quantity >= 0``
    3. coerce ``UnitPrice`` to numeric and keep rows with ``UnitPrice >= 0``
    4. parse ``InvoiceDate`` as datetime and drop rows that fail to parse
    5. drop fully duplicated rows (first occurrence is kept)
    6. drop rows with a missing ``Description``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns ``CustomerID``, ``Quantity``,
        ``UnitPrice``, ``InvoiceDate`` and ``Description``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows. The original index labels are
        kept (the index is not reset).

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # dropna returns a new frame, so the caller's DataFrame is never modified.
    cleaned = df.dropna(subset=['CustomerID'])

    # assign() builds a fresh frame on every call. Writing a column directly
    # into a boolean-filtered frame (frame[col] = ...) would instead raise
    # SettingWithCopyWarning on pandas without copy-on-write.
    cleaned = cleaned.assign(
        Quantity=pd.to_numeric(cleaned['Quantity'], errors='coerce')
    )
    # Values coerced to NaN fail the comparison, so they are dropped here too.
    cleaned = cleaned[cleaned['Quantity'] >= 0]

    cleaned = cleaned.assign(
        UnitPrice=pd.to_numeric(cleaned['UnitPrice'], errors='coerce')
    )
    cleaned = cleaned[cleaned['UnitPrice'] >= 0]

    cleaned = cleaned.assign(
        InvoiceDate=pd.to_datetime(cleaned['InvoiceDate'], errors='coerce')
    )
    cleaned = cleaned.dropna(subset=['InvoiceDate'])

    cleaned = cleaned.drop_duplicates()

    return cleaned.dropna(subset=['Description'])
    

In [6]:
def calculate_revenue(df):
    """Add a ``Revenue`` column equal to ``UnitPrice * Quantity``.

    The column is written into ``df`` itself, and that same object is
    returned; no copy is made.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame expected to contain ``UnitPrice`` and ``Quantity`` columns.

    Returns
    -------
    pandas.DataFrame or None
        ``df`` with the new column, or None when either input column is
        missing (in which case ``df`` is left unchanged).
    """
    has_inputs = 'UnitPrice' in df.columns and 'Quantity' in df.columns
    if not has_inputs:
        return None

    unit_price, quantity = df['UnitPrice'], df['Quantity']
    df['Revenue'] = unit_price * quantity
    return df

def aggregate_revenue_by_country(df):
    """Sum ``Revenue`` for each distinct ``Country``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame expected to contain ``Country`` and ``Revenue`` columns.

    Returns
    -------
    pandas.Series or None
        Total revenue indexed by country, or None when either required
        column is missing.
    """
    # Both columns are read below, so both are checked; a missing 'Revenue'
    # would otherwise surface as a KeyError instead of the None convention.
    if 'Country' not in df.columns or 'Revenue' not in df.columns:
        return None

    return df.groupby('Country')['Revenue'].sum()

def aggregate_revenue_by_product(df):
    """Sum ``Revenue`` for each distinct product ``Description``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame expected to contain ``Description`` and ``Revenue`` columns.

    Returns
    -------
    pandas.Series or None
        Total revenue indexed by description, or None when either required
        column is missing.
    """
    # Both columns are read below, so both are checked; a missing 'Revenue'
    # would otherwise surface as a KeyError instead of the None convention.
    if 'Description' not in df.columns or 'Revenue' not in df.columns:
        return None

    return df.groupby('Description')['Revenue'].sum()

In [None]:
def filter_data_by_date(df, start_date, end_date):
    """Return the rows whose ``InvoiceDate`` lies in ``[start_date, end_date]``.

    Both bounds are inclusive. The caller's frame is not modified: the
    datetime conversion happens on a copy, and the returned frame carries the
    converted ``InvoiceDate`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``InvoiceDate`` column.
    start_date, end_date : str or datetime-like
        Range bounds, parsed with ``pd.to_datetime``. A date-only value is
        parsed as midnight of that day.

    Returns
    -------
    pandas.DataFrame or None
        The matching rows, or None when ``df`` is not a DataFrame, has no
        ``InvoiceDate`` column, or either bound cannot be parsed. Rows whose
        ``InvoiceDate`` cannot be parsed never match.
    """
    if not isinstance(df, pd.DataFrame):
        return None

    if 'InvoiceDate' not in df.columns:
        return None

    # Validate the bounds before doing any work on the frame.
    start_date = pd.to_datetime(start_date, errors='coerce')
    end_date = pd.to_datetime(end_date, errors='coerce')
    if pd.isna(start_date) or pd.isna(end_date):
        return None

    # The keyword is `errors` (plural); `error=` makes to_datetime raise
    # TypeError. assign() converts on a copy so the input stays untouched.
    dated = df.assign(
        InvoiceDate=pd.to_datetime(df['InvoiceDate'], errors='coerce')
    )

    # Comparisons against NaT are False, so unparseable dates are excluded.
    in_range = (dated['InvoiceDate'] >= start_date) & (dated['InvoiceDate'] <= end_date)
    return dated[in_range]

In [None]:
def save_clean_data(df, output_path):
    