In [17]:
import pandas as pd

def clean_olist_order_reviews(input_path, output_path):
    """Clean the order-reviews CSV and write it back out without a header row.

    The file at ``input_path`` is loaded, ``review_score`` is coerced to a
    numeric dtype, and the two review timestamp columns are parsed as
    datetimes (values that cannot be parsed become NaN/NaT). Carriage-return
    characters are then stripped from string cells and the frame is written
    to ``output_path`` with neither index nor header.

    Args:
        input_path: Location of the raw reviews CSV.
        output_path: Destination for the cleaned, headerless CSV.
    """
    reviews = pd.read_csv(input_path)

    # Unparseable scores become NaN instead of raising.
    reviews['review_score'] = pd.to_numeric(reviews['review_score'], errors='coerce')

    # Both timestamp columns get the same lenient parsing; bad values become NaT.
    for date_column in ('review_creation_date', 'review_answer_timestamp'):
        reviews[date_column] = pd.to_datetime(reviews[date_column], errors='coerce')

    # Regex mode removes the carriage return wherever it occurs inside a string cell.
    reviews = reviews.replace({'\r': ''}, regex=True)

    # The header row is deliberately omitted from the output file.
    reviews.to_csv(output_path, header=False, index=False)



def clean_olist_order_items(input_path, output_path):
    """Clean the order-items CSV and write it back out without a header row.

    ``order_item_id``, ``price`` and ``freight_value`` are coerced to numeric
    dtypes and ``shipping_limit_date`` is parsed as a datetime; values that
    cannot be parsed become NaN/NaT. Rows whose ``shipping_limit_date`` or
    ``price`` is missing after coercion are dropped. The
    ``shipping_limit_date`` column is used only for that filter and is not
    written to the output.

    Args:
        input_path: Location of the raw order-items CSV.
        output_path: Destination for the cleaned, headerless CSV.
    """
    # Load the data
    df = pd.read_csv(input_path)

    # Coerce the numeric columns; invalid values become NaN rather than raising
    for column in ('order_item_id', 'price', 'freight_value'):
        df[column] = pd.to_numeric(df[column], errors='coerce')

    # Parse the shipping deadline; invalid values become NaT
    df['shipping_limit_date'] = pd.to_datetime(df['shipping_limit_date'], errors='coerce')

    # Only a valid date and price are required: rows with a missing
    # order_item_id or freight_value are kept (written as empty fields).
    df = df.dropna(subset=['shipping_limit_date', 'price'])

    # The date column served only as a row filter and is excluded from the output
    df = df.drop(columns=['shipping_limit_date'])

    # Save cleaned data
    df.to_csv(output_path, index=False, header=False)  # Save without headers




def clean_olist_order_payments(input_path, output_path):
    """Clean the order-payments CSV and write it back out without a header row.

    ``payment_sequential``, ``payment_installments`` and ``payment_value`` are
    coerced to numeric dtypes, with unparseable values becoming NaN. No rows
    are dropped. The result is written to ``output_path`` with neither index
    nor header.

    Args:
        input_path: Location of the raw order-payments CSV.
        output_path: Destination for the cleaned, headerless CSV.
    """
    payments = pd.read_csv(input_path)

    # All three columns receive the same lenient numeric coercion.
    numeric_columns = ('payment_sequential', 'payment_installments', 'payment_value')
    for column in numeric_columns:
        payments[column] = pd.to_numeric(payments[column], errors='coerce')

    # The header row is deliberately omitted from the output file.
    payments.to_csv(output_path, header=False, index=False)


# Raw Olist CSV exports, keyed by dataset name
input_paths = dict(
    reviews='/Users/anastasyarussu/Downloads/archive/olist_order_reviews_dataset.csv',
    items='/Users/anastasyarussu/Downloads/archive/olist_order_items_dataset.csv',
    payments='/Users/anastasyarussu/Downloads/archive/olist_order_payments_dataset.csv',
)

# Destinations for the cleaned copies, using the same dataset keys
output_paths = dict(
    reviews='/Users/anastasyarussu/Downloads/archive/olist_order_reviews_dataset_cleaned.csv',
    items='/Users/anastasyarussu/Downloads/archive/olist_order_items_dataset_cleaned.csv',
    payments='/Users/anastasyarussu/Downloads/archive/olist_order_payments_dataset_cleaned.csv',
)

# Pair each dataset key with its cleaning function; dict order sets the run order
cleaners = {
    'reviews': clean_olist_order_reviews,
    'items': clean_olist_order_items,
    'payments': clean_olist_order_payments,
}
for dataset, cleaner in cleaners.items():
    cleaner(input_paths[dataset], output_paths[dataset])

print("Data cleaning completed. Cleaned files saved to specified output paths.")


Data cleaning completed. Cleaned files saved to specified output paths.
