In [12]:
import pandas as pd

# Read the raw Olist customers extract into memory
file_path = '/Users/shawnwee/teams notes_Generation SCTP JDE 05/Week 8 Final Project/Olist Data/olist_customers_dataset.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Clean the data
def clean_data(df):
    """Return a cleaned copy of the raw Olist customers table.

    The input frame is left untouched. Text columns are trimmed, missing
    values become the literal string "N/A", zip code prefixes are left-padded
    to five characters, rows repeating the same
    (`customer_unique_id`, `customer_zip_code_prefix`) pair are collapsed to
    their first occurrence, and the `customer_id` column is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns `customer_id`, `customer_unique_id`,
        `customer_zip_code_prefix`, `customer_city` and `customer_state`.

    Returns
    -------
    pandas.DataFrame
        A new frame without the `customer_id` column.
    """
    # Work on a copy so re-running the cell never starts from a half-cleaned input.
    df = df.copy()

    # Step 1: Strip leading/trailing whitespace from all text columns.
    # `read_csv` produces `object` columns by default; selecting only the
    # 'string' extension dtype would match none of them and skip this step.
    text_columns = df.select_dtypes(include=['object', 'string']).columns
    for column in text_columns:
        df[column] = df[column].str.strip()

    # Step 2: Replace NULL values with the string "N/A"
    df = df.fillna('N/A')

    # Step 3: Convert columns to appropriate data types
    df = df.astype({
        'customer_id': 'string',
        'customer_unique_id': 'string',
        'customer_zip_code_prefix': 'string',
        'customer_city': 'category',
        'customer_state': 'category',
    })

    # Step 4: Normalize `customer_zip_code_prefix` to ensure all are 5 characters
    df = df.assign(customer_zip_code_prefix=df['customer_zip_code_prefix'].str.zfill(5))

    # Step 5: Drop duplicates based on the specified subset of columns
    df = df.drop_duplicates(subset=['customer_unique_id', 'customer_zip_code_prefix'])

    # Step 6: Drop the `customer_id` column
    return df.drop(columns="customer_id")

# Run the customers cleaning routine on the freshly loaded frame
cleaned_data = clean_data(df=data)

# Persist the result as CSV, leaving the row index out of the file
cleaned_file_path = '/Users/shawnwee/teams notes_Generation SCTP JDE 05/Week 8 Final Project/Cleaned Olist Data/cleaned_olist_customers_dataset.csv'
cleaned_data.to_csv(path_or_buf=cleaned_file_path, index=False)

print(f"Cleaned dataset saved to {cleaned_file_path}")

Cleaned dataset saved to /Users/shawnwee/teams notes_Generation SCTP JDE 05/Week 8 Final Project/Cleaned Olist Data/cleaned_olist_customers_dataset.csv


In [13]:
import pandas as pd
from unidecode import unidecode

# Read the raw Olist geolocation extract into memory
file_path = '/Users/shawnwee/teams notes_Generation SCTP JDE 05/Week 8 Final Project/Olist Data/olist_geolocation_dataset.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Clean the data
def clean_data(df):
    """Return a cleaned copy of the raw Olist geolocation table.

    The input frame is left untouched. Text columns are trimmed, city names
    are transliterated to ASCII with `unidecode`, only the first row of each
    `geolocation_zip_code_prefix` is kept, missing city/state values become
    the literal string "N/A", and zip code prefixes are left-padded to five
    characters. Missing coordinates stay NaN so the float conversion cannot
    fail on a text placeholder.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns `geolocation_zip_code_prefix`,
        `geolocation_lat`, `geolocation_lng`, `geolocation_city` and
        `geolocation_state`.

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per zip code prefix.
    """
    # Work on a copy: the city column is rewritten below and the caller's
    # frame must not change as a side effect.
    df = df.copy()

    # Step 1: Strip leading/trailing whitespace from all text columns.
    # `read_csv` produces `object` columns by default; selecting only the
    # 'string' extension dtype would match none of them and skip this step.
    text_columns = df.select_dtypes(include=['object', 'string']).columns
    for column in text_columns:
        df[column] = df[column].str.strip()

    # Step 2: Transliterate city names to ASCII. Non-text values (e.g. NaN)
    # are passed through because `unidecode` only accepts strings.
    df['geolocation_city'] = df['geolocation_city'].map(
        lambda city: unidecode(city) if isinstance(city, str) else city
    )

    # Step 3: Remove duplicates in `geolocation_zip_code_prefix`, keeping the first occurrence
    df = df.drop_duplicates(subset='geolocation_zip_code_prefix')

    # Step 4: Replace NULL values with the string "N/A" in the text columns only
    df = df.fillna({'geolocation_city': 'N/A', 'geolocation_state': 'N/A'})

    # Step 5: Convert columns to appropriate data types
    df = df.astype({
        'geolocation_zip_code_prefix': 'string',
        'geolocation_lat': 'float64',
        'geolocation_lng': 'float64',
        'geolocation_city': 'category',
        'geolocation_state': 'category',
    })

    # Step 6: Normalize `geolocation_zip_code_prefix` to ensure all are 5 characters
    return df.assign(
        geolocation_zip_code_prefix=df['geolocation_zip_code_prefix'].str.zfill(5)
    )

# Run the geolocation cleaning routine on the freshly loaded frame
cleaned_data = clean_data(df=data)

# Persist the result as CSV, leaving the row index out of the file
cleaned_file_path = '/Users/shawnwee/teams notes_Generation SCTP JDE 05/Week 8 Final Project/Cleaned Olist Data/cleaned_olist_geolocation_dataset.csv'
cleaned_data.to_csv(path_or_buf=cleaned_file_path, index=False)

print(f"Cleaned dataset saved to {cleaned_file_path}")

Cleaned dataset saved to /Users/shawnwee/teams notes_Generation SCTP JDE 05/Week 8 Final Project/Cleaned Olist Data/cleaned_olist_geolocation_dataset.csv


In [14]:
import pandas as pd
from decimal import Decimal

# Read the raw Olist order-items extract into memory
file_path = '/Users/shawnwee/teams notes_Generation SCTP JDE 05/Week 8 Final Project/Olist Data/olist_order_items_dataset.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Clean the data
def clean_data(df):
    """Return a cleaned copy of the raw Olist order-items table.

    The input frame is left untouched. Text columns are trimmed, exact
    repeats of an (`order_id`, `order_item_id`) pair are collapsed to their
    first occurrence, missing identifiers become the literal string "N/A",
    `shipping_limit_date` is parsed to datetime, and the monetary columns are
    converted to `Decimal`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns `order_id`, `order_item_id`, `product_id`,
        `seller_id`, `shipping_limit_date`, `price` and `freight_value`.

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per order line.
    """
    df = df.copy()

    # Step 1: Strip leading/trailing whitespace from all text columns.
    # `read_csv` produces `object` columns by default; selecting only the
    # 'string' extension dtype would match none of them and skip this step.
    text_columns = df.select_dtypes(include=['object', 'string']).columns
    for column in text_columns:
        df[column] = df[column].str.strip()

    # Step 2: Remove duplicated order lines, keeping the first occurrence.
    # `order_item_id` is part of the key: de-duplicating on `order_id` alone
    # would discard every item after the first one of a multi-item order.
    df = df.drop_duplicates(subset=['order_id', 'order_item_id'])

    # Step 3: Replace NULL values with the string "N/A" in the identifier
    # columns only; a text placeholder in numeric or date columns would make
    # the conversions below raise.
    df = df.fillna({'order_id': 'N/A', 'product_id': 'N/A', 'seller_id': 'N/A'})

    # Step 4: Convert columns to appropriate data types
    df = df.astype({
        'order_id': 'string',
        'order_item_id': 'int64',
        'product_id': 'string',
        'seller_id': 'string',
    })
    return df.assign(
        shipping_limit_date=pd.to_datetime(df['shipping_limit_date']),
        # Build each Decimal from the value's text form: Decimal(29.99) would
        # capture the full binary expansion of the float instead of 29.99.
        price=df['price'].map(lambda amount: Decimal(str(amount))),
        freight_value=df['freight_value'].map(lambda amount: Decimal(str(amount))),
    )

# Run the order-items cleaning routine on the freshly loaded frame
cleaned_data = clean_data(df=data)

# Persist the result as CSV, leaving the row index out of the file
cleaned_file_path = '/Users/shawnwee/teams notes_Generation SCTP JDE 05/Week 8 Final Project/Cleaned Olist Data/cleaned_olist_order_items_dataset.csv'
cleaned_data.to_csv(path_or_buf=cleaned_file_path, index=False)

print(f"Cleaned dataset saved to {cleaned_file_path}")

Cleaned dataset saved to /Users/shawnwee/teams notes_Generation SCTP JDE 05/Week 8 Final Project/Cleaned Olist Data/cleaned_olist_order_items_dataset.csv


In [15]:
import pandas as pd
from decimal import Decimal

# Read the raw Olist order-payments extract into memory
file_path = '/Users/shawnwee/teams notes_Generation SCTP JDE 05/Week 8 Final Project/Olist Data/olist_order_payments_dataset.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Clean the data
def clean_data(df):
    """Return a cleaned copy of the raw Olist order-payments table.

    The input frame is left untouched. Text columns are trimmed, exact
    repeats of an (`order_id`, `payment_sequential`) pair are collapsed to
    their first occurrence, missing text values become the literal string
    "N/A", and `payment_value` is converted to `Decimal`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns `order_id`, `payment_sequential`,
        `payment_type`, `payment_installments` and `payment_value`.

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per payment of an order.
    """
    df = df.copy()

    # Step 1: Strip leading/trailing whitespace from all text columns.
    # `read_csv` produces `object` columns by default; selecting only the
    # 'string' extension dtype would match none of them and skip this step.
    text_columns = df.select_dtypes(include=['object', 'string']).columns
    for column in text_columns:
        df[column] = df[column].str.strip()

    # Step 2: Remove duplicated payment rows, keeping the first occurrence.
    # `payment_sequential` is part of the key: de-duplicating on `order_id`
    # alone would discard every payment after the first one of an order.
    df = df.drop_duplicates(subset=['order_id', 'payment_sequential'])

    # Step 3: Replace NULL values with the string "N/A" in the text columns
    # only; a text placeholder in numeric columns would make the conversions
    # below raise.
    df = df.fillna({'order_id': 'N/A', 'payment_type': 'N/A'})

    # Step 4: Convert columns to appropriate data types
    df = df.astype({
        'order_id': 'string',
        'payment_sequential': 'int8',
        'payment_type': 'category',
        'payment_installments': 'int8',
    })
    # Build each Decimal from the value's text form: Decimal(99.33) would
    # capture the full binary expansion of the float instead of 99.33.
    return df.assign(
        payment_value=df['payment_value'].map(lambda amount: Decimal(str(amount)))
    )

# Run the order-payments cleaning routine on the freshly loaded frame
cleaned_data = clean_data(df=data)

# Persist the result as CSV, leaving the row index out of the file
cleaned_file_path = '/Users/shawnwee/teams notes_Generation SCTP JDE 05/Week 8 Final Project/Cleaned Olist Data/cleaned_olist_order_payments_dataset.csv'
cleaned_data.to_csv(path_or_buf=cleaned_file_path, index=False)

print(f"Cleaned dataset saved to {cleaned_file_path}")

Cleaned dataset saved to /Users/shawnwee/teams notes_Generation SCTP JDE 05/Week 8 Final Project/Cleaned Olist Data/cleaned_olist_order_payments_dataset.csv


In [16]:
import pandas as pd
from decimal import Decimal

# Read the raw Olist order-reviews extract into memory
file_path = '/Users/shawnwee/teams notes_Generation SCTP JDE 05/Week 8 Final Project/Olist Data/olist_order_reviews_dataset.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Clean the data
def clean_data(df):
    """Return a cleaned copy of the raw Olist order-reviews table.

    The input frame is left untouched. Text columns are trimmed, only the
    first row of each `order_id` is kept, missing text values become the
    literal string "N/A", `review_creation_date` is parsed to datetime and
    `review_answer_timestamp` is reduced to its calendar date.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns `review_id`, `order_id`, `review_score`,
        `review_comment_title`, `review_comment_message`,
        `review_creation_date` and `review_answer_timestamp`.

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per order.
    """
    df = df.copy()

    # Step 1: Strip leading/trailing whitespace from all text columns.
    # `read_csv` produces `object` columns by default; selecting only the
    # 'string' extension dtype would match none of them and skip this step.
    text_columns = df.select_dtypes(include=['object', 'string']).columns
    for column in text_columns:
        df[column] = df[column].str.strip()

    # Step 2: Remove duplicates in `order_id` if any, keeping the first occurrence.
    # NOTE(review): if one order can carry several reviews, this keeps only
    # the first one in file order - confirm that is the intended rule.
    df = df.drop_duplicates(subset='order_id')

    # Step 3: Replace NULL values with the string "N/A" in the text columns
    # only; a text placeholder in the score or date columns would make the
    # conversions below raise.
    df = df.fillna({
        'review_id': 'N/A',
        'order_id': 'N/A',
        'review_comment_title': 'N/A',
        'review_comment_message': 'N/A',
    })

    # Step 4: Convert columns to appropriate data types
    df = df.astype({
        'review_id': 'string',
        'order_id': 'string',
        'review_score': 'int64',
        'review_comment_title': 'string',
        'review_comment_message': 'string',
    })
    return df.assign(
        review_creation_date=pd.to_datetime(df['review_creation_date']),
        review_answer_timestamp=pd.to_datetime(df['review_answer_timestamp']).dt.date,
    )

# Run the order-reviews cleaning routine on the freshly loaded frame
cleaned_data = clean_data(df=data)

# Persist the result as CSV, leaving the row index out of the file
cleaned_file_path = '/Users/shawnwee/teams notes_Generation SCTP JDE 05/Week 8 Final Project/Cleaned Olist Data/cleaned_olist_order_reviews_dataset.csv'
cleaned_data.to_csv(path_or_buf=cleaned_file_path, index=False)

print(f"Cleaned dataset saved to {cleaned_file_path}")

Cleaned dataset saved to /Users/shawnwee/teams notes_Generation SCTP JDE 05/Week 8 Final Project/Cleaned Olist Data/cleaned_olist_order_reviews_dataset.csv


In [17]:
import pandas as pd
from decimal import Decimal

# Read the raw Olist orders extract into memory
file_path = '/Users/shawnwee/teams notes_Generation SCTP JDE 05/Week 8 Final Project/Olist Data/olist_orders_dataset.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Clean the data
def clean_data(df, customers=None):
    """Return a cleaned copy of the raw Olist orders table.

    The input frame is left untouched. Identifier and status columns are
    trimmed, the purchase/approval/carrier/customer-delivery timestamps are
    floored to the hour, only the first row of each `order_id` is kept,
    missing text values become the literal string "N/A", and `customer_id`
    is replaced by the matching `customer_unique_id`. Missing timestamps stay
    NaT so the date columns keep their datetime dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns `order_id`, `customer_id`, `order_status`,
        `order_purchase_timestamp`, `order_approved_at`,
        `order_delivered_carrier_date`, `order_delivered_customer_date` and
        `order_estimated_delivery_date`.
    customers : pandas.DataFrame, optional
        Lookup frame with `customer_id` and `customer_unique_id` columns.
        When omitted, the raw customers CSV is read from its fixed location.

    Returns
    -------
    pandas.DataFrame
        A new frame carrying `customer_unique_id` instead of `customer_id`.
    """
    # Step 1: Convert columns to appropriate data types (`astype` returns a
    # new frame, so the caller's data is never modified).
    df = df.astype({
        'order_id': 'string',
        'customer_id': 'string',
        'order_status': 'string',
    })
    hourly_columns = [
        'order_purchase_timestamp',
        'order_approved_at',
        'order_delivered_carrier_date',
        'order_delivered_customer_date',
    ]
    df = df.assign(
        **{column: pd.to_datetime(df[column]).dt.floor('h') for column in hourly_columns}
    )
    df = df.assign(
        order_estimated_delivery_date=pd.to_datetime(df['order_estimated_delivery_date'])
    )

    # Step 2: Strip leading/trailing whitespace from all string columns.
    # Done before de-duplication so ids differing only by padding collapse.
    text_columns = ['order_id', 'customer_id', 'order_status']
    df = df.assign(**{column: df[column].str.strip() for column in text_columns})

    # Step 3: Remove duplicates in `order_id` if any, keeping the first occurrence
    df = df.drop_duplicates(subset='order_id')

    # Step 4: Replace NULL values with the string "N/A" in the text columns
    # only; filling the datetime columns with text would turn them into
    # mixed object columns.
    df = df.fillna({column: 'N/A' for column in text_columns})

    # Step 5: Convert 'order_status' to category
    df = df.astype({'order_status': 'category'})

    # Step 6: Add customer_unique_id column to orders dataset, then drop the customer_id column
    if customers is None:
        customers = pd.read_csv('/Users/shawnwee/teams notes_Generation SCTP JDE 05/Week 8 Final Project/Olist Data/olist_customers_dataset.csv')
    df = df.merge(customers[['customer_id', 'customer_unique_id']], on='customer_id', how='left')
    return df.drop(columns=['customer_id'])

# Run the orders cleaning routine on the freshly loaded frame
cleaned_data = clean_data(df=data)

# Persist the result as CSV, leaving the row index out of the file
cleaned_file_path = '/Users/shawnwee/teams notes_Generation SCTP JDE 05/Week 8 Final Project/Cleaned Olist Data/cleaned_olist_orders_dataset.csv'
cleaned_data.to_csv(path_or_buf=cleaned_file_path, index=False)

print(f"Cleaned dataset saved to {cleaned_file_path}")

Cleaned dataset saved to /Users/shawnwee/teams notes_Generation SCTP JDE 05/Week 8 Final Project/Cleaned Olist Data/cleaned_olist_orders_dataset.csv


In [18]:
import pandas as pd
import numpy as np
from decimal import Decimal

# Read the raw Olist products extract into memory
file_path = '/Users/shawnwee/teams notes_Generation SCTP JDE 05/Week 8 Final Project/Olist Data/olist_products_dataset.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Clean the data
def clean_data(df, translation_df=None):
    """Return a cleaned copy of the raw Olist products table.

    The frame passed in is the one that gets cleaned (it is not re-read from
    disk) and it is left untouched. Portuguese category names are replaced by
    their English translation, text columns are trimmed, only the first row
    of each `product_id` is kept, missing text values become the literal
    string "N/A", missing or infinite measurements become 0, and the two
    misspelled `*_lenght` columns are renamed to `*_length`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns `product_id`, `product_category_name`,
        `product_name_lenght`, `product_description_lenght`,
        `product_photos_qty`, `product_weight_g`, `product_length_cm`,
        `product_height_cm` and `product_width_cm`.
    translation_df : pandas.DataFrame, optional
        Lookup frame with `product_category_name` and
        `product_category_name_english` columns. When omitted, the
        translation CSV is read from its fixed location.

    Returns
    -------
    pandas.DataFrame
        A new frame with `product_category_name_english_merged` as its
        second column.
    """
    # Step 1: Translate Portuguese category names to English
    if translation_df is None:
        translation_path = '/Users/shawnwee/teams notes_Generation SCTP JDE 05/Week 8 Final Project/Olist Data/product_category_name_translation.csv'
        translation_df = pd.read_csv(translation_path)

    # `merge` returns a new frame, so the caller's data is never modified.
    merged_df = df.merge(translation_df, on='product_category_name', how='left')
    # Place the English name right after `product_id`, then drop both originals.
    merged_df.insert(1, 'product_category_name_english_merged', merged_df['product_category_name_english'])
    df = merged_df.drop(columns=['product_category_name', 'product_category_name_english'])

    # Step 2: Strip leading/trailing whitespace from the text columns.
    # Done before de-duplication so ids differing only by padding collapse.
    str_columns = ['product_id', 'product_category_name_english_merged']
    for column in str_columns:
        df[column] = df[column].str.strip()

    # Step 3: Remove duplicates in `product_id` if any, keeping the first occurrence
    df = df.drop_duplicates(subset='product_id')

    # Step 4: Replace NULL values with the string "N/A" in the string columns
    df = df.fillna({column: 'N/A' for column in str_columns})

    # Step 5: Replace NaN or inf with 0 in the integer columns, then cast them
    int_columns = ['product_name_lenght', 'product_description_lenght', 'product_photos_qty',
                   'product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm']
    whole_numbers = df[int_columns].replace([np.nan, np.inf, -np.inf], 0).astype('int64')
    df = df.assign(**{column: whole_numbers[column] for column in int_columns})

    # Step 6: Convert the remaining columns to appropriate data types
    df = df.astype({
        'product_id': 'string',
        'product_category_name_english_merged': 'category',
    })

    # Step 7: Rename columns 'product_name_lenght' and 'product_description_lenght' to the correct spelling of 'length'
    return df.rename(columns={
        'product_name_lenght': 'product_name_length',
        'product_description_lenght': 'product_description_length',
    })

# Run the products cleaning routine on the freshly loaded frame
cleaned_data = clean_data(df=data)

# Persist the result as CSV, leaving the row index out of the file
cleaned_file_path = '/Users/shawnwee/teams notes_Generation SCTP JDE 05/Week 8 Final Project/Cleaned Olist Data/cleaned_olist_products_dataset.csv'
cleaned_data.to_csv(path_or_buf=cleaned_file_path, index=False)

print(f"Cleaned dataset saved to {cleaned_file_path}")

Cleaned dataset saved to /Users/shawnwee/teams notes_Generation SCTP JDE 05/Week 8 Final Project/Cleaned Olist Data/cleaned_olist_products_dataset.csv


In [19]:
import pandas as pd
from decimal import Decimal

# Read the raw Olist sellers extract into memory
file_path = '/Users/shawnwee/teams notes_Generation SCTP JDE 05/Week 8 Final Project/Olist Data/olist_sellers_dataset.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Clean the data
def clean_data(df):
    """Return a cleaned copy of the raw Olist sellers table.

    The input frame is left untouched. Text columns are trimmed, only the
    first row of each `seller_id` is kept, missing values become the literal
    string "N/A", and zip code prefixes are left-padded to five characters.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns `seller_id`, `seller_zip_code_prefix`,
        `seller_city` and `seller_state`.

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per seller.
    """
    df = df.copy()

    # Step 1: Strip leading/trailing whitespace from all text columns.
    # `read_csv` produces `object` columns by default; selecting only the
    # 'string' extension dtype would match none of them and skip this step.
    # Done before de-duplication so ids differing only by padding collapse.
    text_columns = df.select_dtypes(include=['object', 'string']).columns
    for column in text_columns:
        df[column] = df[column].str.strip()

    # Step 2: Remove duplicates in `seller_id` if any, keeping the first occurrence
    df = df.drop_duplicates(subset='seller_id')

    # Step 3: Replace NULL values with the string "N/A"
    df = df.fillna('N/A')

    # Step 4: Convert columns to appropriate data types
    df = df.astype({
        'seller_id': 'string',
        'seller_zip_code_prefix': 'string',
        'seller_city': 'category',
        'seller_state': 'category',
    })

    # Step 5: Normalize `seller_zip_code_prefix` to ensure all are 5 characters
    return df.assign(seller_zip_code_prefix=df['seller_zip_code_prefix'].str.zfill(5))

# Run the sellers cleaning routine on the freshly loaded frame
cleaned_data = clean_data(df=data)

# Persist the result as CSV, leaving the row index out of the file
cleaned_file_path = '/Users/shawnwee/teams notes_Generation SCTP JDE 05/Week 8 Final Project/Cleaned Olist Data/cleaned_olist_sellers_dataset.csv'
cleaned_data.to_csv(path_or_buf=cleaned_file_path, index=False)

print(f"Cleaned dataset saved to {cleaned_file_path}")

Cleaned dataset saved to /Users/shawnwee/teams notes_Generation SCTP JDE 05/Week 8 Final Project/Cleaned Olist Data/cleaned_olist_sellers_dataset.csv


In [20]:
# Connection to and creation of tables in PostgreSQL using Python

import os

from sqlalchemy import create_engine
from sqlalchemy.engine import URL
import pandas as pd
import psycopg2  # not referenced below; kept so a missing PostgreSQL driver fails at import time

# Connection settings. Each value can be overridden through an environment
# variable so real credentials never have to be written into the notebook;
# the fallbacks are the previous hard-coded values.
db_username = os.environ.get('OLIST_DB_USER', 'postgres')
db_password = os.environ.get('OLIST_DB_PASSWORD', 'password')
db_host = os.environ.get('OLIST_DB_HOST', 'localhost')
db_port = int(os.environ.get('OLIST_DB_PORT', 5432))
db_name = os.environ.get('OLIST_DB_NAME', 'OlistDatabase')

# Build the URL from its parts so special characters in the password
# (e.g. '@' or '/') are escaped instead of corrupting the connection string.
connection_url = URL.create(
    drivername='postgresql',
    username=db_username,
    password=db_password,
    host=db_host,
    port=db_port,
    database=db_name,
)

# Create SQLAlchemy engine and verify that the database is reachable
try:
    engine = create_engine(connection_url)
    # The context manager closes the probe connection even if an error occurs.
    with engine.connect():
        print("Connection successful.")
except Exception as e:
    print(f"Error: {e}")
    # Re-raise to stop the cell: calling exit() would shut the kernel down
    # and discard the traceback.
    raise

# Load DataFrames from the cleaned CSV files
cleaned_data_dir = '/Users/shawnwee/teams notes_Generation SCTP JDE 05/Week 8 Final Project/Cleaned Olist Data'


def _read_cleaned(table, **read_options):
    """Read `cleaned_olist_<table>_dataset.csv` from the cleaned-data folder."""
    return pd.read_csv(f'{cleaned_data_dir}/cleaned_olist_{table}_dataset.csv', **read_options)


try:
    # Zip code prefixes are read as text: letting pandas infer integers would
    # drop the leading zeros added during cleaning. Timestamp columns are
    # parsed so they are not stored as plain text in PostgreSQL.
    # Note: with pandas' default NA handling, "N/A" cells are read back as
    # missing values and therefore end up as NULL in the database.
    customers = _read_cleaned('customers', dtype={'customer_zip_code_prefix': str})
    geolocation = _read_cleaned('geolocation', dtype={'geolocation_zip_code_prefix': str})
    order_items = _read_cleaned('order_items', parse_dates=['shipping_limit_date'])
    order_payments = _read_cleaned('order_payments')
    order_reviews = _read_cleaned(
        'order_reviews',
        parse_dates=['review_creation_date', 'review_answer_timestamp'],
    )
    orders = _read_cleaned(
        'orders',
        parse_dates=[
            'order_purchase_timestamp',
            'order_approved_at',
            'order_delivered_carrier_date',
            'order_delivered_customer_date',
            'order_estimated_delivery_date',
        ],
    )
    products = _read_cleaned('products')
    sellers = _read_cleaned('sellers', dtype={'seller_zip_code_prefix': str})
    print("DataFrames loaded successfully.")
except Exception as e:
    print(f"Error loading CSV files: {e}")
    raise  # Stop the cell if data loading fails

# Write DataFrames to PostgreSQL (each table is replaced if it already exists)
tables = {
    'customers': customers,
    'geolocation': geolocation,
    'order_items': order_items,
    'order_payments': order_payments,
    'order_reviews': order_reviews,
    'orders': orders,
    'products': products,
    'sellers': sellers,
}
try:
    for table_name, frame in tables.items():
        frame.to_sql(table_name, engine, if_exists='replace', index=False)
    print("DataFrames written to PostgreSQL successfully.")
except Exception as e:
    print(f"Error writing to PostgreSQL: {e}")
    # Surface the failure instead of leaving a partially loaded database unnoticed.
    raise

Connection successful.
DataFrames loaded successfully.
DataFrames written to PostgreSQL successfully.
