In [6]:
from mods import DatabaseConnector 
import mysql.connector
import os
import pandas as pd

In [7]:
# Create a new instance of DatabaseConnector
myDB = DatabaseConnector('localhost','root','root','final_project')
directory = r"C:\Users\marno\Wiley Edge\Final_Project\Code\data"

In [8]:
# Create a dictionary of table names and file paths
def get_csv_files(directory: str) -> dict[str, str]:
    tables = {}
    # For each file in a directory, add the file name (excluding the extension) and the file path to the dictionary
    for file in os.listdir(directory):
        if file.endswith(".csv"):
            table_name = file.split(".")[0]
            file_path = os.path.join(directory, file).replace("\\", "/")
            tables[table_name] = file_path
    return tables


# Function to populate each table in the database by using the csv files in the directory.
def load_data_to_table(csv_file_path, table_name, myDB):
    try:
        # Connect to the database
        myDB.connect()
                
        # Build the SQL query
        query = f'LOAD DATA LOCAL INFILE "{csv_file_path}" INTO TABLE {table_name} FIELDS TERMINATED BY "," ENCLOSED BY \'"\' IGNORE 1 ROWS'

        # check if table is already populated
        update_query = f"SELECT COUNT(*) FROM {table_name}"
        result = myDB.execute_query(update_query)
        
        # If the table is not populated, load the data
        if result[0][0] == 0:
            myDB.update_query(query)
            print(f"Data from {csv_file_path} loaded into {table_name} successfully.")
        else:
            print(f"Table {table_name} is already populated.")

        print(f"Data from {csv_file_path} loaded into {table_name} successfully.")
    except Exception as e:
        print(f"Error loading data: {e}")
    finally:
        # Close the connection
        myDB.close_connection()
        
# Implement the function for each table to load all the data
def load_all_data():
    for table_name, file_path in get_csv_files(directory).items():
        load_data_to_table(file_path, table_name, myDB)
    
    
def import_data():
    for table_name in get_csv_files(directory).keys():
        myDB.import_data(table_name)

In [9]:
import_data()

Connected to database
Data from customers imported successfully.
Connected to database
Data from geolocation imported successfully.
Connected to database
Data from orders imported successfully.
Connected to database
Data from order_items imported successfully.
Connected to database
Data from order_payments imported successfully.
Connected to database
Data from order_reviews imported successfully.
Connected to database
Data from products imported successfully.
Connected to database
Data from product_category_translation imported successfully.
Connected to database
Data from sellers imported successfully.


In [10]:
def get_dataframes():
    return myDB.dataframes

def update_csv_files(dataframes_dict, directory_path):

    # Iterate through the dictionary
    for title, dataframe in dataframes_dict.items():
        # Construct the CSV file path
        csv_file_path = os.path.join(directory_path, f'{title}.csv')

        # Check if the CSV file already exists
        if os.path.isfile(csv_file_path):
            # If it exists, update the CSV file with the new dataframe
            dataframe.to_csv(csv_file_path, index=False)
            print(f'{title}.csv updated successfully.')
        else:
            # If it doesn't exist, create a new CSV file
            dataframe.to_csv(csv_file_path, index=False)
            print(f'{title}.csv created successfully.')
            
update_csv_files(get_dataframes(), directory)


customers.csv updated successfully.
geolocation.csv updated successfully.
orders.csv updated successfully.
order_items.csv updated successfully.
order_payments.csv updated successfully.
order_reviews.csv updated successfully.
products.csv updated successfully.
product_category_translation.csv updated successfully.
sellers.csv updated successfully.


In [12]:
import pandas as pd

# Replace the Portuguese category names with English names
def replace_category_names(products_df, translations_df):
    # Merge the Products table with the translations table based on the product category name
    merged_df = pd.merge(products_df, translations_df, on='product_category_name', how='left')

    # Replace the original category name column with the English names
    merged_df['product_category_name'] = merged_df['english_category_name']

    # Drop the redundant translated column
    merged_df = merged_df.drop('english_category_name', axis=1)

    return merged_df

# Call the function to replace category names
updated_products_df = replace_category_names(myDB.dataframes['products'], myDB.dataframes['product_category_translation'])

# remove carriage returns from the product_category_name column
updated_products_df['product_category_name'] = updated_products_df['product_category_name'].str.replace('\r', '')

# replace the product dataframe in the dictionary with the updated dataframe
myDB.dataframes['products'] = updated_products_df




In [13]:
# Update the products csv file with the new dataframe

update_csv_files(get_dataframes(), directory)



customers.csv updated successfully.
geolocation.csv updated successfully.
orders.csv updated successfully.
order_items.csv updated successfully.
order_payments.csv updated successfully.
order_reviews.csv updated successfully.
products.csv updated successfully.
product_category_translation.csv updated successfully.
sellers.csv updated successfully.


In [14]:
# add a new column in geolocations table to store the full state name instead of the abbreviation
# get dictionary of state abbreviations and full names
def get_state_dict():
    return {
        'AC': 'Acre',
        'AL': 'Alagoas',
        'AP': 'Amapá',
        'AM': 'Amazonas',
        'BA': 'Bahia',
        'CE': 'Ceará',
        'DF': 'Distrito Federal',
        'ES': 'Espírito Santo',
        'GO': 'Goiás',
        'MA': 'Maranhão',
        'MT': 'Mato Grosso',
        'MS': 'Mato Grosso do Sul',
        'MG': 'Minas Gerais',
        'PA': 'Pará',
        'PB': 'Paraíba',
        'PR': 'Paraná',
        'PE': 'Pernambuco',
        'PI': 'Piauí',
        'RJ': 'Rio de Janeiro',
        'RN': 'Rio Grande do Norte',
        'RS': 'Rio Grande do Sul',
        'RO': 'Rondônia',
        'RR': 'Roraima',
        'SC': 'Santa Catarina',
        'SP': 'São Paulo',
        'SE': 'Sergipe',
        'TO': 'Tocantins'
    }
    
# Create a new column in the geolocation dataframe to store the full state name
def add_state_name(geolocation_df):
    # Get the state dictionary
    state_dict = get_state_dict()

    # Create a new column in the dataframe to store the full state name
    geolocation_df['state_name'] = geolocation_df['state'].map(state_dict)

    return geolocation_df

# Call the function to add the state name column
updated_geolocation_df = add_state_name(myDB.dataframes['geolocation'])

# update the csv file with the new dataframe
myDB.dataframes['geolocation'] = updated_geolocation_df

update_csv_files(get_dataframes(), directory)



customers.csv updated successfully.
geolocation.csv updated successfully.
orders.csv updated successfully.
order_items.csv updated successfully.
order_payments.csv updated successfully.
order_reviews.csv updated successfully.
products.csv updated successfully.
product_category_translation.csv updated successfully.
sellers.csv updated successfully.


In [15]:
# add a new column to the orders table to see the time taken between order purchase timestamp and order delivered
def add_delivery_time(orders_df):
    # Convert the purchase and delivery timestamps to datetime
    orders_df['order_purchase_timestamp'] = pd.to_datetime(orders_df['order_purchase_timestamp'])
    orders_df['order_delivered_customer_date'] = pd.to_datetime(orders_df['order_delivered_customer_date'])

    # Calculate the time taken to deliver the order
    orders_df['delivery_time'] = orders_df['order_delivered_customer_date'] - orders_df['order_purchase_timestamp']

    # Convert the time taken to days
    orders_df['delivery_time'] = orders_df['delivery_time'].dt.days

    return orders_df

# Call the function to add the delivery time column
updated_orders_df = add_delivery_time(myDB.dataframes['orders'])


# update the csv file with the new dataframe
myDB.dataframes['orders'] = updated_orders_df


# count null values in in delivery_time column
myDB.dataframes['orders']['delivery_time'].isnull().sum()
# remove rows with null values in delivery_time column
myDB.dataframes['orders'] = myDB.dataframes['orders'][myDB.dataframes['orders']['delivery_time'].notna()]
# make delivery_time column an integer
myDB.dataframes['orders']['delivery_time'] = myDB.dataframes['orders']['delivery_time'].astype(int)
# update the csv file with the new dataframe
update_csv_files(get_dataframes(), directory)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  myDB.dataframes['orders']['delivery_time'] = myDB.dataframes['orders']['delivery_time'].astype(int)


customers.csv updated successfully.
geolocation.csv updated successfully.
orders.csv updated successfully.
order_items.csv updated successfully.
order_payments.csv updated successfully.
order_reviews.csv updated successfully.
products.csv updated successfully.
product_category_translation.csv updated successfully.
sellers.csv updated successfully.
