In [2]:
import pandas as pd
import mysql.connector
import os

# List of CSV files and their corresponding table names
csv_files = [
    ('customers.csv', 'customers'),
    ('orders.csv', 'orders'),
    ('sellers.csv', 'sellers'),
    ('products.csv', 'products'),
    ('geolocation.csv', 'geolocation'),
    ('payments.csv', 'payments'),
    ('order_items.csv', 'order_items') # Added payments.csv for specific handling
]

# Connect to the MySQL database
conn = mysql.connector.connect(
    host='localhost',
    user='root',
    password='shivam@123',
    database='ecommerce'
)
cursor = conn.cursor()

# Folder containing the CSV files
folder_path = 'C:/Users/ASUS/OneDrive/Desktop/Ecommerce'

def get_sql_type(dtype):
    if pd.api.types.is_integer_dtype(dtype):
        return 'INT'
    elif pd.api.types.is_float_dtype(dtype):
        return 'FLOAT'
    elif pd.api.types.is_bool_dtype(dtype):
        return 'BOOLEAN'
    elif pd.api.types.is_datetime64_any_dtype(dtype):
        return 'DATETIME'
    else:
        return 'TEXT'

for csv_file, table_name in csv_files:
    file_path = os.path.join(folder_path, csv_file)
    
    # Read the CSV file into a pandas DataFrame
    df = pd.read_csv(file_path)
    
    # Replace NaN with None to handle SQL NULL
    df = df.where(pd.notnull(df), None)
    
    # Debugging: Check for NaN values
    print(f"Processing {csv_file}")
    print(f"NaN values before replacement:\n{df.isnull().sum()}\n")

    # Clean column names
    df.columns = [col.replace(' ', '_').replace('-', '_').replace('.', '_') for col in df.columns]

    # Generate the CREATE TABLE statement with appropriate data types
    columns = ', '.join([f'`{col}` {get_sql_type(df[col].dtype)}' for col in df.columns])
    create_table_query = f'CREATE TABLE IF NOT EXISTS `{table_name}` ({columns})'
    cursor.execute(create_table_query)

    # Insert DataFrame data into the MySQL table
    for _, row in df.iterrows():
        # Convert row to tuple and handle NaN/None explicitly
        values = tuple(None if pd.isna(x) else x for x in row)
        sql = f"INSERT INTO `{table_name}` ({', '.join(['`' + col + '`' for col in df.columns])}) VALUES ({', '.join(['%s'] * len(row))})"
        cursor.execute(sql, values)

    # Commit the transaction for the current CSV file
    conn.commit()

# Close the connection
conn.close()

Processing customers.csv
NaN values before replacement:
customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64

Processing orders.csv
NaN values before replacement:
order_id                            0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965
order_estimated_delivery_date       0
dtype: int64

Processing sellers.csv
NaN values before replacement:
seller_id                 0
seller_zip_code_prefix    0
seller_city               0
seller_state              0
dtype: int64

Processing products.csv
NaN values before replacement:
product_id                      0
product category              610
product_name_length           610
product_description_length    610
product_photos_qty            610
prod

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import mysql.connector

db = mysql.connector.connect(
    host = "localhost",
    username= "root",
    password = "shivam@123",
    database = "ecommerce")

cur = db.cursor()

# List all unique cities where customers are located.

In [20]:
query = """select distinct customer_city from customers """

cur.execute(query)

data = cur.fetchall()

df = pd.DataFrame(data, columns = ["City"])
df

Unnamed: 0,City
0,franca
1,sao bernardo do campo
2,sao paulo
3,mogi das cruzes
4,campinas
...,...
4114,siriji
4115,natividade da serra
4116,monte bonito
4117,sao rafael


# Count the number of orders placed in 2017.

In [19]:
query = """ select count(order_id) from orders where year(order_purchase_timestamp) = 2017 """

cur.execute(query)
data = cur.fetchall()

"total orders placed in 2017 are", data[0][0]

('total orders placed in 2017 are', 45101)

# Find the total sales per category.

In [39]:
query = """ select upper(products.product_category) category, round(sum(payments.payment_value),2) sales 
from products join order_items on products.product_id = order_items.product_id
join payments on payments.order_id = order_items.order_id group by category """

cur.execute(query)

data = cur.fetchall()

df = pd.DataFrame(data, columns = ["Category", "sales"])
df.head(10)

Unnamed: 0,Category,sales
0,PERFUMERY,506738.66
1,FURNITURE DECORATION,1430176.39
2,TELEPHONY,486882.05
3,BED TABLE BATH,1712553.67
4,AUTOMOTIVE,852294.33
5,COMPUTER ACCESSORIES,1585330.45
6,HOUSEWARES,1094758.13
7,BABIES,539845.66
8,TOYS,619037.69
9,FURNITURE OFFICE,646826.49


# Calculate the percentage of orders that were paid in installments.

In [28]:
query = """ select count(case when payment_installments >= 1 then 1 end) * 100/count(*)
as payment_in_installment from payments; """

cur.execute(query)
data = cur.fetchall()

"Percentage of orders that were paid in installment is", data[0][0]

('Percentage of orders that were paid in installment is', Decimal('99.9981'))

# Count the number of customers from each state.

In [38]:
query = """ select customer_state, count(customer_state) from customers group by customer_state """

cur.execute(query)
data = cur.fetchall()

df = pd.DataFrame(data, columns = ["State", "Customer_count"])
df.head(10)

Unnamed: 0,State,Customer_count
0,SP,41746
1,SC,3637
2,MG,11635
3,PR,5045
4,RJ,12852
5,RS,5466
6,PA,975
7,GO,2020
8,ES,2033
9,BA,3380


# Calculate the number of orders per month in 2018.

In [36]:
query = """ select month(order_purchase_timestamp) as months, count(*) from orders
where year(order_purchase_timestamp) = 2018 group by month(order_purchase_timestamp) order by months; """

cur.execute(query)
data = cur.fetchall()

df = pd.DataFrame(data, columns = ["Month", "total_orders"])
df

Unnamed: 0,Month,total_orders
0,1,7269
1,2,6728
2,3,7211
3,4,6939
4,5,6873
5,6,6167
6,7,6292
7,8,6512
8,9,16
9,10,4


# Retrieve the first 10 orders placed by customers in 2017.

In [37]:
query = """ select order_id, order_purchase_timestamp from orders where year(order_purchase_timestamp) = 2017
order by order_purchase_timestamp limit 10; """

cur.execute(query)
data = cur.fetchall()

df = pd.DataFrame(data, columns = ["Order_id", "Date&Time"])
df

Unnamed: 0,Order_id,Date&Time
0,ec7a019261fce44180373d45b442d78f,2017-01-05 11:56:06
1,b95a0a8bd30aece4e94e81f0591249d8,2017-01-05 12:01:20
2,38bcb524e1c38c2c1b60600a80fc8999,2017-01-05 12:06:36
3,7a18a504c1a4b32d883e68de2e1a7db0,2017-01-05 12:09:08
4,6acecf438369055d9243e121045cca74,2017-01-05 12:11:23
5,34bf4feda1e203af64692d97c6950c39,2017-01-05 12:14:58
6,40599d3d28b75746952ded75566637b9,2017-01-05 13:01:48
7,ce86fa5a5108884726a2244bcae51ae6,2017-01-05 13:29:03
8,7004296aa0256632eaddc171edaf727f,2017-01-05 13:31:22
9,cce1b8a1c5f8b1d224e19628299c8f54,2017-01-05 13:33:45


# Find the top 5 cities with the most customers.

In [40]:
query = """ select customer_city, count(*) as customer_count from customers 
group by customer_city order by customer_count desc limit 5; """

cur.execute(query)
data = cur.fetchall()

df = pd.DataFrame(data, columns = ["City", "Customer_count"])
df

Unnamed: 0,City,Customer_count
0,sao paulo,15540
1,rio de janeiro,6882
2,belo horizonte,2773
3,brasilia,2131
4,curitiba,1521


# Count the number of orders for each order status.

In [42]:
query = """ select order_status, count(*) as order_count from orders
group by order_status; """

cur.execute(query)
data = cur.fetchall()

df = pd.DataFrame(data, columns = ["Order_status", "Order_count"])
df

Unnamed: 0,Order_status,Order_count
0,delivered,96478
1,invoiced,314
2,shipped,1107
3,processing,301
4,unavailable,609
5,canceled,625
6,created,5
7,approved,2


# Find the highest order value (sum of price + freight_value) for a single order.


In [45]:
query = """ select max(price + freight_value) as order_value from order_items; """

cur.execute(query)
data = cur.fetchall()

"highest Order Value is", data[0][0]

('highest Order Value is', 6929.309997558594)

# Find the total revenue generated from each payment type.


In [49]:
query = """ select payment_type, sum(payment_value) as revenue from payments group by payment_type; """

cur.execute(query)
data = cur.fetchall()

df = pd.DataFrame(data, columns = ["Payment_type", "Revenue"])
pd.set_option('display.float_format', '{:,.6f}'.format)
df

Unnamed: 0,Payment_type,Revenue
0,credit_card,12542084.190177
1,UPI,2869361.269494
2,voucher,379436.870396
3,debit_card,217989.789988
4,not_defined,0.0


# Find the total profit generated from each payment type.

In [50]:
query = """ select p.payment_type, sum(p.payment_value)-sum(o.price + o.freight_value) as profit
from payments as p inner join order_items as o on p.order_id = o.order_id 
group by p.payment_type; """

cur.execute(query)
data = cur.fetchall()

df = pd.DataFrame(data, columns = ["Payment_type", "Profit"])
pd.set_option('display.float_format', '{:,.6f}'.format)
df

Unnamed: 0,Payment_type,Profit
0,credit_card,2865113.339049
1,voucher,-379208.910519
2,UPI,1217459.304107
3,debit_card,38227.119943


# Find the total number of products sold in each city.

In [53]:
query = """ select temp2.customer_city, count(p.product_id) as count from products as p
inner join (select temp.*, product_id from order_items as ot 
inner join (select c.*, o.order_id from customers as c 
inner join orders as o on c.customer_id = o.customer_id) as temp
on ot.order_id = temp.order_id) as temp2 on p.product_id = temp2.product_id
group by temp2.customer_city order by count desc; """

cur.execute(query)
data = cur.fetchall()

df = pd.DataFrame(data, columns = ["Customer_city", "No_of_Product"])
df.head(10)

Unnamed: 0,Customer_city,No_of_Product
0,sao paulo,17808
1,rio de janeiro,7837
2,belo horizonte,3144
3,brasilia,2392
4,curitiba,1751
5,campinas,1654
6,porto alegre,1612
7,salvador,1412
8,guarulhos,1329
9,sao bernardo do campo,1060


# Find the customers who have not placed any orders.

In [55]:
query = """ select c.customer_id, o.order_id from customers as c 
left join orders as o on c.customer_id = o.customer_id
where o.order_id is NULL; """

cur.execute(query)
data = cur.fetchall()

df = pd.DataFrame(data, columns = ["Customer_id", "Order_id"])
df

Unnamed: 0,Customer_id,Order_id
0,06b8999e3fba9a1fbc88172z30ba8ba7,
1,06b8999e4aba9a1cbc88172z50ba8ba7,
2,06b8999e3fba9f1fbc88172z00ba8ba7,
3,06b8999e2fba9a1fbc88172z00ba8ba7,
4,06b8999e2fba9a4fbc88172z60ba8ba7,


# Find the product category with the most products and display the product_category along with the count.

In [None]:
query = """ select product_category, count(*) as count from products
group by product_category order by count desc; """

cur.execute(query)
data = cur.fetchall()

df = pd.DataFrame(data, columns = ["Product_category", "Product_count"])
df.head(10)

# Find the 2nd highest payment value using subquery.

In [None]:
query = """ select  max(payment_value) from payments as p1 where payment_value
not in(select max(payment_value) from payments); """

cur.execute(query)
data = cur.fetchall()

"Second highest payment_value is:", data[0][0]