# **DATA CLEANING**

In [116]:
import numpy as np
import pandas as pd

In [117]:
df = pd.read_csv('Ecommerce.csv')

In [118]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115609 entries, 0 to 115608
Data columns (total 40 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   customer_id                    115609 non-null  object 
 1   order_id                       115609 non-null  object 
 2   order_item_id                  115609 non-null  int64  
 3   customer_unique_id             115609 non-null  object 
 4   customer_state                 115609 non-null  object 
 5   customer_city                  115609 non-null  object 
 6   product_id                     115609 non-null  object 
 7   product_category_name          115609 non-null  object 
 8   price                          115609 non-null  float64
 9   product_name_lenght            115609 non-null  float64
 10  product_description_lenght     115609 non-null  float64
 11  product_photos_qty             115609 non-null  float64
 12  product_weight_g              

In [119]:
# drop unnecessary columns
df.drop(columns=["customer_id", "product_category_name", "review_id", "review_comment_title", "review_comment_message"
,"review_creation_date", "review_answer_timestamp", "payment_installments", "shipping_limit_date", "seller_zip_code_prefix"
,"customer_zip_code_prefix", "product_name_lenght", "product_description_lenght", "product_photos_qty", "seller_city"
, "customer_city", "order_approved_at", "order_delivered_carrier_date", "order_estimated_delivery_date"],inplace=True)

In [120]:
# rename columns
df.rename(columns={"customer_unique_id" : "customer_id",
"freight value" : "delivery_charge",
"product_category_name_english" : "category",
"review_score" : "rating",
"order_purchase_timestamp" : "order_placed",
"order_delivered_customer_date" : "order_delivered"},inplace=True)

In [121]:
#check
df.columns

Index(['order_id', 'order_item_id', 'customer_id', 'customer_state',
       'product_id', 'price', 'product_weight_g', 'product_length_cm',
       'product_height_cm', 'product_width_cm', 'category', 'seller_id',
       'seller_state', 'rating', 'order_placed', 'order_delivered',
       'order_status', 'payment_type', 'payment_value', 'payment_sequential',
       'freight_value'],
      dtype='object')

In [122]:
# merge multiple product rows of a single order into one row
order_payment = (df.groupby('order_id', as_index=False)
              .agg({
                  'price': 'sum',               # sum of item prices
                  'payment_value': 'first',     # order-level value; same for all rows
                  'payment_type': lambda x: x.iloc[0] if len(set(x)) == 1 else 'multiple',
                  'payment_sequential': 'max'
              }))

In [123]:
# Remove duplicate order_id entries and left merge with order_payment to add payment details
df_final = df.drop_duplicates("order_id") \
             .merge(order_payment, on="order_id", how="left")

In [124]:
#check
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96516 entries, 0 to 96515
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   order_id              96516 non-null  object 
 1   order_item_id         96516 non-null  int64  
 2   customer_id           96516 non-null  object 
 3   customer_state        96516 non-null  object 
 4   product_id            96516 non-null  object 
 5   price_x               96516 non-null  float64
 6   product_weight_g      96515 non-null  float64
 7   product_length_cm     96515 non-null  float64
 8   product_height_cm     96515 non-null  float64
 9   product_width_cm      96515 non-null  float64
 10  category              96516 non-null  object 
 11  seller_id             96516 non-null  object 
 12  seller_state          96516 non-null  object 
 13  rating                96516 non-null  int64  
 14  order_placed          96516 non-null  object 
 15  order_delivered    

In [125]:
# take only max product number available in order_item_id for single order
# and rename the column as items_in_order
items_per_order = (
    df.groupby("order_id")
      .agg(items_in_order=("order_item_id", "max"))
      .reset_index()
)

In [126]:
# merge it with original data
df_final = df_final.merge(
    items_per_order,
    on="order_id",
    how="left"
)

In [127]:
#check
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96516 entries, 0 to 96515
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   order_id              96516 non-null  object 
 1   order_item_id         96516 non-null  int64  
 2   customer_id           96516 non-null  object 
 3   customer_state        96516 non-null  object 
 4   product_id            96516 non-null  object 
 5   price_x               96516 non-null  float64
 6   product_weight_g      96515 non-null  float64
 7   product_length_cm     96515 non-null  float64
 8   product_height_cm     96515 non-null  float64
 9   product_width_cm      96515 non-null  float64
 10  category              96516 non-null  object 
 11  seller_id             96516 non-null  object 
 12  seller_state          96516 non-null  object 
 13  rating                96516 non-null  int64  
 14  order_placed          96516 non-null  object 
 15  order_delivered    

In [128]:
#check columns
df_final.columns

Index(['order_id', 'order_item_id', 'customer_id', 'customer_state',
       'product_id', 'price_x', 'product_weight_g', 'product_length_cm',
       'product_height_cm', 'product_width_cm', 'category', 'seller_id',
       'seller_state', 'rating', 'order_placed', 'order_delivered',
       'order_status', 'payment_type_x', 'payment_value_x',
       'payment_sequential_x', 'freight_value', 'price_y', 'payment_value_y',
       'payment_type_y', 'payment_sequential_y', 'items_in_order'],
      dtype='object')

In [129]:
# drop extra columns
df_final.drop(columns=["payment_type_x","payment_sequential_x", 'payment_value_x','price_x','order_item_id'],inplace=True)

In [130]:
# ranme columns
df_final.rename(columns={"payment_type_y" : "payment_type",
                         "price_y":"price", "payment_value_y":"payment_value",
                         "payment_type_y":"payment_type"},inplace=True)

In [131]:
#check
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96516 entries, 0 to 96515
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   order_id              96516 non-null  object 
 1   customer_id           96516 non-null  object 
 2   customer_state        96516 non-null  object 
 3   product_id            96516 non-null  object 
 4   product_weight_g      96515 non-null  float64
 5   product_length_cm     96515 non-null  float64
 6   product_height_cm     96515 non-null  float64
 7   product_width_cm      96515 non-null  float64
 8   category              96516 non-null  object 
 9   seller_id             96516 non-null  object 
 10  seller_state          96516 non-null  object 
 11  rating                96516 non-null  int64  
 12  order_placed          96516 non-null  object 
 13  order_delivered       94486 non-null  object 
 14  order_status          96516 non-null  object 
 15  freight_value      

In [132]:
#change datatype
df_final["order_placed"] = pd.to_datetime(df_final["order_placed"])
df_final["order_delivered"] = pd.to_datetime(df_final["order_delivered"])

In [133]:
# find delivery time period
df_final["delivered_days"] = (df_final["order_delivered"]-df_final["order_placed"]).dt.days

In [134]:
#check
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96516 entries, 0 to 96515
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   order_id              96516 non-null  object        
 1   customer_id           96516 non-null  object        
 2   customer_state        96516 non-null  object        
 3   product_id            96516 non-null  object        
 4   product_weight_g      96515 non-null  float64       
 5   product_length_cm     96515 non-null  float64       
 6   product_height_cm     96515 non-null  float64       
 7   product_width_cm      96515 non-null  float64       
 8   category              96516 non-null  object        
 9   seller_id             96516 non-null  object        
 10  seller_state          96516 non-null  object        
 11  rating                96516 non-null  int64         
 12  order_placed          96516 non-null  datetime64[ns]
 13  order_delivered 

In [135]:
# find the product density via using product length,height,width,weight columns
df_final["product_density"] = df_final["product_weight_g"]/(df_final["product_length_cm"]*df_final["product_width_cm"]*df_final["product_height_cm"])

In [136]:
#drop extra columns
df_final.drop(columns=['order_placed', 'order_delivered','product_weight_g', 'product_length_cm', 'product_height_cm','product_width_cm'],inplace=True)

In [137]:
# merge  seller states into 'other' if they have lower values
state_counts = df_final["seller_state"].value_counts()
small_states = state_counts[state_counts < 1000].index

df_final["seller_state"] = df_final["seller_state"].replace(small_states, "other")

In [138]:
# merge  customer states into 'other' if they have lower values
state_counts = df_final["customer_state"].value_counts()
small_states = state_counts[state_counts < 1000].index

df_final["customer_state"] = df_final["customer_state"].replace(small_states, "other")

In [139]:
#check
df_final["seller_state"].value_counts()

Unnamed: 0_level_0,count
seller_state,Unnamed: 1_level_1
SP,68404
MG,7622
PR,7480
RJ,4199
SC,3577
other,3315
RS,1919


In [140]:
# categorized the categories
df_final['category'] = df_final['category'].replace({
    'housewares': 'home',
    'furniture_decor': 'home',
    'furniture_bedroom': 'home',
    'furniture_living_room': 'home',
    'furniture_mattress_and_upholstery': 'home',
    'office_furniture': 'home',
    'bed_bath_table': 'home',
    'la_cuisine': 'home',
    'home_confort': 'home',
    'home_comfort_2': 'home',
    'home_appliances': 'home',
    'home_appliances_2': 'home',
    'small_appliances': 'home',
    'small_appliances_home_oven_and_coffee': 'home',
    'kitchen_dining_laundry_garden_furniture': 'home',
    'construction_tools_construction': 'Construction',
    'construction_tools_lights': 'Construction',
    'construction_tools_safety': 'Construction',
    'costruction_tools_garden': 'Construction',
    'costruction_tools_tools': 'Construction',
    'home_construction': 'Construction',
    'garden_tools': 'Construction',
    'electronics': 'Electronics',
    'computers': 'Electronics',
    'computers_accessories': 'Electronics',
    'tablets_printing_image': 'Electronics',
    'telephony': 'Electronics',
    'fixed_telephony': 'Electronics',
    'air_conditioning': 'Electronics',
    'audio': 'Electronics',
    'cine_photo': 'Electronics',
    'consoles_games': 'Electronics',
    'fashion_male_clothing': 'Fashion',
    'fashio_female_clothing': 'Fashion',
    'fashion_childrens_clothes': 'Fashion',
    'fashion_underwear_beach': 'Fashion',
    'fashion_shoes': 'Fashion',
    'fashion_sport': 'Fashion',
    'fashion_bags_accessories': 'Fashion',
    'watches_gifts': 'Fashion',
    'health_beauty': 'Health',
    'perfumery': 'Health',
    'baby': 'Health',
    'diapers_and_hygiene': 'Health',
    'food': 'Food_Drink',
    'food_drink': 'Food_Drink',
    'drinks': 'Food_Drink',
    'books_general_interest': 'Book_Media',
    'books_technical': 'Book_Media',
    'books_imported': 'Book_Media',
    'cds_dvds_musicals': 'Book_Media',
    'dvds_blu_ray': 'Book_Media',
    'music': 'Book_Media',
    'toys': 'Toys',
    'sports_leisure': 'Toys',
    'cool_stuff': 'Toys',
    'party_supplies': 'Toys',
    'agro_industry_and_commerce': 'Industry',
    'industry_commerce_and_business': 'Industry',
    'market_place': 'Industry',
    'security_and_services': 'Industry',
    'signaling_and_security': 'Industry',
    'art': 'Art',
    'arts_and_craftmanship': 'Art',
    'flowers': 'Art',
    'christmas_supplies': 'Art',
    'musical_instruments': 'Art',
    'pet_shop': 'Pet',
    'auto': 'Automotive'
})


In [141]:
# merge different status into these - delivered, processing, canceled
df_final["order_status"] = df_final["order_status"].replace({
    'shipped': 'processing',
    'invoiced': 'processing',
    'unavailable': 'canceled',
    'approved': 'processing'
})

In [142]:
# merge boleto payment type into voucher payment type
df_final["payment_type"] = df_final["payment_type"].replace({
    'boleto' : 'voucher'
})

In [143]:
#check
df_final["payment_type"].value_counts()

Unnamed: 0_level_0,count
payment_type,Unnamed: 1_level_1
credit_card,72156
voucher,20704
multiple,2172
debit_card,1484


In [144]:
#check
df_final.columns

Index(['order_id', 'customer_id', 'customer_state', 'product_id', 'category',
       'seller_id', 'seller_state', 'rating', 'order_status', 'freight_value',
       'price', 'payment_value', 'payment_type', 'payment_sequential_y',
       'items_in_order', 'delivered_days', 'product_density'],
      dtype='object')

In [145]:
#rename columns
df_final.rename(columns={'freight_value':'delivery_charge','total_payment_value':'total_payment','total_transactions':'total_installment',
                         'delivered_days':'delivery_time','payment_sequential_y' : 'total_installment',
                         'payment_value':'total_payment'},inplace=True)

In [146]:
#check
df_final.columns

Index(['order_id', 'customer_id', 'customer_state', 'product_id', 'category',
       'seller_id', 'seller_state', 'rating', 'order_status',
       'delivery_charge', 'price', 'total_payment', 'payment_type',
       'total_installment', 'items_in_order', 'delivery_time',
       'product_density'],
      dtype='object')

In [147]:
# change the sequence of columns
new_order = [
    'order_id',
    'items_in_order',
    'customer_id',
    'customer_state',
    'product_id',
    'product_density',
    'price',
    'category',
    'seller_id',
    'seller_state',
    'rating',
    'delivery_time',
    'delivery_charge',
    'payment_type',
    'total_payment',
    'total_installment',
    'order_status'
]

df_final = df_final[new_order]


In [148]:
# check
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96516 entries, 0 to 96515
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   order_id           96516 non-null  object 
 1   items_in_order     96516 non-null  int64  
 2   customer_id        96516 non-null  object 
 3   customer_state     96516 non-null  object 
 4   product_id         96516 non-null  object 
 5   product_density    96515 non-null  float64
 6   price              96516 non-null  float64
 7   category           96516 non-null  object 
 8   seller_id          96516 non-null  object 
 9   seller_state       96516 non-null  object 
 10  rating             96516 non-null  int64  
 11  delivery_time      94486 non-null  float64
 12  delivery_charge    96516 non-null  float64
 13  payment_type       96516 non-null  object 
 14  total_payment      96516 non-null  float64
 15  total_installment  96516 non-null  int64  
 16  order_status       965

In [149]:
# check total installment = 0
df_final[df_final['total_installment'] == 0]

Unnamed: 0,order_id,items_in_order,customer_id,customer_state,product_id,product_density,price,category,seller_id,seller_state,rating,delivery_time,delivery_charge,payment_type,total_payment,total_installment,order_status


In [150]:
#  TOTAL PAYMENT
df_final['total_payment'].describe()

#check for 0
df_final[df_final['total_payment'] == 0]


Unnamed: 0,order_id,items_in_order,customer_id,customer_state,product_id,product_density,price,category,seller_id,seller_state,rating,delivery_time,delivery_charge,payment_type,total_payment,total_installment,order_status


In [151]:
# delivery charge = 0
df_final[df_final['delivery_charge'] == 0 ]



Unnamed: 0,order_id,items_in_order,customer_id,customer_state,product_id,product_density,price,category,seller_id,seller_state,rating,delivery_time,delivery_charge,payment_type,total_payment,total_installment,order_status
190,6d7de866a140b19d09e825b2a4e944c7,2,51cbfa44126505de7a55fb99ba49648b,SC,aca2eb7d00ea1a7b8ebd4e68314663af,0.173333,139.8,home,955fee9216a65b617aa5c0531780ce60,SP,2,7.0,0.0,credit_card,139.8,1,delivered
403,d62387acb3914789e712406e9882e549,1,1f7d9afd77ceacfa0b5b461a50818806,RJ,53b36df67ebb7c41585e8d54d6772e08,0.255245,99.9,Fashion,7d13fca15225358621be4086e1eb0964,SP,4,5.0,0.0,credit_card,99.9,1,delivered
626,f29d0fd6d4e6d5ce550e0b2f9335116c,1,c1dc1162de49817a25314db3030276f5,RS,aca2eb7d00ea1a7b8ebd4e68314663af,0.173333,69.9,home,955fee9216a65b617aa5c0531780ce60,SP,5,22.0,0.0,voucher,69.9,1,delivered
644,a2ea081c06c12c25c068089e4985745a,1,826303f32007dc49ba0fbf597caf80ec,CE,53b36df67ebb7c41585e8d54d6772e08,0.255245,110.0,Fashion,7d13fca15225358621be4086e1eb0964,SP,4,20.0,0.0,credit_card,110.0,1,delivered
843,f7fd9088963ee4f442fd642219d80fb5,1,9fd78e7df3488fd8e24ccc627acc38ef,PR,aca2eb7d00ea1a7b8ebd4e68314663af,0.173333,69.9,home,955fee9216a65b617aa5c0531780ce60,SP,5,7.0,0.0,credit_card,69.9,1,delivered
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95462,f937ec3a717fd77710c5fb85cf5321d9,1,6148c0fb32e5c7352da8656b9b34ec60,RJ,53b36df67ebb7c41585e8d54d6772e08,0.255245,99.9,Fashion,7d13fca15225358621be4086e1eb0964,SP,5,5.0,0.0,credit_card,99.9,1,delivered
95624,577b745a85120419fe27073e3c9e1a76,1,e21bea57967ffc7bdc8b07558e09685e,RJ,7a10781637204d8d10485c71a6108a2e,0.097436,199.0,Fashion,4869f7a5dfa277a7dca6462dcf3b52b2,SP,5,10.0,0.0,debit_card,199.0,1,delivered
95842,b811c8ca07716586e80c0bc4f2a6e39f,1,a9747b164275fabdd901a9fc169c5251,RJ,aca2eb7d00ea1a7b8ebd4e68314663af,0.173333,69.9,home,955fee9216a65b617aa5c0531780ce60,SP,5,10.0,0.0,credit_card,69.9,1,delivered
95888,f565be42bf5390f6cd98934cc4c17091,1,f3a6febe6e4236817f26caf440beea4e,SP,53b36df67ebb7c41585e8d54d6772e08,0.255245,99.9,Fashion,7d13fca15225358621be4086e1eb0964,SP,4,16.0,0.0,credit_card,99.9,1,delivered


In [152]:
# find where total payment not equal to delivery charge plus price of the product
df_final[df_final['total_payment'] != (df_final['price'] + df_final['delivery_charge'])]

# we can say these are dues values or may be a condition where customer didn't pay the price


Unnamed: 0,order_id,items_in_order,customer_id,customer_state,product_id,product_density,price,category,seller_id,seller_state,rating,delivery_time,delivery_charge,payment_type,total_payment,total_installment,order_status
0,e481f51cbdc54678b7cc49136f2d6af7,1,7c396fd4830fd04220f754e42b4e5bff,SP,87285b34884572647811a353c7ac498a,0.253036,89.97,home,3504c0cb71d7fa48d967e0e4c94d59d9,SP,4,8.0,8.72,multiple,18.12,3,delivered
4,ad21c59c0840e6cb83a9ceb5573f8159,1,72632f0f9dd73dfee390c9b22eb56dd6,SP,65266b2da20d04dbe00c5c2d3bb7859e,0.021786,19.90,stationery,2c9e548be18521d1c43cde1c582c6de8,SP,5,2.0,8.72,credit_card,28.62,1,delivered
8,e69bfb5eb88e0ed6a785585b27e16dbf,1,299905e3934e9e181bfb2e164dd4b4f8,SP,9a78fb9862b10749a117f7fc3c31f051,0.134810,299.98,home,7c67e1448b00f6e969d365cea6b010ab,SP,5,18.0,19.77,multiple,161.42,2,delivered
9,e6ce16cb79ec1d90b1da9085a6118aeb,2,f2a85dec752b8517b5e58a06ff3cd937,RJ,08574b074924071f4e201e151b152b4e,0.457875,198.00,Construction,001cca7ae9ae17fb1caed9dfb1094831,other,1,12.0,30.53,credit_card,259.06,1,delivered
11,82566a660a982b15fb86e904c8d32918,1,e97109680b052ee858d93a539597bba7,MG,72a97c271b2e429974398f46b93ae530,0.103401,31.90,Health,094ced053e257ae8cae57205592d6712,SP,5,12.0,18.23,voucher,50.13,1,delivered
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96502,19dba6d1e58b04e7fc820558a38ea398,1,9e4accf95024aa6565ca1efcadb96752,SP,0a57f7d2c983bcf8188589a5fea4a8da,0.142857,96.00,Fashion,4869f7a5dfa277a7dca6462dcf3b52b2,SP,5,6.0,8.79,credit_card,104.79,1,delivered
96507,cfa78b997e329a5295b4ee6972c02979,1,a49e8e11e850592fe685ae3c64b40eca,PR,3d2c44374ee42b3003a470f3e937a2ea,0.260417,55.90,Art,ce248b21cb2adc36282ede306b7660e5,SC,1,37.0,15.14,credit_card,71.04,1,delivered
96508,9115830be804184b91f5c00f6f49f92d,2,c716cf2b5b86fb24257cffe9e7969df8,other,c982dbea53b864f4d27c1d36f14b6053,0.026042,138.02,Toys,1caf283236cd69af44cbc09a0a1e7d32,RJ,5,16.0,0.80,multiple,42.42,2,delivered
96509,aa04ef5214580b06b10e2a378300db44,1,e03dbdf5e56c96b106d8115ac336f47f,MG,9fc063fd34fed29ccc57b7f8e8d03388,0.065789,740.00,Health,ccc4bbb5f32a6ab2b7066a4130f114e3,PR,5,11.0,19.43,multiple,250.00,2,delivered


In [153]:
#check null value
df_final.isnull().sum()

Unnamed: 0,0
order_id,0
items_in_order,0
customer_id,0
customer_state,0
product_id,0
product_density,1
price,0
category,0
seller_id,0
seller_state,0


In [154]:
# check where is missing
df_final[df_final['delivery_time'].isnull()].groupby('order_status')['order_id'].count()

Unnamed: 0_level_0,order_id
order_status,Unnamed: 1_level_1
canceled,428
delivered,8
processing,1594


In [155]:
# replace with zero
df_final['delivery_time'] = df_final['delivery_time'].fillna(0).astype(int)

In [156]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96516 entries, 0 to 96515
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   order_id           96516 non-null  object 
 1   items_in_order     96516 non-null  int64  
 2   customer_id        96516 non-null  object 
 3   customer_state     96516 non-null  object 
 4   product_id         96516 non-null  object 
 5   product_density    96515 non-null  float64
 6   price              96516 non-null  float64
 7   category           96516 non-null  object 
 8   seller_id          96516 non-null  object 
 9   seller_state       96516 non-null  object 
 10  rating             96516 non-null  int64  
 11  delivery_time      96516 non-null  int64  
 12  delivery_charge    96516 non-null  float64
 13  payment_type       96516 non-null  object 
 14  total_payment      96516 non-null  float64
 15  total_installment  96516 non-null  int64  
 16  order_status       965

In [157]:
# 1 missing value
df_final[df_final['product_density'].isnull()]

Unnamed: 0,order_id,items_in_order,customer_id,customer_state,product_id,product_density,price,category,seller_id,seller_state,rating,delivery_time,delivery_charge,payment_type,total_payment,total_installment,order_status
37252,85f8ad45e067abd694b627859fa57453,1,69b2b29382af03e03e486ad307a76000,RS,09ff539a621711667c43eba6a3bd8466,,1934.0,Health,8b8cfc8305aa441e4239358c9f6f2485,PR,5,8,27.0,credit_card,1961.0,1,delivered


In [158]:
# replace with avg of same delivery charge values
df_final['product_density'] = df_final['product_density'].fillna(0.147434)

In [159]:
#check
df_final.isnull().sum()

Unnamed: 0,0
order_id,0
items_in_order,0
customer_id,0
customer_state,0
product_id,0
product_density,0
price,0
category,0
seller_id,0
seller_state,0


In [160]:
# Scale product_density by 100 and round to 2 decimal places
df_final['product_density'] = (df_final['product_density'] * 100).round(2)

In [164]:
#check
df_final.sample(10)

Unnamed: 0,order_id,items_in_order,customer_id,customer_state,product_id,product_density,price,category,seller_id,seller_state,rating,delivery_time,delivery_charge,payment_type,total_payment,total_installment,order_status
75355,26e2db0bd73689ff814692e3dc232671,2,70274c85eb170f583f0ae42e19494c6b,SP,a7216be504e1ff211b67479ab193a552,16.75,139.8,home,a2e874074c877c5a05abae80ad6e488f,SP,5,4,21.13,credit_card,182.06,1,delivered
31556,ec8cbb2f067bf3af1cdf3c82ce77e1c9,1,d9058ef529d2c65e426d6d84ff11e838,SP,36f60d45225e60c7da4558b070ce4b60,7.07,89.5,Electronics,8e6d7754bc7e0f22c96d255ebda59eba,SP,5,3,10.04,credit_card,99.54,1,delivered
10014,68709f86fb9eba39721082ba8fcb7047,1,037576137eb995f89b3abc6c690a3d24,other,2748fba16a62f912134f7991548c3cee,7.41,99.5,luggage_accessories,1da3aeb70d7989d1e6d9b0e887f97c23,SP,4,8,59.85,credit_card,159.35,1,delivered
61521,14d45fea1c20e4fd5c31dde983344115,1,b5653aa179c080658d4c6aaa891838aa,SP,937d4dbe97dc4cda39683025976f91ae,6.05,29.99,home,8a32e327fe2c1b3511609d81aaf9f042,SP,1,12,10.96,credit_card,40.95,1,delivered
4293,a0be4f22d143bef8121980995eaf15c9,1,fc8dc6d77c65df005e0e44bd7e1f5d36,RS,04df667e73ca2fab4b9a97c86466b699,7.72,579.0,Fashion,b33e7c55446eabf8fe1a42d037ac7d6d,SP,5,19,19.95,credit_card,598.95,1,delivered
87926,93d1d708e3c14a6565de485c5f62b516,1,8d8d4c24208d333841e7808542a0f257,RJ,7dac8901a50cf3317d5372cbd3186ecd,85.23,29.0,Fashion,6560211a19b47992c3666cc44a7e94c0,SP,1,9,15.3,voucher,44.3,1,delivered
9497,a83bbd7210fc35296b93c16a48be49b8,1,1f158aaee60e88bec053b05d2224cda4,RJ,a10e0fcb1c409869c3c6da4eb13b7612,16.72,50.0,Health,855668e0971d4dfd7bef1b6a4133b41b,SP,5,6,16.6,credit_card,66.6,1,delivered
54436,f3a5594ec1dbbdb286b45516754c5655,1,6c1eea6e359f7344fa11a0a066c0dbca,SP,1782400950423c9b12600278b8ef65d3,10.27,419.9,Health,fe2032dab1a61af8794248c8196565c9,SP,5,2,11.75,credit_card,431.65,1,delivered
12749,fb8d6e92e3a08fe6485d4fb08d23ae0e,1,daf8f32d76049be93d30afc42078e6e4,SP,cadb69af336c16aba5c5223599821245,7.14,13.65,Electronics,8b321bb669392f5163d04c59e235e066,SP,4,5,7.39,credit_card,21.04,1,delivered
91449,9b441adcb57c2e78cb12352908df3ea8,1,cfa0f7234243fb3dc77d1392c7b28579,SP,39377e2d331b8af5a19bc565215e3a8c,39.68,623.0,Food_Drink,8444e55c1f13cd5c179851e5ca5ebd00,MG,4,16,20.84,credit_card,643.84,1,delivered


In [166]:
#save cleaned data
df_final.to_csv('final_data.csv', index=False)