# Step 1: Extract

In [1]:
import pandas as pd

In [2]:
# Read the raw sales export into a DataFrame and dump it for a first look.
df = pd.read_csv(filepath_or_buffer='sales.csv')
print(df)

   id  product_id product_name   price country
0   1         101     Widget A  $10.50     USA
1   2         102     Widget B   $5.00  Canada
2   3         103     Widget A   10.50     USA


# Step 2: Clean the data
- Remove the `$` sign and double quotes (`"`) from `price`
- Convert `price` to float

In [3]:
# Normalise the price column in three steps: view every entry as text,
# strip the characters that are not part of the number, then parse as float.
price_text = df['price'].astype(str)
# regex=False keeps "$" a literal character rather than a regex end-of-string anchor.
price_text = price_text.str.replace("$", "", regex=False)
price_text = price_text.str.replace('"', "", regex=False)
df['price'] = price_text.astype(float)


In [4]:
# Show the frame after cleaning so the parsed price values can be checked.
print(df)

   id  product_id product_name  price country
0   1         101     Widget A   10.5     USA
1   2         102     Widget B    5.0  Canada
2   3         103     Widget A   10.5     USA


# Step 3: Deduplicate (same product name + price)

In [5]:
# Rows count as duplicates when both product_name and price match;
# the first occurrence of each pair is the one that is kept.
df = df.drop_duplicates(
    subset=['product_name', 'price'],
    keep='first',
)

In [6]:
# Show the frame after duplicate rows have been dropped.
print(df)

   id  product_id product_name  price country
0   1         101     Widget A   10.5     USA
1   2         102     Widget B    5.0  Canada


# Step 4: Convert USD -> INR

In [7]:
# Fixed conversion factor applied to every price (USD -> INR).
USD_TO_INR = 83
# Add the converted price as a new column; the original price column is kept.
df['price_inr'] = df['price'].mul(USD_TO_INR)

In [8]:
# Show the frame including the newly added price_inr column.
print(df)

   id  product_id product_name  price country  price_inr
0   1         101     Widget A   10.5     USA      871.5
1   2         102     Widget B    5.0  Canada      415.0


# Step 5: Save the clean data to clean_sales.json

In [9]:
# Write the frame as a JSON array with one object per row, indented by 4 spaces.
df.to_json(
    path_or_buf="clean_sales.json",
    orient="records",
    indent=4,
)
# Reached only if the write above did not raise.
print("The file clean_sales.json created successfully")

The file clean_sales.json created successfully
