In [1]:
import pandas as pd

file_path = "../data/raw/amazon_sales_2025_INR.csv"
df = pd.read_csv(file_path)

df.head()


Unnamed: 0,Order_ID,Date,Customer_ID,Product_Category,Product_Name,Quantity,Unit_Price_INR,Total_Sales_INR,Payment_Method,Delivery_Status,Review_Rating,Review_Text,State,Country
0,ORD100000,2025-01-25,CUST2796,Home & Kitchen,Cookware Set,2,25574.41,51148.82,Credit Card,Returned,1,Waste of money,Sikkim,India
1,ORD100001,2025-08-28,CUST9669,Beauty,Hair Dryer,1,19361.41,19361.41,Debit Card,Returned,5,Excellent product!,Telangana,India
2,ORD100002,2025-02-27,CUST5808,Electronics,Tablet,3,38476.22,115428.66,Cash on Delivery,Delivered,3,Fair deal,Nagaland,India
3,ORD100003,2025-02-24,CUST5889,Electronics,Headphones,5,38145.72,190728.6,Credit Card,Delivered,5,Highly recommend!,Assam,India
4,ORD100004,2025-06-15,CUST9005,Clothing,Saree,5,45940.98,229704.9,UPI,Delivered,5,Highly recommend!,Odisha,India


#### Some basic inspections

In [None]:
# Shape of the data
df.shape

# Column names
df.columns

# Info about data types and nulls
df.info()

# Quick numeric summary
df.describe()

# Quick look at categories
df['Product_Category'].value_counts().head(10)

df['Payment_Method'].value_counts()

df['Delivery_Status'].value_counts()

df['State'].nunique(), df['State'].unique()[:10]

df['Review_Rating'].value_counts().sort_index()


#### Convert Date column to datetime

In [3]:
df['Date'] = pd.to_datetime(df['Date'])


In [4]:
# verify

df['Date'].dtype


dtype('<M8[ns]')

#### Create time-based columns

In [5]:
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Month_Name'] = df['Date'].dt.month_name()
df['Day'] = df['Date'].dt.day
df['DayOfWeek'] = df['Date'].dt.day_name()
df['Week_Number'] = df['Date'].dt.isocalendar().week


#### Validate revenue calculation

In [9]:
(df['Quantity'] * df['Unit_Price_INR'] - df['Total_Sales_INR']).abs().sum()


np.float64(0.0)

In [None]:
# If output is 0, the dataset is consistent.
# If not, create a corrected version:

df['Total_Sales_INR'] = df['Quantity'] * df['Unit_Price_INR']


#### Clean categorical columns

In [None]:
# Remove accidental whitespace + normalize case

cat_cols = ["Product_Category", "Product_Name", "Payment_Method",
            "Delivery_Status", "State", "Country"]

for col in cat_cols:
    df[col] = df[col].astype(str).str.strip().str.title()


#### Check duplicates

In [10]:
df.duplicated().sum()
df['Order_ID'].duplicated().sum()


np.int64(0)

In [None]:
# If duplicates exist:

df = df.drop_duplicates()

##### Final checks

In [11]:
df.isnull().sum()
df.describe()
df['Review_Rating'].value_counts().sort_index()
df['Delivery_Status'].value_counts()
df['Payment_Method'].value_counts()


Payment_Method
Cash on Delivery    3827
Credit Card         3800
Debit Card          3727
UPI                 3646
Name: count, dtype: int64

#### Save cleaned dataset

In [12]:
df.to_csv("../data/processed/amazon_sales_2025_cleaned.csv", index=False)
