# 1. Load the dataset into a Pandas DataFrame

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('sales_data.csv')

# 2. Check for missing values in each column

In [4]:
print("Missing values before cleaning:")
print(df.isnull().sum())


Missing values before cleaning:
Product_ID           0
Sales_Amount         1
Price                1
Customer_Location    1
Purchase_Date        1
Product_Category     0
Payment_Method       0
dtype: int64


# 3. Drop rows with critical missing values (e.g., Product_ID or Sales_Amount)

In [5]:

df.dropna(subset=['Product_ID', 'Sales_Amount'], inplace=True)


# 4. Fill missing values in Price with the average price of the same product category

In [6]:
df['Price'] = df.groupby('Product_Category')['Price'].transform(lambda x: x.fillna(x.mean()))


# 5. Use forward fill (ffill) and backward fill (bfill) for missing Customer_Location


In [7]:
df['Customer_Location'] = df['Customer_Location'].fillna(method='ffill').fillna(method='bfill')


  df['Customer_Location'] = df['Customer_Location'].fillna(method='ffill').fillna(method='bfill')


# 6. Fill missing Purchase_Date with the most frequent date

In [8]:
most_common_date = df['Purchase_Date'].mode()[0]
df['Purchase_Date'] = df['Purchase_Date'].fillna(most_common_date)


# 7. Convert Purchase_Date to datetime format

In [9]:
df['Purchase_Date'] = pd.to_datetime(df['Purchase_Date'], errors='coerce')


# 8. Standardize column names

In [10]:
df.rename(columns={
    'Product_ID': 'product_id',
    'Sales_Amount': 'sales_amount',
    'Price': 'price',
    'Customer_Location': 'customer_location',
    'Purchase_Date': 'purchase_date',
    'Product_Category': 'product_category',
    'Payment_Method': 'payment_method'
}, inplace=True)

# 9. Remove duplicate sales transactions

In [11]:
df.drop_duplicates(inplace=True)


# 10. Convert Product_ID to string

In [12]:
df['product_id'] = df['product_id'].astype(str)


# 11. Replace incorrect values in Payment_Method

In [13]:
df['payment_method'] = df['payment_method'].replace({
    'Csh': 'Cash',
    'cash': 'Cash',
    'creditcard': 'Credit Card',
    'Creditcard': 'Credit Card',
    'paypal': 'PayPal'
})

# 12. Sort data by Purchase_Date

In [14]:
df.sort_values(by='purchase_date', inplace=True)


# 13. Save the cleaned dataset

In [15]:
df.to_csv('cleaned_sales_data.csv', index=False)

print("✅ Data cleaning complete. Cleaned data saved to 'cleaned_sales_data.csv'")

✅ Data cleaning complete. Cleaned data saved to 'cleaned_sales_data.csv'
