In [5]:
# Step 1: Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" theme to every figure drawn below.
# `set_theme` is the preferred spelling; `sns.set` is only an alias for it.
sns.set_theme(style="whitegrid")


In [6]:
# Step 2: Read the retail sales CSV into a DataFrame
file_path = '../Data/retail_sales_dataset.csv'  # relative path: resolved against the current working directory
df = pd.read_csv(filepath_or_buffer=file_path)

# Print a heading, then let the cell display the first five rows
print("Initial Dataset Preview:")
df.head(5)


Initial Dataset Preview:


Unnamed: 0,Transaction ID,Date,Customer ID,Gender,Age,Product Category,Quantity,Price per Unit,Total Amount
0,1,2023-11-24,CUST001,Male,34,Beauty,3,50,150
1,2,2023-02-27,CUST002,Female,26,Clothing,2,500,1000
2,3,2023-01-13,CUST003,Male,50,Electronics,1,30,30
3,4,2023-05-21,CUST004,Male,37,Clothing,1,500,500
4,5,2023-05-06,CUST005,Male,30,Beauty,2,50,100


In [7]:
# Step 3: Inspect the structure of the dataset (columns, dtypes, non-null counts)
print("\nDataset Info:")
df.info()

# Tally the missing entries per column before any cleaning is applied
missing_before_cleaning = df.isna().sum()
print("\nMissing Values Before Cleaning:")
print(missing_before_cleaning)



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Transaction ID    1000 non-null   int64 
 1   Date              1000 non-null   object
 2   Customer ID       1000 non-null   object
 3   Gender            1000 non-null   object
 4   Age               1000 non-null   int64 
 5   Product Category  1000 non-null   object
 6   Quantity          1000 non-null   int64 
 7   Price per Unit    1000 non-null   int64 
 8   Total Amount      1000 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 70.4+ KB

Missing Values Before Cleaning:
Transaction ID      0
Date                0
Customer ID         0
Gender              0
Age                 0
Product Category    0
Quantity            0
Price per Unit      0
Total Amount        0
dtype: int64


In [8]:
# Step 4: Remove rows with missing values (if any)
# Rebind `df` to the filtered result instead of using inplace=True, so the
# cleaning step is an ordinary assignment and can be chained with other calls.
df = df.dropna()

# Re-count missing entries per column to confirm the cleaning worked
print("\nMissing Values After Cleaning:")
print(df.isnull().sum())



Missing Values After Cleaning:
Transaction ID      0
Date                0
Customer ID         0
Gender              0
Age                 0
Product Category    0
Quantity            0
Price per Unit      0
Total Amount        0
dtype: int64


In [9]:
# Step 5: Check and remove duplicate rows
# Report how many fully duplicated rows exist before de-duplicating.
print("\nDuplicate Rows Before Cleaning:", df.duplicated().sum())

# Rebind `df` to the de-duplicated result rather than mutating it in place.
df = df.drop_duplicates()

# Report the count again to confirm no duplicates remain.
print("Duplicate Rows After Cleaning:", df.duplicated().sum())



Duplicate Rows Before Cleaning: 0
Duplicate Rows After Cleaning: 0


In [10]:
# Step 6: Visualizations
# Column names must match the dataset loaded in Step 2; each plot is guarded so
# that a missing column produces a visible message instead of no output at all.
AMOUNT_COL = 'Total Amount'
CATEGORY_COL = 'Product Category'

# 1. Sales Distribution: histogram of the per-transaction total amount
if AMOUNT_COL in df.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    df[AMOUNT_COL].hist(bins=30, color='skyblue', edgecolor='black', ax=ax)
    ax.set_title('Sales Distribution')
    ax.set_xlabel('Sales Amount')
    ax.set_ylabel('Frequency')
    plt.show()
else:
    print(f"Skipping sales distribution plot: column '{AMOUNT_COL}' not found.")

# 2. Product Category Count: number of transactions per category
if CATEGORY_COL in df.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=df, x=CATEGORY_COL, palette='Set2', ax=ax)
    ax.set_title('Product Category Distribution')
    # Tilt the category labels so longer names do not overlap
    ax.tick_params(axis='x', labelrotation=45)
    plt.show()
else:
    print(f"Skipping product category plot: column '{CATEGORY_COL}' not found.")


In [11]:
# Step 7: Write the cleaned frame to disk
# The destination is defined once so the file written and the path reported
# in the confirmation message always agree.
cleaned_output_path = '../Data/cleaned_retail_sales_data.csv'
df.to_csv(cleaned_output_path, index=False)  # index=False: do not write the row index as a column
print(f"\n✅ Cleaned data saved successfully at '{cleaned_output_path}'")



✅ Cleaned data saved successfully at '../Data/cleaned_retail_sales_data.csv'
