In [4]:
import pandas as pd

# Load the dataset
df = pd.read_csv("/content/sales_data.csv")

# ---------------------------
# 1. Check initial info
# ---------------------------
print("Initial Shape:", df.shape)

# DataFrame.info() writes its report directly to stdout and returns None.
# Passing it to print() would emit the report *before* the label and then
# print a stray "None", so print the label first and call info() on its own.
print("\nINFO:")
df.info()

print("\nHEAD():\n", df.head(10))


Initial Shape: (1000, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Product_ID            1000 non-null   int64  
 1   Sale_Date             1000 non-null   object 
 2   Sales_Rep             1000 non-null   object 
 3   Region                1000 non-null   object 
 4   Sales_Amount          1000 non-null   float64
 5   Quantity_Sold         1000 non-null   int64  
 6   Product_Category      1000 non-null   object 
 7   Unit_Cost             1000 non-null   float64
 8   Unit_Price            1000 non-null   float64
 9   Customer_Type         1000 non-null   object 
 10  Discount              1000 non-null   float64
 11  Payment_Method        1000 non-null   object 
 12  Sales_Channel         1000 non-null   object 
 13  Region_and_Sales_Rep  1000 non-null   object 
dtypes: float64(4), int64(2), object(8)
memory usage

In [5]:
# ---------------------------
# 2. Handle missing values
# ---------------------------
# Tally the null entries per column and display the totals
null_counts = df.isna().sum()
print(null_counts)


Product_ID              0
Sale_Date               0
Sales_Rep               0
Region                  0
Sales_Amount            0
Quantity_Sold           0
Product_Category        0
Unit_Cost               0
Unit_Price              0
Customer_Type           0
Discount                0
Payment_Method          0
Sales_Channel           0
Region_and_Sales_Rep    0
dtype: int64


In [6]:
# Example: Fill missing numeric values with each column's median
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Example: Fill missing categorical values with each column's mode
cat_cols = df.select_dtypes(include=['object']).columns
for col in cat_cols:
    modes = df[col].mode()
    # mode() returns an empty Series when the column has no non-null values;
    # indexing it with [0] would raise, so such columns are left unchanged.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

In [7]:
# ---------------------------
# 3. Remove duplicate rows
# ---------------------------
# Rows are compared across all columns; the first occurrence of each
# duplicated row is kept and the frame is modified in place.
df.drop_duplicates(subset=None, keep='first', inplace=True)

In [8]:
# ---------------------------
# 4. Standardize text values
# ---------------------------
# Example: Standardizing region names (trim whitespace, Title Case)
df['Region'] = df['Region'].str.strip().str.title()

# Example: Standardizing Customer_Type (trim whitespace, lower case).
# The former .replace({'new': 'new', 'returning': 'returning'}) mapped every
# value to itself and had no effect, so it was removed. strip() is applied
# as for Region so that padded values such as " New" collapse to "new".
df['Customer_Type'] = df['Customer_Type'].str.strip().str.lower()

In [9]:
# ---------------------------
# 5. Convert date format
# ---------------------------
# Parse the raw date strings (entries that cannot be parsed are coerced to
# NaT rather than raising), then store them back as DD-MM-YYYY text.
parsed_sale_dates = pd.to_datetime(df['Sale_Date'], errors='coerce')
df['Sale_Date'] = parsed_sale_dates.dt.strftime('%d-%m-%Y')

In [10]:
# ---------------------------
# 6. Rename column headers
# ---------------------------
# Normalise every header: trim surrounding whitespace, lower-case it,
# and turn spaces into underscores.
df.columns = [
    header.strip().lower().replace(" ", "_")
    for header in df.columns
]

In [11]:
# ---------------------------
# 7. Fix data types
# ---------------------------
# Target dtype for each column that needs an explicit cast
# (column names are the lower-cased headers).
target_dtypes = {
    'quantity_sold': int,
    'sales_amount': float,
}
for column_name, dtype in target_dtypes.items():
    df[column_name] = df[column_name].astype(dtype)

In [12]:
# ---------------------------
# Save cleaned dataset
# ---------------------------
# Write the frame to CSV without the index column, then report the result.
cleaned_csv_path = "sales_data_cleaned.csv"
df.to_csv(cleaned_csv_path, index=False)

final_shape = df.shape
print("Final Shape:", final_shape)
print("Data Cleaning Done. Cleaned file saved as 'sales_data_cleaned.csv'")


Final Shape: (1000, 14)
Data Cleaning Done. Cleaned file saved as 'sales_data_cleaned.csv'


In [15]:
# Load the cleaned dataset back to verify what was written.
# The file was saved with the relative path "sales_data_cleaned.csv", so it
# is read from that same relative path; the previous absolute
# "/content/..." path only resolved when the working directory was /content.
df_cl = pd.read_csv("sales_data_cleaned.csv")
print(df_cl.head())

   product_id   sale_date sales_rep region  sales_amount  quantity_sold  \
0        1052  03-02-2023       Bob  North       5053.97             18   
1        1093  21-04-2023       Bob   West       4384.02             17   
2        1015  21-09-2023     David  South       4631.23             30   
3        1072  24-08-2023       Bob  South       2167.94             39   
4        1061  24-03-2023   Charlie   East       3750.20             13   

  product_category  unit_cost  unit_price customer_type  discount  \
0        Furniture     152.75      267.22     returning      0.09   
1        Furniture    3816.39     4209.44     returning      0.11   
2             Food     261.56      371.40     returning      0.20   
3         Clothing    4330.03     4467.75           new      0.02   
4      Electronics     637.37      692.71           new      0.08   

  payment_method sales_channel region_and_sales_rep  
0           Cash        Online            North-Bob  
1           Cash        Re