In [24]:
import pandas as pd

# Load raw data
df = pd.read_csv(r"F:\retail_raw_dataset (1).csv")

# 1. Remove duplicate rows
df.drop_duplicates(inplace=True)

# 2. Drop rows with critical missing values early
df.dropna(subset=['Full Name', 'Sales ($)', 'Email'], inplace=True)

# 3. Standardize text values
df['Gender'] = df['Gender'].str.strip().str.lower().replace({
    'm': 'Male', 'f': 'Female', 'male': 'Male', 'female': 'Female'
})
df['Country'] = df['Country'].str.strip().str.title()
df['Product Category'] = df['Product Category'].str.strip().str.title()

# 4. Convert date format to dd-mm-yyyy
df['Purchase Date'] = pd.to_datetime(df['Purchase Date'], errors='coerce', dayfirst=True).dt.strftime('%d-%m-%Y')

# 5. Rename column headers
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

# 6. Fix data types
df['age'] = pd.to_numeric(df['age'], errors='coerce')
df['sales_($)'] = pd.to_numeric(df['sales_($)'], errors='coerce')

# 7. Remove any remaining nulls to match strict task guidelines
df.dropna(inplace=True)

# Save cleaned file (optional)
df.to_csv("F:/retail_cleaned_dataset.csv", index=False)

# Show first 10 rows
df.head(100)


Unnamed: 0,customer_id,full_name,gender,country,purchase_date,product_category,age,sales_($),email
0,C1000,Allison Hill,Male,India,25-10-2024,Clothing,24.0,902.96,rhodespatricia@example.org
1,C1001,Amanda Davis,Male,United States,29-10-2024,Clothing,19.0,744.42,hoffmanjennifer@example.net
5,C1005,Jamie Arnold,Male,United States,04-06-2025,Electronics,47.0,428.35,barbara10@example.net
8,C1008,Joel Nelson,Female,United Kingdom,09-07-2025,Clothing,59.0,338.39,gabriellecameron@example.org
13,C1013,Brittany Moore,Female,United Kingdom,11-04-2025,Home Decor,51.0,639.05,cruzcaitlin@example.com
14,C1014,Tricia Valencia,Male,India,09-08-2024,Electronics,54.0,177.09,maldonadoamanda@example.com
15,C1015,Gina Carter,Male,Usa,01-10-2024,Clothing,53.0,574.91,mckaynancy@example.com
19,C1019,John Jones,Male,Singapore,10-05-2025,Furniture,30.0,693.08,davidalvarez@example.net
23,C1023,David Caldwell,Female,Singapore,22-10-2024,Furniture,28.0,986.78,jenniferkhan@example.net
26,C1026,Connor West,Male,United States,06-01-2025,Electronics,33.0,947.44,steven17@example.net
