In [1]:
import pandas as pd 
# Read the raw Superstore export into `df`.
# The encoding is passed explicitly; change it here if the file is re-exported
# with a different encoding.
file_path = "Sample - Superstore.csv"
read_options = {"encoding": "ISO-8859-1"}
df = pd.read_csv(file_path, **read_options)

In [2]:
# 1. Count missing values per column before any cleaning is applied
missing_before = df.isnull().sum()  # one count per column
print("Missing values before cleaning:\n", missing_before)

Missing values before cleaning:
 Row ID           0
Order ID         0
Order Date       0
Ship Date        0
Ship Mode        0
Customer ID      0
Customer Name    0
Segment          0
Country          0
City             0
State            0
Postal Code      0
Region           0
Product ID       0
Category         0
Sub-Category     0
Product Name     0
Sales            0
Quantity         0
Discount         0
Profit           0
dtype: int64


In [3]:
# Handle missing values column by column:
#   - 'Postal Code': missing entries are replaced with the placeholder 0
#   - 'Order ID' / 'Customer ID': rows missing either identifier are dropped
df = (
    df.assign(**{'Postal Code': df['Postal Code'].fillna(0)})
      .dropna(subset=['Order ID', 'Customer ID'])
)

In [4]:
# 2. Remove duplicates
# Flag every row that exactly repeats an earlier row, then keep only the
# unflagged rows (so the first occurrence of each row survives).
is_repeat = df.duplicated()
df = df[~is_repeat]

In [5]:
# 3. Convert the date columns to datetime
# errors='coerce' turns any value that cannot be parsed into NaT instead of
# raising, so unparseable dates show up as missing values afterwards.
for date_col in ('Order Date', 'Ship Date'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

In [6]:
# 4. Standardize text columns
# Every object-dtype column has leading/trailing whitespace stripped.
text_cols = df.select_dtypes(include='object').columns
df = df.assign(**{name: df[name].str.strip() for name in text_cols})

In [None]:
# 5. Drop unnecessary columns (optional)
# Per the original note, 'Row ID' is just a row index, so it is dropped.
# errors='ignore' makes this cell safe to re-run: once the column is gone,
# running the cell again is a no-op instead of raising KeyError.
df = df.drop(columns=['Row ID'], errors='ignore')

In [8]:
# 6. Reset index
# Rows were removed by earlier steps, so replace the row labels with a fresh
# 0..n-1 range; the old labels are discarded rather than kept as a column.
df = df.set_axis(pd.RangeIndex(len(df)), axis=0)

In [9]:
# Final check: summarize the cleaned frame
print("Cleaned dataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print("Shape:", df.shape)
print("Missing values after cleaning:\n", df.isnull().sum())

Cleaned dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Order ID       9994 non-null   object        
 1   Order Date     9994 non-null   datetime64[ns]
 2   Ship Date      9994 non-null   datetime64[ns]
 3   Ship Mode      9994 non-null   object        
 4   Customer ID    9994 non-null   object        
 5   Customer Name  9994 non-null   object        
 6   Segment        9994 non-null   object        
 7   Country        9994 non-null   object        
 8   City           9994 non-null   object        
 9   State          9994 non-null   object        
 10  Postal Code    9994 non-null   int64         
 11  Region         9994 non-null   object        
 12  Product ID     9994 non-null   object        
 13  Category       9994 non-null   object        
 14  Sub-Category   9994 non-null   object        
 15 

In [10]:
# Write the cleaned frame to disk (without the index column) and confirm
# where it was written.
output_path = "Superstore_Cleaned.csv"
df.to_csv(output_path, index=False)
print(f"Cleaned dataset saved as {output_path}")

Cleaned dataset saved as Superstore_Cleaned.csv
