In [4]:
import pandas as pd

df = pd.read_csv("/content/Sample - Superstore.csv", encoding='ISO-8859-1')

df.drop(columns=["Row ID", "Customer ID", "Product ID"], inplace=True)

df["Order Date"] = pd.to_datetime(df["Order Date"])
df["Ship Date"] = pd.to_datetime(df["Ship Date"])

print("Missing values before cleaning:\n", df.isnull().sum())
df.drop_duplicates(inplace=True)

df["Postal Code"] = df["Postal Code"].astype(str)

df["Shipping Duration"] = (df["Ship Date"] - df["Order Date"]).dt.days

Q1 = df[["Sales", "Profit"]].quantile(0.25)
Q3 = df[["Sales", "Profit"]].quantile(0.75)
IQR = Q3 - Q1

df = df[((df[["Sales", "Profit"]] < (Q1 - 1.5 * IQR)) |
          (df[["Sales", "Profit"]] > (Q3 + 1.5 * IQR))).any(axis=1)]


print("\nFinal Shape:", df.shape)
print("\nMissing values after cleaning:\n", df.isnull().sum())



Missing values before cleaning:
 Order ID         0
Order Date       0
Ship Date        0
Ship Mode        0
Customer Name    0
Segment          0
Country          0
City             0
State            0
Postal Code      0
Region           0
Category         0
Sub-Category     0
Product Name     0
Sales            0
Quantity         0
Discount         0
Profit           0
dtype: int64
Cleaned data saved as 'Cleaned_Superstore.csv'

Final Shape: (2120, 19)

Missing values after cleaning:
 Order ID             0
Order Date           0
Ship Date            0
Ship Mode            0
Customer Name        0
Segment              0
Country              0
City                 0
State                0
Postal Code          0
Region               0
Category             0
Sub-Category         0
Product Name         0
Sales                0
Quantity             0
Discount             0
Profit               0
Shipping Duration    0
dtype: int64


In [6]:
df.columns
print("First 5 records:\n", df.head())
print("Last 5 records:\n", df.tail())

First 5 records:
           Order ID Order Date  Ship Date       Ship Mode    Customer Name  \
1   CA-2016-152156 2016-11-08 2016-11-11    Second Class      Claire Gute   
3   US-2015-108966 2015-10-11 2015-10-18  Standard Class   Sean O'Donnell   
7   CA-2014-115812 2014-06-09 2014-06-14  Standard Class  Brosina Hoffman   
10  CA-2014-115812 2014-06-09 2014-06-14  Standard Class  Brosina Hoffman   
11  CA-2014-115812 2014-06-09 2014-06-14  Standard Class  Brosina Hoffman   

     Segment        Country             City       State Postal Code Region  \
1   Consumer  United States        Henderson    Kentucky       42420  South   
3   Consumer  United States  Fort Lauderdale     Florida       33311  South   
7   Consumer  United States      Los Angeles  California       90032   West   
10  Consumer  United States      Los Angeles  California       90032   West   
11  Consumer  United States      Los Angeles  California       90032   West   

      Category Sub-Category  \
1    Furnitur

In [15]:
df.to_csv("Cleaned_Superstore.csv", index=False)
print("Cleaned data saved as 'Cleaned_Superstore.csv'")

Cleaned data saved as 'Cleaned_Superstore.csv'


In [17]:
df.describe()

Unnamed: 0,Order Date,Ship Date,Sales,Quantity,Discount,Profit,Shipping Duration
count,2120,2120,2120.0,2120.0,2120.0,2120.0,2120.0
mean,2016-04-24 09:48:54.339622400,2016-04-28 07:37:07.924528384,820.894463,4.828774,0.186491,93.60563,3.908491
min,2014-01-04 00:00:00,2014-01-08 00:00:00,18.32,1.0,0.0,-6599.978,0.0
25%,2015-05-03 18:00:00,2015-05-08 00:00:00,296.1415,3.0,0.0,-48.82665,3.0
50%,2016-06-10 12:00:00,2016-06-14 00:00:00,544.38,4.0,0.2,84.51,4.0
75%,2017-05-09 12:00:00,2017-05-12 06:00:00,905.345,6.0,0.3,160.6353,5.0
max,2017-12-29 00:00:00,2018-01-04 00:00:00,22638.48,14.0,0.8,8399.976,7.0
std,,,1165.361479,2.494434,0.226976,502.245712,1.757912
