In [4]:
import os
from pathlib import Path

import pandas as pd

# Load the raw sales export (path is relative to the kernel's working directory)
# and print the whole frame for a first look.
df = pd.read_csv(filepath_or_buffer="../data/sales_data.csv")
print(df)

   order_id region   product  quantity  price
0      1001   East  Keyboard       2.0   1500
1      1002   West     Mouse       5.0    500
2      1003   East   Monitor       NaN  12000
3      1004  South  Keyboard       1.0   1500
4      1005   West   Monitor       2.0  12000


In [5]:
# Snapshot of the untouched frame: contents, dimensions, and per-column null counts.
print("Original dataset:", df, sep="\n")
print(f"\nShape: {df.shape}")
print(f"\nMissing values:\n{df.isnull().sum()}")

Original dataset:
   order_id region   product  quantity  price
0      1001   East  Keyboard       2.0   1500
1      1002   West     Mouse       5.0    500
2      1003   East   Monitor       NaN  12000
3      1004  South  Keyboard       1.0   1500
4      1005   West   Monitor       2.0  12000

Shape: (5, 5)

Missing values:
order_id    0
region      0
product     0
quantity    1
price       0
dtype: int64


In [10]:
# Keep only West-region orders with more than one unit. Rows whose quantity
# is NaN fail the comparison, so they are excluded as well.
Filtered_df = df.loc[df["region"].eq("West") & df["quantity"].gt(1)]
print(Filtered_df)

   order_id region  product  quantity  price
1      1002   West    Mouse       5.0    500
4      1005   West  Monitor       2.0  12000


In [11]:
# West-region orders with quantity > 1, expressed with DataFrame.query.
# Only the final expression of a cell is displayed, so this is the single
# expression kept here; a bare boolean-index expression above it would be
# evaluated and silently discarded.
df.query("region == 'West' and quantity > 1")

Unnamed: 0,order_id,region,product,quantity,price
1,1002,West,Mouse,5.0,500
4,1005,West,Monitor,2.0,12000


In [12]:
# Null count per column prior to any cleaning; the Series on the last line
# is rendered as the cell's output.
print("Missing values before cleaning")
df.isna().sum()

Missing values before cleaning


order_id    0
region      0
product     0
quantity    1
price       0
dtype: int64

In [13]:
# Discard rows that have no quantity recorded; `df` itself is left untouched.
clean_df = df.dropna(axis="index", how="any", subset=["quantity"])

In [14]:
# Confirm the null counts after the drop and compare row counts before/after.
print("Missing values after cleaning:", clean_df.isnull().sum(), sep="\n")
print(f"\nRows before: {len(df)}, Rows after: {len(clean_df)}")

Missing values after cleaning:
order_id    0
region      0
product     0
quantity    0
price       0
dtype: int64

Rows before: 5, Rows after: 4


In [15]:
# List rows that exactly repeat an earlier row, followed by how many there are.
print("Duplicate rows:", clean_df.loc[clean_df.duplicated()], sep="\n")
print(f"\nTotal duplicates: {clean_df.duplicated().sum()}")

Duplicate rows:
Empty DataFrame
Columns: [order_id, region, product, quantity, price]
Index: []

Total duplicates: 0


In [16]:
# Remove exact duplicate rows, keeping the first occurrence of each.
clean_df = clean_df.drop_duplicates(keep="first")

In [17]:
# Report how many rows remain in clean_df once duplicates have been dropped.
print(f"Rows after removing duplicates: {len(clean_df)}")

Rows after removing duplicates: 4


In [18]:
# Write the cleaned frame to a CSV in the current working directory,
# leaving the DataFrame index out of the file.
clean_df.to_csv(path_or_buf="clean_sales_data.csv", index=False)

In [19]:
# Confirm the output file name and show the (rows, columns) of the saved frame.
print("Clean dataset saved to: clean_sales_data.csv")
print(f"Final shape: {clean_df.shape}")

Clean dataset saved to: clean_sales_data.csv
Final shape: (4, 5)


In [21]:
def clean_sales_data(input_file, output_file, region="West", min_quantity=1):
    """
    Clean sales data by handling missing values, filtering, and removing duplicates.

    Reads ``input_file`` as CSV, drops rows with a missing ``quantity``, keeps
    rows whose ``region`` column equals ``region`` and whose ``quantity`` is
    strictly greater than ``min_quantity``, removes exact duplicate rows,
    writes the result to ``output_file`` (without the index), and prints a
    short summary.

    Parameters
    ----------
    input_file : str or path-like
        CSV file to read. It must contain ``region`` and ``quantity`` columns.
    output_file : str, path-like, or file-like
        Destination for the cleaned CSV. For local filesystem paths, missing
        parent directories are created before writing.
    region : str, default "West"
        Value of the ``region`` column to keep.
    min_quantity : int or float, default 1
        Exclusive lower bound on ``quantity``.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows, with their original index labels.
    """
    # Load data
    raw_df = pd.read_csv(input_file)

    # Handle missing values first: a NaN quantity can never satisfy the
    # comparison below, so dropping up front makes that exclusion explicit.
    complete_df = raw_df.dropna(subset=["quantity"])

    # Filter: requested region and quantity strictly above the threshold
    keep = (complete_df["region"] == region) & (complete_df["quantity"] > min_quantity)
    clean_df = complete_df.loc[keep]

    # Remove duplicates
    clean_df = clean_df.drop_duplicates()

    # to_csv refuses to write into a directory that does not exist, so create
    # the parent folder for plain local paths. File-like targets and URL-style
    # strings (e.g. "s3://...") are passed through untouched.
    if isinstance(output_file, (str, os.PathLike)):
        target = os.fspath(output_file)
        if isinstance(target, str) and "://" not in target:
            Path(target).parent.mkdir(parents=True, exist_ok=True)

    # Save cleaned data
    clean_df.to_csv(output_file, index=False)

    print("Cleaning complete!")
    print(f"Original rows: {len(raw_df)}")
    print(f"Cleaned rows: {len(clean_df)}")
    print(f"Output saved to: {output_file}")

    return clean_df

# Run the full cleaning pipeline on the raw export and keep the returned frame.
clean_df = clean_sales_data(
    input_file="../data/sales_data.csv",
    output_file="../output/clean_sales_data.csv",
)

Cleaning complete!
Original rows: 5
Cleaned rows: 2
Output saved to: ../output/clean_sales_data.csv
