In [1]:
# Detect & Remove Outliers using IQR Method

# Objective: Learn to identify and remove outliers from a dataset using the Interquartile Range (IQR) method.
# Instructions:
# For each example, perform the following steps:
#     1. Load the Dataset: Load the dataset into your environment. You can use pandas to read the CSV file.
#     2. Calculate IQR: Calculate the first quartile (Q1), third quartile (Q3), and the IQR for the specified column.
#     3. Identify Outliers: Flag data points that fall below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR as outliers.
#     4. Remove Outliers: Remove the outliers from the dataset.
#     5. Verify: Ensure the outliers are removed by checking the size or summary statistics of the dataset before and after the removal.
    
    
    

# Task:
#     Dataset: sales_data.csv (obtain it yourself; it must include a Monthly_Sales column)
#     Column to analyze: Monthly_Sales
#     Steps:
#         1. Load sales_data.csv.
#         2. Calculate Q1, Q3, and IQR for Monthly_Sales.
#         3. Identify outliers.
#         4. Remove the outliers.
#         5. Check the number of rows removed.







In [2]:
import pandas as pd

# Inputs a reader may want to change: the CSV to read and the column to screen.
DATA_PATH = 'sales_data.csv'
COLUMN = 'Monthly_Sales'


def iqr_bounds(values: pd.Series, k: float = 1.5) -> tuple:
    """Compute the quartiles, IQR, and outlier fences for a numeric Series.

    Parameters
    ----------
    values : pandas.Series
        Numeric observations to summarise.
    k : float, default 1.5
        Multiplier applied to the IQR when placing the fences. The default
        reproduces the 1.5 * IQR rule used throughout this notebook.

    Returns
    -------
    tuple
        ``(q1, q3, iqr, lower_bound, upper_bound)`` where
        ``lower_bound = q1 - k * iqr`` and ``upper_bound = q3 + k * iqr``.
    """
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    return q1, q3, iqr, q1 - k * iqr, q3 + k * iqr


# 1. Load the Dataset
# Only the read can raise FileNotFoundError, so it is the only statement
# inside `try`; the analysis runs in `else`, i.e. only after a successful load.
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print("Error: sales_data.csv not found. Please make sure the file is in the correct directory.")
else:
    print("Dataset loaded successfully!")

    # 2. Calculate Q1, Q3, and IQR for Monthly_Sales
    Q1, Q3, IQR, lower_bound, upper_bound = iqr_bounds(df[COLUMN])
    print(f"\nQ1: {Q1}")
    print(f"Q3: {Q3}")
    print(f"IQR: {IQR}")

    # 3. Identify Outliers
    # Build the mask once and reuse it for both selection and removal so the
    # two steps cannot disagree. Comparisons against NaN evaluate to False,
    # so rows with a missing value are never flagged and remain in df_cleaned.
    is_outlier = (df[COLUMN] < lower_bound) | (df[COLUMN] > upper_bound)
    outliers = df[is_outlier]
    print(f"\nNumber of outliers found: {len(outliers)}")
    print("\nOutliers:")
    print(outliers)

    # 4. Remove the Outliers
    df_cleaned = df[~is_outlier]

    # 5. Check the number of rows removed
    rows_removed = len(df) - len(df_cleaned)
    print(f"\nNumber of rows in the original dataset: {len(df)}")
    print(f"Number of rows in the dataset after removing outliers: {len(df_cleaned)}")
    print(f"Number of outliers (rows) removed: {rows_removed}")

Error: sales_data.csv not found. Please make sure the file is in the correct directory.
