In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

# Sample dataset: Student Performance (focusing on "Marks")
data = {
    "Marks": [45, 60, 75, 80, 65, 50, 85, 90, 40, 95],
    "Age": [20, 21, 22, 23, 20, 21, 22, 23, 19, 24],
    "Study Hours": [2, 4, 6, 8, 5, 3, 7, 9, 1, 10]
}

df = pd.DataFrame(data)
print("Original Dataset:\n", df)

# "Marks" is built from Python ints, so it starts out as an integer column,
# which cannot hold NaN. Cast it to float explicitly before injecting the
# missing value rather than relying on pandas to upcast the column silently
# during assignment (that implicit upcast is deprecated in recent pandas).
# The cast happens after the print above so the original data is shown as ints.
df['Marks'] = df['Marks'].astype(float)

# Introduce a missing value in the "Marks" column (row label 2) for demonstration
df.loc[2, 'Marks'] = np.nan

# Handle Missing Data: fill missing values in "Marks" with the mean of the
# remaining observed values (Series.mean() skips NaN by default)
df['Marks'] = df['Marks'].fillna(df['Marks'].mean())

# Remove Duplicates: drop rows that are identical across every column
df = df.drop_duplicates()

# Handle Outliers: keep only marks within 1.5 * IQR of the quartiles.
# This is done BEFORE discretization so the bin edges are not computed from
# rows that are about to be discarded.
Q1 = df['Marks'].quantile(0.25)
Q3 = df['Marks'].quantile(0.75)
IQR = Q3 - Q1
# Series.between is inclusive on both ends, i.e. lower <= Marks <= upper
within_iqr = df['Marks'].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)

# Remove noisy data: Consider marks lower than 40 as noise
not_noise = df['Marks'] >= 40

# Apply both filters at once; .copy() yields an independent frame so that
# adding a column below does not operate on a filtered slice of the old frame.
df = df[within_iqr & not_noise].copy()

# Discretize "Marks" into 3 equal-width bins, fitted on the cleaned rows only.
# With encode='ordinal' the new column holds the numeric bin index:
# 0 = Low, 1 = Medium, 2 = High.
kbins = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
# fit_transform returns an (n_rows, 1) array; flatten it into a single column
df['Marks Discretized'] = kbins.fit_transform(df[['Marks']]).ravel()

# Display the processed dataset
print("\nProcessed Dataset after Preprocessing:\n", df)


Original Dataset:
    Marks  Age  Study Hours
0     45   20            2
1     60   21            4
2     75   22            6
3     80   23            8
4     65   20            5
5     50   21            3
6     85   22            7
7     90   23            9
8     40   19            1
9     95   24           10

Processed Dataset after Preprocessing:
        Marks  Age  Study Hours  Marks Discretized
0  45.000000   20            2                0.0
1  60.000000   21            4                1.0
2  67.777778   22            6                1.0
3  80.000000   23            8                2.0
4  65.000000   20            5                1.0
5  50.000000   21            3                0.0
6  85.000000   22            7                2.0
7  90.000000   23            9                2.0
8  40.000000   19            1                0.0
9  95.000000   24           10                2.0
