<a href="https://colab.research.google.com/github/poojya100/6thSem-ML-Lab/blob/main/%E2%80%9C1BM23CS303_Lab_1_DataProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer

df = pd.read_csv("Dataset of Diabetes  (1).csv")

print("Initial Dataset Shape:", df.shape)
print("\nData Types:\n", df.dtypes)

# 3. Data Cleaning
# ---- 3.1 Handling Missing Values ----
print("\nMissing values per column:\n", df.isnull().sum())

# Record identifiers are integers but carry no measurement information, so
# they are kept out of `num_cols`; otherwise every later step that iterates
# over `num_cols` (imputation, outlier filtering, scaling) would treat them
# as features.
id_cols = ['ID', 'No_Pation']

# Separate numerical and categorical columns.
# 'number' matches every numeric width, not only int64/float64.
num_cols = df.select_dtypes(include='number').columns.drop(id_cols, errors='ignore')
cat_cols = df.select_dtypes(include=['object']).columns

# Impute numerical columns with mean
num_imputer = SimpleImputer(strategy='mean')
df[num_cols] = num_imputer.fit_transform(df[num_cols])

# Impute categorical columns with most frequent value
cat_imputer = SimpleImputer(strategy='most_frequent')
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

# ---- 3.2 Handling Categorical Data ----
print("\nCategorical Columns:", list(cat_cols))

# Encode categorical variables.
# A plain .map() turns every label that is not an exact key into NaN, and that
# would happen *after* imputation, leaving holes in the data. Labels are
# therefore normalised (whitespace stripped, upper-cased) before the lookup,
# and anything still unmapped is reported instead of silently becoming NaN.
label_maps = {
    'Gender': {'M': 1, 'F': 0},
    'CLASS': {'N': 0, 'P': 1, 'Y': 2},
}
for label_col, mapping in label_maps.items():
    labels = df[label_col].astype(str).str.strip().str.upper()
    encoded = labels.map(mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = sorted(labels[unmapped].unique())
        raise ValueError(f"Unexpected {label_col} labels: {unexpected}")
    df[label_col] = encoded.astype('int64')

# ---- 3.3 Handling Outliers using IQR ----
def remove_outliers_iqr(data, column, factor=1.5):
    """Return the rows of ``data`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``data[column]`` and
    ``IQR = Q3 - Q1``. Both fences are inclusive.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is not modified.
    column : str
        Name of the numeric column the fences are computed from.
    factor : float, default 1.5
        Multiplier applied to the IQR; larger values keep more rows.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` that fall within the fences, with their original
        index labels. Rows whose value is NaN fail both comparisons and are
        therefore dropped as well.
    """
    q1 = data[column].quantile(0.25)
    q3 = data[column].quantile(0.75)
    margin = factor * (q3 - q1)
    lower, upper = q1 - margin, q3 + margin
    within_fences = (data[column] >= lower) & (data[column] <= upper)
    return data[within_fences]

# Every column is checked against the same, unfiltered frame and the flagged
# rows are dropped once at the end. Filtering `df` inside the loop would
# recompute each column's quartiles on rows already trimmed by the previous
# columns, making the result depend on column order and tightening the fences
# with every pass.
# NOTE: matching rows by index label assumes the index is unique (true for the
# default index produced by read_csv).
keep_rows = pd.Series(True, index=df.index)
for col in num_cols:
    inliers = remove_outliers_iqr(df, col)
    keep_rows &= df.index.isin(inliers.index)
df = df[keep_rows]

print("\nDataset shape after outlier removal:", df.shape)

# 4. Data Transformation
# Each scaler works on its own copy so `df` keeps the cleaned, unscaled values.

# ---- 4.1 Min-Max Scaling ----
minmax_scaler = MinMaxScaler()
df_minmax = df.copy()
df_minmax[num_cols] = minmax_scaler.fit_transform(df_minmax[num_cols])

# ---- 4.2 Standard Scaling ----
standard_scaler = StandardScaler()
df_standard = df.copy()
df_standard[num_cols] = standard_scaler.fit_transform(df_standard[num_cols])




Initial Dataset Shape: (1000, 14)

Data Types:
 ID             int64
No_Pation      int64
Gender        object
AGE            int64
Urea         float64
Cr             int64
HbA1c        float64
Chol         float64
TG           float64
HDL          float64
LDL          float64
VLDL         float64
BMI          float64
CLASS         object
dtype: object

Missing values per column:
 ID           0
No_Pation    0
Gender       0
AGE          0
Urea         0
Cr           0
HbA1c        0
Chol         0
TG           0
HDL          0
LDL          0
VLDL         0
BMI          0
CLASS        0
dtype: int64

Categorical Columns: ['Gender', 'CLASS']

Dataset shape after outlier removal: (629, 14)
