In [1]:
# -*- coding: utf-8 -*-
"""Lab-1-ML-DataPreprocessing.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1H7zMwzkuIZJvJEZFGEUB7RlUVVOPzC5M
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import stats

def createdata(n_rows=20, seed=None):
  """Build a small synthetic customer DataFrame for the preprocessing demos.

  Args:
    n_rows: Number of rows to generate. Defaults to 20.
    seed: Optional integer seed. When given, values are drawn from a private
      ``np.random.RandomState(seed)`` so the frame is reproducible and the
      global NumPy random state is left untouched. When ``None`` (default)
      the global ``np.random`` state is used.

  Returns:
    pandas.DataFrame with the columns ``Age`` (integers in [18, 70)),
    ``Salary`` (integers in [30000, 120000)), ``Purchased`` (0 or 1),
    ``Gender`` ('Male'/'Female') and ``City`` (one of three city names).
  """
  # The np.random module and a RandomState instance expose the same
  # randint/choice API, so either can serve as the generator below.
  rng = np.random if seed is None else np.random.RandomState(seed)

  data = {
      'Age': rng.randint(18, 70, size=n_rows),
      'Salary': rng.randint(30000, 120000, size=n_rows),
      'Purchased': rng.choice([0, 1], size=n_rows),
      'Gender': rng.choice(['Male', 'Female'], size=n_rows),
      'City': rng.choice(['New York', 'San Francisco', 'Los Angeles'], size=n_rows)
  }

  return pd.DataFrame(data)

# Build the demo frame. Age and Salary are cast to float up front because the
# next step stores NaN in them, and NaN cannot be held by an integer column.
df = createdata().astype({'Age': 'float64', 'Salary': 'float64'})
print(df.head(10))

print(df.shape)

# Introduce some missing values for demonstration
df.loc[5, 'Age'] = np.nan
df.loc[10, 'Salary'] = np.nan
print(df.head(10))

# Basic information about the dataset.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
df.info()

# Summary statistics
print(df.describe())

# Find missing values: count the nulls in each column
missing_values = df.isnull().sum()

# Display only the columns that actually have missing values
print(missing_values[missing_values > 0])

# Fill the missing values with a statistic of the column
# (median for Age, mean for Salary).
# Step 1: create one SimpleImputer per strategy
imputer1 = SimpleImputer(strategy="median")
imputer2 = SimpleImputer(strategy="mean")

# Work on a real copy: a plain `df_copy = df` would only create a second name
# for the same object, so imputing would silently overwrite the raw frame.
df_copy = df.copy()

# Step 2: fit each imputer on its column.
# SimpleImputer expects 2D input, hence the double brackets (a one-column frame).
imputer1.fit(df_copy[["Age"]])
imputer2.fit(df_copy[["Salary"]])

# Step 3: transform (fill) the missing values in the "Age" and "Salary" columns.
# transform() returns an (n_rows, 1) array; ravel() flattens it to one column.
df_copy["Age"] = imputer1.transform(df_copy[["Age"]]).ravel()
df_copy["Salary"] = imputer2.transform(df_copy[["Salary"]]).ravel()

# Verify that there are no missing values left
print(df_copy["Age"].isnull().sum())
print(df_copy["Salary"].isnull().sum())

# Handling categorical attributes:
# ordinal encoding for the Gender column, one-hot encoding for the City column.

# Ordinal encoding with an explicit category order ("Male" first, then "Female")
ordinal_encoder = OrdinalEncoder(categories=[["Male", "Female"]])
gender_codes = ordinal_encoder.fit_transform(df_copy[["Gender"]])
df_copy["Gender_Encoded"] = gender_codes

# One-hot encoding of the "City" column
onehot_encoder = OneHotEncoder()
encoded_data = onehot_encoder.fit_transform(df[["City"]])

# The encoder returns a sparse matrix; densify it so it can back a DataFrame
encoded_array = encoded_data.toarray()

# Label the indicator columns with the encoder's generated feature names
city_feature_names = onehot_encoder.get_feature_names_out(["City"])
encoded_df = pd.DataFrame(encoded_array, columns=city_feature_names)

# Append the indicator columns and remove the two raw text columns
df_encoded = pd.concat([df_copy, encoded_df], axis=1)
df_encoded = df_encoded.drop(columns=["Gender", "City"])

print(df_encoded.head())

# Data transformation

# Min-max normalization of Salary (range 0-1)
# Pros: keeps all data between 0 and 1; ideal for distance-based models.
# Cons: can distort the data distribution, especially with extreme outliers.
normalizer = MinMaxScaler()
salary_normalized = normalizer.fit_transform(df_encoded[['Salary']])
df_encoded[['Salary']] = salary_normalized
df_encoded.head()

# Standardization of Age (mean=0, variance=1)
# Pros: works well for normally distributed data; suitable for many models.
# Cons: sensitive to outliers.
scaler = StandardScaler()
age_standardized = scaler.fit_transform(df_encoded[['Age']])
df_encoded[['Age']] = age_standardized
df_encoded.head()

# Removing outliers: three alternative treatments of the Salary column.
# Each treatment works on its own independent copy. Plain assignment
# (`a = df_encoded`) would make all three names point at the same frame, so
# every treatment would stack on top of the previous one.
df_encoded_copy1 = df_encoded.copy()
df_encoded_copy2 = df_encoded.copy()
df_encoded_copy3 = df_encoded.copy()

# 1) Outlier detection and treatment using the IQR
# Pros: simple and effective for mild outliers.
# Cons: may overly reduce variation if there are many extreme outliers.
Q1 = df_encoded_copy1['Salary'].quantile(0.25)
Q3 = df_encoded_copy1['Salary'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Cap values outside [lower_bound, upper_bound] at the nearest bound
df_encoded_copy1['Salary'] = df_encoded_copy1['Salary'].clip(lower=lower_bound, upper=upper_bound)

print(df_encoded_copy1.head())

# 2) Z-score method
# Pros: good for normally distributed data.
# Cons: not suitable for non-normal data; may miss outliers in skewed distributions.
df_encoded_copy2['Salary_zscore'] = stats.zscore(df_encoded_copy2['Salary'])
# Replace outliers (|z| > 3) with NaN
df_encoded_copy2['Salary'] = df_encoded_copy2['Salary'].mask(df_encoded_copy2['Salary_zscore'].abs() > 3)
print(df_encoded_copy2.head())

# 3) Median replacement for outliers
# Pros: keeps the distribution shape intact, useful when capping isn't feasible.
# Cons: may distort data if outliers represent real phenomena.
df_encoded_copy3['Salary_zscore'] = stats.zscore(df_encoded_copy3['Salary'])
median_salary = df_encoded_copy3['Salary'].median()
# Replace outliers (|z| > 3) with the column median
df_encoded_copy3['Salary'] = df_encoded_copy3['Salary'].mask(df_encoded_copy3['Salary_zscore'].abs() > 3, median_salary)
print(df_encoded_copy3.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Age        19 non-null     float64
 1   Salary     19 non-null     float64
 2   Purchased  20 non-null     int64  
 3   Gender     20 non-null     object 
 4   City       20 non-null     object 
dtypes: float64(2), int64(1), object(2)
memory usage: 932.0+ bytes
None
             Age         Salary  Purchased
count  19.000000      19.000000  20.000000
mean   46.052632   76251.789474   0.550000
std    16.510851   23696.042667   0.510418
min    19.000000   37746.000000   0.000000
25%    31.500000   58586.500000   0.000000
50%    54.000000   79537.000000   1.000000
75%    58.000000   96932.500000   1.000000
max    69.000000  110757.000000   1.000000
Age       1
Salary    1
dtype: int64
0
0
    Age    Salary  Purchased  Gender_Encoded  City_Los Angeles  City_New York  \
0  57.0  100880.0          1         

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import stats

def preprocess_data(file_path, scaling="standard"):
    """Load a CSV file and run the full cleaning/transformation pipeline on it.

    Steps, in order: impute missing values (median for numeric columns, most
    frequent value for the rest), encode non-numeric columns (one-hot when a
    column has at most 10 distinct values, ordinal otherwise), replace numeric
    outliers (|z-score| > 3) with the column median, and scale the numeric
    columns.

    Args:
        file_path: Path of the CSV file to read.
        scaling: How to scale the numeric columns: ``"standard"`` (zero mean,
            unit variance; the default), ``"minmax"`` (range 0-1) or ``None``
            (no scaling). Earlier versions applied min-max scaling and then
            standardization to the same columns; standardizing an affinely
            rescaled column gives the same values as standardizing the
            original, so the default reproduces that output.

    Returns:
        The processed DataFrame, or ``None`` when ``file_path`` does not exist
        (an error message is printed in that case).

    Raises:
        ValueError: If ``scaling`` is not one of the supported values.

    Side effects:
        Prints the per-column count of missing values found in the raw file.
    """
    if scaling not in ("standard", "minmax", None):
        raise ValueError(
            f"scaling must be 'standard', 'minmax' or None, got {scaling!r}"
        )

    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: '{file_path}' not found.")
        return None

    # 1. Data Cleaning
    # 1.1 Handling Missing Values (using SimpleImputer)
    numerical_cols = df.select_dtypes(include=np.number).columns
    categorical_cols = df.select_dtypes(exclude=np.number).columns
    print(df.isnull().sum())

    # SimpleImputer rejects input with zero columns, so each group is only
    # imputed when it is non-empty (e.g. an all-numeric dataset).
    if len(numerical_cols) > 0:
        imputer_num = SimpleImputer(strategy='median')  # Use median for numerical
        df[numerical_cols] = imputer_num.fit_transform(df[numerical_cols])

    if len(categorical_cols) > 0:
        imputer_cat = SimpleImputer(strategy='most_frequent')  # Use most frequent for categorical
        df[categorical_cols] = imputer_cat.fit_transform(df[categorical_cols])

    # 1.2 Handling Categorical Data
    for col in categorical_cols:
        if df[col].nunique() <= 10:
            # Few distinct values: one indicator column per value
            onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
            encoded_data = onehot_encoder.fit_transform(df[[col]])
            encoded_df = pd.DataFrame(encoded_data, columns=onehot_encoder.get_feature_names_out([col]))
            df = pd.concat([df, encoded_df], axis=1).drop(columns=col)
        else:
            # Many distinct values: a single integer-coded column instead
            ordinal_encoder = OrdinalEncoder()
            df[col] = ordinal_encoder.fit_transform(df[[col]])

    # 1.3 Handling Outliers (using Z-score method)
    for col in numerical_cols:
        # Kept in a local variable rather than a temporary "<col>_zscore"
        # column, which could overwrite a real column of that name.
        abs_z = np.abs(stats.zscore(df[col]))
        df[col] = np.where(abs_z > 3, df[col].median(), df[col])

    # 2. Data Transformations
    if scaling is not None and len(numerical_cols) > 0:
        scaler = StandardScaler() if scaling == "standard" else MinMaxScaler()
        df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

    return df


# Run the pipeline on the Diabetes dataset and preview the result
diabetes_df = preprocess_data('/content/Dataset of Diabetes .csv')

# preprocess_data returns None when the file is missing, so guard the preview
if diabetes_df is not None:
    print("Preprocessed Diabetes Data:", diabetes_df.head(), sep="\n")


# Run the pipeline on the Adult Income dataset and preview the result
adult_df = preprocess_data('/content/adult.csv')

if adult_df is not None:
    print("\nPreprocessed Adult Income Data:", adult_df.head(), sep="\n")


ID           0
No_Pation    0
Gender       0
AGE          0
Urea         0
Cr           0
HbA1c        0
Chol         0
TG           0
HDL          0
LDL          0
VLDL         0
BMI          0
CLASS        0
dtype: int64
Preprocessed Diabetes Data:
         ID  No_Pation       AGE      Urea        Cr     HbA1c      Chol  \
0  0.672140  -0.157106 -0.424738 -0.067182 -0.677551 -1.341068 -0.537604   
1  1.641852  -0.124010  0.156936 -0.175155 -0.050153 -1.341068 -0.978321   
2  0.330868  -0.095991 -0.424738 -0.067182 -0.677551 -1.341068 -0.537604   
3  1.412950  -0.015153 -0.424738 -0.067182 -0.677551 -1.341068 -0.537604   
4  0.680463  -0.124006 -2.402429  1.228490 -0.677551 -1.341068  0.079400   

         TG       HDL       LDL      VLDL       BMI  Gender_F  Gender_M  \
0 -1.185936  3.266253 -1.158715 -0.497302 -1.136091       1.0       0.0   
1 -0.744645 -0.119680 -0.465934 -0.435617 -1.341671       0.0       1.0   
2 -1.185936  3.266253 -1.158715 -0.497302 -1.136091       1.0      

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Function to handle missing values, categorical data, and outliers
def data_cleaning(df):
    """Impute missing values, drop outlier rows and one-hot encode a DataFrame.

    Args:
        df: Input DataFrame. It is not modified; all work happens on a copy.

    Returns:
        A new DataFrame in which
        * missing numeric values are replaced by the column median,
        * missing values in object columns are replaced by the most frequent
          value,
        * rows where any numeric column has |z-score| >= 3 are removed,
        * object columns are one-hot encoded with the first level dropped.
    """
    # Local import: the import block of this cell does not bring in scipy.
    from scipy import stats

    # Work on a copy so the caller's frame is never written to.
    df = df.copy()

    # Impute missing values with the median for numerical columns.
    # SimpleImputer rejects zero-column input, so skip empty groups.
    num_cols = df.select_dtypes(include=[np.number]).columns
    if len(num_cols) > 0:
        imputer = SimpleImputer(strategy='median')
        df[num_cols] = imputer.fit_transform(df[num_cols])

    # Impute missing values with the most frequent value for categorical columns
    cat_cols = df.select_dtypes(include=[object]).columns
    if len(cat_cols) > 0:
        imputer_cat = SimpleImputer(strategy='most_frequent')
        df[cat_cols] = imputer_cat.fit_transform(df[cat_cols])

    # Handling outliers using the Z-score method
    if len(num_cols) > 0:
        z_scores = np.abs(np.asarray(stats.zscore(df[num_cols])))
        # A constant column has zero spread, so its z-scores are NaN. NaN is
        # treated as "not an outlier"; otherwise the NaN comparison would be
        # False for every row and the whole frame would be dropped.
        keep_rows = ((z_scores < 3) | np.isnan(z_scores)).all(axis=1)
        df = df[keep_rows]

    # Handling categorical data using one-hot encoding
    df = pd.get_dummies(df, drop_first=True)

    return df

# Function to apply Min-Max Scaler and Standard Scaler
def data_transformations(df):
    """Return min-max scaled and standard scaled versions of ``df``.

    Every column of ``df`` is passed to the scalers. ``df`` itself is left
    unchanged; each result is a copy whose columns are overwritten with the
    scaled values.

    Returns:
        Tuple ``(df_scaled_minmax, df_scaled_standard)``.
    """
    def _scaled_copy(scaler):
        # Fit the scaler on the full frame and write the result into a copy
        scaled = df.copy()
        scaled[df.columns] = scaler.fit_transform(df)
        return scaled

    df_scaled_minmax = _scaled_copy(MinMaxScaler())
    df_scaled_standard = _scaled_copy(StandardScaler())

    return df_scaled_minmax, df_scaled_standard

# Load the Diabetes dataset
def load_diabetes_data(path='/content/Dataset of Diabetes .csv'):
    """Read the Diabetes CSV, clean it and produce both scaled variants.

    Args:
        path: Location of the CSV file. Defaults to the path this notebook
            previously hard-coded, so existing calls behave the same.

    Returns:
        Tuple ``(df_cleaned, df_minmax, df_standard)``: the output of
        ``data_cleaning`` followed by the two frames returned by
        ``data_transformations``.
    """
    df = pd.read_csv(path)
    df_cleaned = data_cleaning(df)
    df_minmax, df_standard = data_transformations(df_cleaned)
    return df_cleaned, df_minmax, df_standard

# Load the Adult Income dataset
def load_adult_income_data(path='/content/adult.csv'):
    """Read the Adult Income CSV, clean it and produce both scaled variants.

    Args:
        path: Location of the CSV file. Defaults to the path this notebook
            previously hard-coded, so existing calls behave the same.

    Returns:
        Tuple ``(df_cleaned, df_minmax, df_standard)``: the output of
        ``data_cleaning`` followed by the two frames returned by
        ``data_transformations``.
    """
    df = pd.read_csv(path)
    df_cleaned = data_cleaning(df)
    df_minmax, df_standard = data_transformations(df_cleaned)
    return df_cleaned, df_minmax, df_standard

# Example: clean and scale the Diabetes dataset, then preview the min-max version
(df_diabetes_cleaned,
 df_diabetes_minmax,
 df_diabetes_standard) = load_diabetes_data()
print(f"Diabetes Data after Cleaning and Transformation (Min-Max Scaled):\n{df_diabetes_minmax.head()}")

# Example: clean and scale the Adult Income dataset, then preview the min-max version
(df_adult_cleaned,
 df_adult_minmax,
 df_adult_standard) = load_adult_income_data()
print(f"Adult Income Data after Cleaning and Transformation (Min-Max Scaled):\n{df_adult_minmax.head()}")


Diabetes Data after Cleaning and Transformation (Min-Max Scaled):
         ID  No_Pation       AGE      Urea        Cr     HbA1c      Chol  \
0  0.626566   0.002032  0.431373  0.328125  0.203046  0.283688  0.405405   
2  0.523810   0.005447  0.431373  0.328125  0.203046  0.283688  0.405405   
3  0.849624   0.009963  0.431373  0.328125  0.203046  0.283688  0.405405   
4  0.629073   0.003881  0.098039  0.515625  0.203046  0.283688  0.500000   
5  0.791980   0.003881  0.333333  0.140625  0.091371  0.219858  0.229730   

         TG       HDL       LDL      VLDL       BMI  Gender_M  Gender_f  \
0  0.100000  0.758621  0.207547  0.031746  0.238095       0.0       0.0   
2  0.100000  0.758621  0.207547  0.031746  0.238095       0.0       0.0   
3  0.100000  0.758621  0.207547  0.031746  0.238095       0.0       0.0   
4  0.116667  0.206897  0.320755  0.023810  0.095238       1.0       0.0   
5  0.116667  0.275862  0.226415  0.023810  0.095238       0.0       0.0   

   CLASS_N   CLASS_P  CLAS