In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder

# Load both raw datasets; the paths are relative to the current working directory.
adult_df = pd.read_csv("adult.csv")
diabetes_df = pd.read_csv("diabetes.csv")

# adult_df: the literal '?' is converted to NaN so that isnull() can count it.
adult_df.replace(to_replace='?', value=np.nan, inplace=True)
adult_missing_counts = adult_df.isnull().sum()
print("\n--- Missing values in adult_df after '?' replacement ---")
print(adult_missing_counts)

# diabetes_df: in the columns listed below a 0 is treated as "not recorded"
# and converted to NaN as well.
cols_with_zero_as_nan = ['Urea', 'Cr', 'HbA1c', 'Chol', 'TG', 'HDL', 'LDL', 'VLDL', 'BMI']
zero_masked_values = diabetes_df[cols_with_zero_as_nan].replace(to_replace=0, value=np.nan)
diabetes_df[cols_with_zero_as_nan] = zero_masked_values
diabetes_missing_counts = diabetes_df.isnull().sum()
print("\n--- Missing values in diabetes_df after '0' replacement ---")
print(diabetes_missing_counts)


--- Missing values in adult_df after '?' replacement ---
age                   0
workclass          2799
fnlwgt                0
education             0
educational-num       0
marital-status        0
occupation         2809
relationship          0
race                  0
gender                0
capital-gain          0
capital-loss          0
hours-per-week        0
native-country      857
income                0
dtype: int64

--- Missing values in diabetes_df after '0' replacement ---
ID           0
No_Pation    0
Gender       0
AGE          0
Urea         0
Cr           0
HbA1c        0
Chol         1
TG           0
HDL          0
LDL          0
VLDL         0
BMI          0
CLASS        0
dtype: int64


In [None]:
# --- adult_df: fill gaps in these columns with each column's most frequent value ---
columns_to_impute_adult = ['workclass', 'occupation', 'native-country']
for adult_col in columns_to_impute_adult:
    if not adult_df[adult_col].isnull().any():
        continue  # nothing missing in this column
    most_frequent = adult_df[adult_col].mode()[0]
    adult_df[adult_col] = adult_df[adult_col].fillna(most_frequent)
    print(f"Mode imputed '{adult_col}' in adult_df with: {most_frequent}")

print("\n--- Missing values in adult_df after imputation ---")
print(adult_df.isnull().sum())

# --- diabetes_df: these measurement columns are filled with the column mean ---
cols_to_impute_diabetes_mean = ['Urea', 'Cr', 'HbA1c', 'TG', 'HDL', 'LDL', 'VLDL', 'BMI']
for lab_col in cols_to_impute_diabetes_mean:
    if not diabetes_df[lab_col].isnull().any():
        continue  # nothing missing in this column
    column_mean = diabetes_df[lab_col].mean()
    diabetes_df[lab_col] = diabetes_df[lab_col].fillna(column_mean)
    print(f"Mean imputed '{lab_col}' in diabetes_df with: {column_mean:.2f}")

# 'Chol' is handled separately: it is filled with its mode rather than its mean.
chol_has_gaps = diabetes_df['Chol'].isnull().any()
if chol_has_gaps:
    chol_mode = diabetes_df['Chol'].mode()[0]
    diabetes_df['Chol'] = diabetes_df['Chol'].fillna(chol_mode)
    print(f"Mode imputed 'Chol' in diabetes_df with: {chol_mode}")

print("\n--- Missing values in diabetes_df after imputation ---")
print(diabetes_df.isnull().sum())


--- Missing values in adult_df after imputation ---
age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

--- Missing values in diabetes_df after imputation ---
ID           0
No_Pation    0
Gender       0
AGE          0
Urea         0
Cr           0
HbA1c        0
Chol         0
TG           0
HDL          0
LDL          0
VLDL         0
BMI          0
CLASS        0
dtype: int64


In [None]:

# List the object-dtype (string) columns of each frame for reference.
categorical_cols_adult = adult_df.select_dtypes(include='object').columns.tolist()
print("\n--- Categorical columns in adult_df ---")
print(categorical_cols_adult)

categorical_cols_diabetes = diabetes_df.select_dtypes(include='object').columns.tolist()
print("\n--- Categorical columns in diabetes_df ---")
print(categorical_cols_diabetes)

# Normalise the Gender labels before encoding: the raw column mixes 'F' and
# 'f', which would otherwise be encoded as two separate categories
# (Gender_F and Gender_f). strip() additionally guards against stray
# surrounding whitespace. diabetes_df itself is left unmodified.
gender_normalized = diabetes_df[['Gender']].apply(
    lambda labels: labels.str.strip().str.upper()
)

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

gender_encoded_features = encoder.fit_transform(gender_normalized)

# Build the encoded frame on diabetes_df's own index so that the column-wise
# concat below aligns row-for-row even if that index is not the default one.
gender_df_encoded = pd.DataFrame(
    gender_encoded_features,
    columns=encoder.get_feature_names_out(['Gender']),
    index=diabetes_df.index,
)
diabetes_df_encoded = pd.concat([diabetes_df.drop('Gender', axis=1), gender_df_encoded], axis=1)

print("\n--- diabetes_df after One-Hot Encoding 'Gender' (first 5 rows) ---")
print(diabetes_df_encoded.head())


--- Categorical columns in adult_df ---
['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']

--- Categorical columns in diabetes_df ---
['Gender', 'CLASS']

--- diabetes_df after One-Hot Encoding 'Gender' (first 5 rows) ---
    ID  No_Pation  AGE  Urea  Cr  HbA1c  Chol   TG  HDL  LDL  VLDL   BMI  \
0  502      17975   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5  24.0   
1  735      34221   26   4.5  62    4.9   3.7  1.4  1.1  2.1   0.6  23.0   
2  420      47975   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5  24.0   
3  680      87656   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5  24.0   
4  504      34223   33   7.1  46    4.9   4.9  1.0  0.8  2.0   0.4  21.0   

  CLASS  Gender_F  Gender_M  Gender_f  
0     N       1.0       0.0       0.0  
1     N       0.0       1.0       0.0  
2     N       1.0       0.0       0.0  
3     N       1.0       0.0       0.0  
4     N       0.0       1.0       0.0  


In [None]:

# Column to rescale, plus the labels of the two derived columns added below.
selected_numerical_column = 'AGE'
minmax_label = f'{selected_numerical_column}_MinMaxScaled'
standardized_label = f'{selected_numerical_column}_Standardized'

# Scalers expect 2-D input, hence the single-column DataFrame selection.
age_values = diabetes_df_encoded[[selected_numerical_column]]

# Min-max scaling.
scaler_minmax = MinMaxScaler()
diabetes_df_encoded[minmax_label] = scaler_minmax.fit_transform(age_values)

print(f"\n--- Original '{selected_numerical_column}' vs. Min-Max Scaled (first 5 rows) ---")
print(diabetes_df_encoded[[selected_numerical_column, minmax_label]].head())

# Standardization.
scaler_standard = StandardScaler()
diabetes_df_encoded[standardized_label] = scaler_standard.fit_transform(age_values)

print(f"\n--- Original '{selected_numerical_column}' vs. Standardized (first 5 rows) ---")
print(diabetes_df_encoded[[selected_numerical_column, standardized_label]].head())

print("\n--- All requested data preprocessing steps completed. ---")


--- Original 'AGE' vs. Min-Max Scaled (first 5 rows) ---
   AGE  AGE_MinMaxScaled
0   50          0.508475
1   26          0.101695
2   50          0.508475
3   50          0.508475
4   33          0.220339

--- Original 'AGE' vs. Standardized (first 5 rows) ---
   AGE  AGE_Standardized
0   50         -0.401144
1   26         -3.130017
2   50         -0.401144
3   50         -0.401144
4   33         -2.334096

--- All requested data preprocessing steps completed. ---


In [None]:
# Show the column labels of diabetes_df.
diabetes_column_index = diabetes_df.columns
print(diabetes_column_index)

Index(['ID', 'No_Pation', 'Gender', 'AGE', 'Urea', 'Cr', 'HbA1c', 'Chol', 'TG',
       'HDL', 'LDL', 'VLDL', 'BMI', 'CLASS'],
      dtype='object')


In [None]:
# Treat 0 in these diabetes_df columns as missing and fill the gaps with
# each column's mean.
cols_with_zero = ['Urea', 'Cr', 'HbA1c', 'Chol', 'TG', 'HDL', 'LDL', 'VLDL', 'BMI']

# Use np.nan as the replacement value: it keeps the columns numeric so the
# .mean() below is well-defined. A None replacement is handled differently
# across pandas versions and can leave object-dtype columns behind.
diabetes_df[cols_with_zero] = diabetes_df[cols_with_zero].replace(0, np.nan)

# Null counts before filling. This reports on diabetes_df, the frame this
# cell operates on, so the cell runs on a fresh kernel.
print(diabetes_df.isnull().sum())

diabetes_df[cols_with_zero] = diabetes_df[cols_with_zero].fillna(
    diabetes_df[cols_with_zero].mean()
)

# Null counts after filling.
print(diabetes_df.isnull().sum())

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64
ID           0
No_Pation    0
Gender       0
AGE          0
Urea         0
Cr           0
HbA1c        0
Chol         0
TG           0
HDL          0
LDL          0
VLDL         0
BMI          0
CLASS        0
dtype: int64
