In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import stats

In [None]:
# Read the two raw CSV files (paths are relative to the working directory)
# into their own DataFrames.
adult_df, diabetes_df = map(pd.read_csv, ("adult.csv", "DoD.csv"))

In [None]:
# Show the leading rows of each dataset as plain text.
for frame in (adult_df, diabetes_df):
    print(frame.head())

   age  workclass  fnlwgt     education  educational-num      marital-status  \
0   25    Private  226802          11th                7       Never-married   
1   38    Private   89814       HS-grad                9  Married-civ-spouse   
2   28  Local-gov  336951    Assoc-acdm               12  Married-civ-spouse   
3   44    Private  160323  Some-college               10  Married-civ-spouse   
4   18          ?  103497  Some-college               10       Never-married   

          occupation relationship   race  gender  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black    Male             0             0   
1    Farming-fishing      Husband  White    Male             0             0   
2    Protective-serv      Husband  White    Male             0             0   
3  Machine-op-inspct      Husband  Black    Male          7688             0   
4                  ?    Own-child  White  Female             0             0   

   hours-per-week native-country incom

In [None]:
# Treat a literal "?" cell as a missing value: swap it for NaN, in place,
# in both frames so that isnull()/fillna() can see it.
for raw_frame in (adult_df, diabetes_df):
    raw_frame.replace("?", np.nan, inplace=True)

In [None]:
# Report how many missing values each column holds, one dataset after the other.
for checked_frame in (adult_df, diabetes_df):
    print(checked_frame.isnull().sum())

age                   0
workclass          2799
fnlwgt                0
education             0
educational-num       0
marital-status        0
occupation         2809
relationship          0
race                  0
gender                0
capital-gain          0
capital-loss          0
hours-per-week        0
native-country      857
income                0
dtype: int64
ID           0
No_Pation    0
Gender       0
AGE          0
Urea         0
Cr           0
HbA1c        0
Chol         0
TG           0
HDL          0
LDL          0
VLDL         0
BMI          0
CLASS        0
dtype: int64


In [None]:
# Fill the gaps in the three categorical columns with each column's most
# frequent value (mode()[0] is the first mode when there are ties).
mode_by_column = {
    name: adult_df[name].mode()[0]
    for name in ("workclass", "occupation", "native-country")
}
adult_df.fillna(mode_by_column, inplace=True)

# Re-check: the per-column null counts printed here confirm the result.
print(adult_df.isnull().sum())

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64


In [None]:
print(adult_df.dtypes)
print(diabetes_df.dtypes)

# Record the numeric columns (integer AND float) of each frame for scaling.
# Selecting on the numeric dtype family, rather than "anything that is not
# object", keeps boolean, datetime, categorical or string-dtype columns from
# being handed to a scaler by accident.
# NOTE(review): diabetes_num_cols includes ID and No_Pation, which look like
# record identifiers rather than measurements — confirm they should be scaled.
adult_num_cols = adult_df.select_dtypes(include="number").columns.tolist()
diabetes_num_cols = diabetes_df.select_dtypes(include="number").columns.tolist()
print(adult_num_cols)
print(diabetes_num_cols)

age                 int64
workclass          object
fnlwgt              int64
education          object
educational-num     int64
marital-status     object
occupation         object
relationship       object
race               object
gender             object
capital-gain        int64
capital-loss        int64
hours-per-week      int64
native-country     object
income             object
dtype: object
ID             int64
No_Pation      int64
Gender        object
AGE            int64
Urea         float64
Cr             int64
HbA1c        float64
Chol         float64
TG           float64
HDL          float64
LDL          float64
VLDL         float64
BMI          float64
CLASS         object
dtype: object
['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']
['ID', 'No_Pation', 'AGE', 'Urea', 'Cr', 'HbA1c', 'Chol', 'TG', 'HDL', 'LDL', 'VLDL', 'BMI']


In [None]:
# Transformation: encode every text (object-dtype) column of adult_df.
#   * gender, relationship, income, race -> a single numeric code per category,
#     where 0 is the most frequent category, 1 the next most frequent, etc.
#   * every other text column            -> one-hot indicator columns, which
#     replace the source column and are appended on the right of the frame.
ordinal_cols = ["gender", "relationship", "income", "race"]
one_hot_cols = []
one_hot_frames = []

for col in adult_df.select_dtypes(include="object").columns:
    if col in ordinal_cols:
        # value_counts() is sorted by descending frequency, which fixes the
        # category -> code mapping used by the encoder.
        encoder = OrdinalEncoder(categories=[list(adult_df[col].value_counts().keys())])
        adult_df[col] = encoder.fit_transform(adult_df[[col]])
    else:
        encoder = OneHotEncoder()
        data = encoder.fit_transform(adult_df[[col]]).toarray()
        one_hot_cols.append(col)
        one_hot_frames.append(
            pd.DataFrame(
                data,
                columns=encoder.get_feature_names_out([col]),
                # Reuse adult_df's own index: concat(axis=1) aligns on index
                # labels, so a fresh 0..n-1 index would misalign the rows (and
                # introduce NaNs) whenever adult_df's index is not the default.
                index=adult_df.index,
            )
        )

# Swap all one-hot source columns for their indicator columns with a single
# concat instead of rebuilding the whole frame once per encoded column.
adult_df = pd.concat([adult_df.drop(columns=one_hot_cols), *one_hot_frames], axis=1)
print(adult_df.head())





   age  fnlwgt  educational-num  relationship  race  gender  capital-gain  \
0   25  226802                7           2.0   1.0     0.0             0   
1   38   89814                9           0.0   0.0     0.0             0   
2   28  336951               12           0.0   0.0     0.0             0   
3   44  160323               10           0.0   1.0     0.0          7688   
4   18  103497               10           2.0   0.0     1.0             0   

   capital-loss  hours-per-week  income  ...  native-country_Portugal  \
0             0              40     0.0  ...                      0.0   
1             0              50     0.0  ...                      0.0   
2             0              40     1.0  ...                      0.0   
3             0              40     1.0  ...                      0.0   
4             0              30     0.0  ...                      0.0   

   native-country_Puerto-Rico  native-country_Scotland  native-country_South  \
0                 

In [None]:
# Replace each text (object-dtype) column of diabetes_df with numeric codes:
# the most frequent category becomes 0, the next 1, and so on.
text_columns = [name for name in diabetes_df.columns if diabetes_df[name].dtype == "object"]
for name in text_columns:
    # value_counts() lists categories from most to least frequent.
    frequency_order = list(diabetes_df[name].value_counts().index)
    encoder = OrdinalEncoder(categories=[frequency_order])
    diabetes_df[name] = encoder.fit_transform(diabetes_df[[name]])
print(diabetes_df.head())

    ID  No_Pation  Gender  AGE  Urea  Cr  HbA1c  Chol   TG  HDL  LDL  VLDL  \
0  502      17975     1.0   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
1  735      34221     0.0   26   4.5  62    4.9   3.7  1.4  1.1  2.1   0.6   
2  420      47975     1.0   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
3  680      87656     1.0   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5   
4  504      34223     0.0   33   7.1  46    4.9   4.9  1.0  0.8  2.0   0.4   

    BMI  CLASS  
0  24.0    1.0  
1  23.0    1.0  
2  24.0    1.0  
3  24.0    1.0  
4  21.0    1.0  


In [None]:
# Print the column dtypes of both frames again, now that the text columns
# have been encoded.
for encoded_frame in (adult_df, diabetes_df):
    print(encoded_frame.dtypes)

age                                 int64
fnlwgt                              int64
educational-num                     int64
relationship                      float64
race                              float64
                                   ...   
native-country_Thailand           float64
native-country_Trinadad&Tobago    float64
native-country_United-States      float64
native-country_Vietnam            float64
native-country_Yugoslavia         float64
Length: 96, dtype: object
ID             int64
No_Pation      int64
Gender       float64
AGE            int64
Urea         float64
Cr             int64
HbA1c        float64
Chol         float64
TG           float64
HDL          float64
LDL          float64
VLDL         float64
BMI          float64
CLASS        float64
dtype: object


In [None]:
# Rescale the numeric columns listed in adult_num_cols / diabetes_num_cols:
# min-max scaling for the adult data, standardisation for the diabetes data.
#
# Both scalers operate independently on each column of a 2-D input, so one
# fit per dataset gives the same values as fitting a fresh scaler per column.
# Each fitted scaler is kept under its own name so the identical transform can
# later be applied to new rows or undone with inverse_transform().
#
# NOTE(review): the scalers are fitted on the complete frames. If a train/test
# split happens later, consider fitting on the training rows only so that
# test-set statistics do not leak into the features.
adult_scaler = MinMaxScaler()
adult_df[adult_num_cols] = adult_scaler.fit_transform(adult_df[adult_num_cols])
print(adult_df.head())

diabetes_scaler = StandardScaler()
diabetes_df[diabetes_num_cols] = diabetes_scaler.fit_transform(diabetes_df[diabetes_num_cols])
print(diabetes_df.head())

        age    fnlwgt  educational-num  relationship  race  gender  \
0  0.109589  0.145129         0.400000           2.0   1.0     0.0   
1  0.287671  0.052451         0.533333           0.0   0.0     0.0   
2  0.150685  0.219649         0.733333           0.0   0.0     0.0   
3  0.369863  0.100153         0.600000           0.0   1.0     0.0   
4  0.013699  0.061708         0.600000           2.0   0.0     1.0   

   capital-gain  capital-loss  hours-per-week  income  ...  \
0      0.000000           0.0        0.397959     0.0  ...   
1      0.000000           0.0        0.500000     0.0  ...   
2      0.000000           0.0        0.397959     1.0  ...   
3      0.076881           0.0        0.397959     1.0  ...   
4      0.000000           0.0        0.295918     0.0  ...   

   native-country_Portugal  native-country_Puerto-Rico  \
0                      0.0                         0.0   
1                      0.0                         0.0   
2                      0.0      