# Preprocessing Techniques

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder,OneHotEncoder

In [2]:
df = pd.read_csv('medical_students_dataset.csv')

In [3]:
df

Unnamed: 0,Student ID,Age,Gender,Height,Weight,Blood Type,BMI,Temperature,Heart Rate,Blood Pressure,Cholesterol,Diabetes,Smoking
0,1.0,18.0,Female,161.777924,72.354947,O,27.645835,,95.0,109.0,203.0,No,
1,2.0,,Male,152.069157,47.630941,B,,98.714977,93.0,104.0,163.0,No,No
2,3.0,32.0,Female,182.537664,55.741083,A,16.729017,98.260293,76.0,130.0,216.0,Yes,No
3,,30.0,Male,182.112867,63.332207,B,19.096042,98.839605,99.0,112.0,141.0,No,Yes
4,5.0,23.0,Female,,46.234173,O,,98.480008,95.0,,231.0,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,,24.0,Male,176.503260,95.756997,B,30.737254,99.170685,65.0,121.0,130.0,No,No
199996,99997.0,29.0,Female,163.917675,45.225194,,16.831734,97.865785,62.0,125.0,198.0,No,Yes
199997,99998.0,34.0,Female,,99.648914,,33.189303,98.768210,60.0,90.0,154.0,,No
199998,99999.0,30.0,Female,156.446944,50.142824,A,20.486823,98.994212,61.0,106.0,225.0,No,No


In [4]:
df.columns

Index(['Student ID', 'Age', 'Gender', 'Height', 'Weight', 'Blood Type', 'BMI',
       'Temperature', 'Heart Rate', 'Blood Pressure', 'Cholesterol',
       'Diabetes', 'Smoking'],
      dtype='object')

df.info()

In [5]:
df.describe()

Unnamed: 0,Student ID,Age,Height,Weight,BMI,Temperature,Heart Rate,Blood Pressure,Cholesterol
count,180000.0,180000.0,180000.0,180000.0,180000.0,180000.0,180000.0,180000.0,180000.0
mean,49974.042078,26.021561,174.947103,69.971585,23.338869,98.600948,79.503767,114.558033,184.486361
std,28879.641657,4.890528,14.44756,17.322574,7.033554,0.50053,11.540755,14.403353,37.559678
min,1.0,18.0,150.000041,40.000578,10.074837,96.397835,60.0,90.0,120.0
25%,24971.75,22.0,162.47611,54.969838,17.858396,98.26475,70.0,102.0,152.0
50%,49943.5,26.0,174.899914,69.979384,22.671401,98.599654,80.0,115.0,184.0
75%,74986.0,30.0,187.464417,84.980097,27.997487,98.940543,90.0,127.0,217.0
max,100000.0,34.0,199.998639,99.999907,44.355113,100.824857,99.0,139.0,249.0


# Handling Duplicates

In [6]:
df.duplicated().sum()

np.int64(7644)

In [7]:
df.drop_duplicates(inplace=True)

In [8]:
df.duplicated().sum()

np.int64(0)

# Filling Missing Values

In [9]:
df.isna().sum()

Student ID        19891
Age               19892
Gender            19924
Height            19907
Weight            19901
Blood Type        19916
BMI               19900
Temperature       19911
Heart Rate        19903
Blood Pressure    19892
Cholesterol       19893
Diabetes          19905
Smoking           19897
dtype: int64

In [10]:
df['Student ID'] = df['Student ID'].interpolate(method='linear')
df['Student ID']

0              1.0
1              2.0
2              3.0
3              4.0
4              5.0
            ...   
199995     99996.0
199996     99997.0
199997     99998.0
199998     99999.0
199999    100000.0
Name: Student ID, Length: 192356, dtype: float64

In [11]:
print(df['Age'].describe())

count    172464.000000
mean         26.022254
std           4.890591
min          18.000000
25%          22.000000
50%          26.000000
75%          30.000000
max          34.000000
Name: Age, dtype: float64


In [12]:
df['Age'] = df['Age'].fillna(np.random.randint(18,35))
df['Age']

0         18.0
1         27.0
2         32.0
3         30.0
4         23.0
          ... 
199995    24.0
199996    29.0
199997    34.0
199998    30.0
199999    20.0
Name: Age, Length: 192356, dtype: float64

In [13]:
df['Gender'] = df['Gender'].ffill()
df['Gender'] 

0         Female
1           Male
2         Female
3           Male
4         Female
           ...  
199995      Male
199996    Female
199997    Female
199998    Female
199999    Female
Name: Gender, Length: 192356, dtype: object

In [14]:
simple_imputer = SimpleImputer(strategy='mean')
df['Height'] = simple_imputer.fit_transform(df[['Height']])
df['Height']

0         161.777924
1         152.069157
2         182.537664
3         182.112867
4         174.958363
             ...    
199995    176.503260
199996    163.917675
199997    174.958363
199998    156.446944
199999    153.927409
Name: Height, Length: 192356, dtype: float64

In [15]:
simple_imputer = SimpleImputer()
df['Weight'] = simple_imputer.fit_transform(df[['Weight']])
df['Weight']

0         72.354947
1         47.630941
2         55.741083
3         63.332207
4         46.234173
            ...    
199995    95.756997
199996    45.225194
199997    99.648914
199998    50.142824
199999    99.928405
Name: Weight, Length: 192356, dtype: float64

In [16]:
df['Blood Type'] = df['Blood Type'].bfill()
df['Blood Type']

0         O
1         B
2         A
3         B
4         O
         ..
199995    B
199996    A
199997    A
199998    A
199999    O
Name: Blood Type, Length: 192356, dtype: object

In [17]:
df['BMI'] = df['BMI'].fillna(df['BMI'].mean())
df['BMI']

0         27.645835
1         23.338982
2         16.729017
3         19.096042
4         23.338982
            ...    
199995    30.737254
199996    16.831734
199997    33.189303
199998    20.486823
199999    42.175189
Name: BMI, Length: 192356, dtype: float64

In [18]:
df['Temperature'] = df['Temperature'].fillna(df['Temperature'].mean())
df['Temperature']

0         98.601117
1         98.714977
2         98.260293
3         98.839605
4         98.480008
            ...    
199995    99.170685
199996    97.865785
199997    98.768210
199998    98.994212
199999    98.595817
Name: Temperature, Length: 192356, dtype: float64

In [19]:
df['Heart Rate'] = df['Heart Rate'].fillna(df['Heart Rate'].median())
df['Heart Rate']

0         95.0
1         93.0
2         76.0
3         99.0
4         95.0
          ... 
199995    65.0
199996    62.0
199997    60.0
199998    61.0
199999    95.0
Name: Heart Rate, Length: 192356, dtype: float64

In [20]:
df['Blood Pressure'] = df['Blood Pressure'].fillna(df['Blood Pressure'].median())
df['Blood Pressure']

0         109.0
1         104.0
2         130.0
3         112.0
4         115.0
          ...  
199995    121.0
199996    125.0
199997     90.0
199998    106.0
199999    133.0
Name: Blood Pressure, Length: 192356, dtype: float64

In [21]:
simple_imputer = SimpleImputer(strategy='median')
df['Cholesterol'] = simple_imputer.fit_transform(df[['Cholesterol']])
df['Cholesterol']

0         203.0
1         163.0
2         216.0
3         141.0
4         231.0
          ...  
199995    130.0
199996    198.0
199997    154.0
199998    225.0
199999    132.0
Name: Cholesterol, Length: 192356, dtype: float64

In [22]:
df['Diabetes'] = df['Diabetes'].ffill()
df['Diabetes']

0          No
1          No
2         Yes
3          No
4          No
         ... 
199995     No
199996     No
199997     No
199998     No
199999     No
Name: Diabetes, Length: 192356, dtype: object

In [23]:
df['Smoking'] = df['Smoking'].bfill()
df['Smoking']

0          No
1          No
2          No
3         Yes
4          No
         ... 
199995     No
199996    Yes
199997     No
199998     No
199999     No
Name: Smoking, Length: 192356, dtype: object

In [24]:
df

Unnamed: 0,Student ID,Age,Gender,Height,Weight,Blood Type,BMI,Temperature,Heart Rate,Blood Pressure,Cholesterol,Diabetes,Smoking
0,1.0,18.0,Female,161.777924,72.354947,O,27.645835,98.601117,95.0,109.0,203.0,No,No
1,2.0,27.0,Male,152.069157,47.630941,B,23.338982,98.714977,93.0,104.0,163.0,No,No
2,3.0,32.0,Female,182.537664,55.741083,A,16.729017,98.260293,76.0,130.0,216.0,Yes,No
3,4.0,30.0,Male,182.112867,63.332207,B,19.096042,98.839605,99.0,112.0,141.0,No,Yes
4,5.0,23.0,Female,174.958363,46.234173,O,23.338982,98.480008,95.0,115.0,231.0,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,99996.0,24.0,Male,176.503260,95.756997,B,30.737254,99.170685,65.0,121.0,130.0,No,No
199996,99997.0,29.0,Female,163.917675,45.225194,A,16.831734,97.865785,62.0,125.0,198.0,No,Yes
199997,99998.0,34.0,Female,174.958363,99.648914,A,33.189303,98.768210,60.0,90.0,154.0,No,No
199998,99999.0,30.0,Female,156.446944,50.142824,A,20.486823,98.994212,61.0,106.0,225.0,No,No


# Feature Engineering

In [25]:
label_encoder = LabelEncoder()
ordinal_encoder = OrdinalEncoder()
one_hot_encoder = OneHotEncoder()

In [26]:
df['Gender'] = label_encoder.fit_transform(df['Gender'])
df['Gender']

0         0
1         1
2         0
3         1
4         0
         ..
199995    1
199996    0
199997    0
199998    0
199999    0
Name: Gender, Length: 192356, dtype: int64

In [27]:
encoded_df = pd.get_dummies(df['Blood Type'],prefix='Blood Type')
encoded_df = encoded_df.astype(int)
df = pd.concat([df,encoded_df],axis=1)
df.drop(['Blood Type'],axis=1,inplace=True)
df

Unnamed: 0,Student ID,Age,Gender,Height,Weight,BMI,Temperature,Heart Rate,Blood Pressure,Cholesterol,Diabetes,Smoking,Blood Type_A,Blood Type_AB,Blood Type_B,Blood Type_O
0,1.0,18.0,0,161.777924,72.354947,27.645835,98.601117,95.0,109.0,203.0,No,No,0,0,0,1
1,2.0,27.0,1,152.069157,47.630941,23.338982,98.714977,93.0,104.0,163.0,No,No,0,0,1,0
2,3.0,32.0,0,182.537664,55.741083,16.729017,98.260293,76.0,130.0,216.0,Yes,No,1,0,0,0
3,4.0,30.0,1,182.112867,63.332207,19.096042,98.839605,99.0,112.0,141.0,No,Yes,0,0,1,0
4,5.0,23.0,0,174.958363,46.234173,23.338982,98.480008,95.0,115.0,231.0,No,No,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,99996.0,24.0,1,176.503260,95.756997,30.737254,99.170685,65.0,121.0,130.0,No,No,0,0,1,0
199996,99997.0,29.0,0,163.917675,45.225194,16.831734,97.865785,62.0,125.0,198.0,No,Yes,1,0,0,0
199997,99998.0,34.0,0,174.958363,99.648914,33.189303,98.768210,60.0,90.0,154.0,No,No,1,0,0,0
199998,99999.0,30.0,0,156.446944,50.142824,20.486823,98.994212,61.0,106.0,225.0,No,No,1,0,0,0


In [28]:
df['Diabetes'] = ordinal_encoder.fit_transform(df[['Diabetes']])
df['Diabetes']

0         0.0
1         0.0
2         1.0
3         0.0
4         0.0
         ... 
199995    0.0
199996    0.0
199997    0.0
199998    0.0
199999    0.0
Name: Diabetes, Length: 192356, dtype: float64

In [29]:
df['Smoking'] = label_encoder.fit_transform(df['Smoking'])
df['Smoking']

0         0
1         0
2         0
3         1
4         0
         ..
199995    0
199996    1
199997    0
199998    0
199999    0
Name: Smoking, Length: 192356, dtype: int64

In [30]:
df

Unnamed: 0,Student ID,Age,Gender,Height,Weight,BMI,Temperature,Heart Rate,Blood Pressure,Cholesterol,Diabetes,Smoking,Blood Type_A,Blood Type_AB,Blood Type_B,Blood Type_O
0,1.0,18.0,0,161.777924,72.354947,27.645835,98.601117,95.0,109.0,203.0,0.0,0,0,0,0,1
1,2.0,27.0,1,152.069157,47.630941,23.338982,98.714977,93.0,104.0,163.0,0.0,0,0,0,1,0
2,3.0,32.0,0,182.537664,55.741083,16.729017,98.260293,76.0,130.0,216.0,1.0,0,1,0,0,0
3,4.0,30.0,1,182.112867,63.332207,19.096042,98.839605,99.0,112.0,141.0,0.0,1,0,0,1,0
4,5.0,23.0,0,174.958363,46.234173,23.338982,98.480008,95.0,115.0,231.0,0.0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,99996.0,24.0,1,176.503260,95.756997,30.737254,99.170685,65.0,121.0,130.0,0.0,0,0,0,1,0
199996,99997.0,29.0,0,163.917675,45.225194,16.831734,97.865785,62.0,125.0,198.0,0.0,1,1,0,0,0
199997,99998.0,34.0,0,174.958363,99.648914,33.189303,98.768210,60.0,90.0,154.0,0.0,0,1,0,0,0
199998,99999.0,30.0,0,156.446944,50.142824,20.486823,98.994212,61.0,106.0,225.0,0.0,0,1,0,0,0


In [31]:
df.describe()

Unnamed: 0,Student ID,Age,Gender,Height,Weight,BMI,Temperature,Heart Rate,Blood Pressure,Cholesterol,Diabetes,Smoking,Blood Type_A,Blood Type_AB,Blood Type_B,Blood Type_O
count,192356.0,192356.0,192356.0,192356.0,192356.0,192356.0,192356.0,192356.0,192356.0,192356.0,192356.0,192356.0,192356.0,192356.0,192356.0,192356.0
mean,50007.743366,26.123365,0.499802,174.958363,69.980827,23.338982,98.601117,79.559639,114.60977,184.448081,0.099898,0.200883,0.247042,0.247255,0.252605,0.253098
std,28872.792531,4.640377,0.500001,13.683044,16.403026,6.659926,0.47362,10.930664,13.639829,35.55666,0.299865,0.400662,0.431292,0.431417,0.434507,0.434788
min,1.0,18.0,0.0,150.000041,40.000578,10.074837,96.397835,60.0,90.0,120.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,25000.75,22.0,0.0,163.92816,56.696915,18.404855,98.30841,71.0,104.0,156.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,50004.5,27.0,0.0,174.958363,69.980827,23.338982,98.601117,80.0,115.0,184.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,75024.0,30.0,1.0,186.032826,83.262503,27.231679,98.895268,88.0,126.0,213.0,0.0,0.0,0.0,0.0,1.0,1.0
max,100000.0,34.0,1.0,199.998639,99.999907,44.355113,100.824857,99.0,139.0,249.0,1.0,1.0,1.0,1.0,1.0,1.0


In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 192356 entries, 0 to 199999
Data columns (total 16 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Student ID      192356 non-null  float64
 1   Age             192356 non-null  float64
 2   Gender          192356 non-null  int64  
 3   Height          192356 non-null  float64
 4   Weight          192356 non-null  float64
 5   BMI             192356 non-null  float64
 6   Temperature     192356 non-null  float64
 7   Heart Rate      192356 non-null  float64
 8   Blood Pressure  192356 non-null  float64
 9   Cholesterol     192356 non-null  float64
 10  Diabetes        192356 non-null  float64
 11  Smoking         192356 non-null  int64  
 12  Blood Type_A    192356 non-null  int64  
 13  Blood Type_AB   192356 non-null  int64  
 14  Blood Type_B    192356 non-null  int64  
 15  Blood Type_O    192356 non-null  int64  
dtypes: float64(10), int64(6)
memory usage: 24.9 MB
