In [1]:
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install seaborn
!pip install scikit-learn
!pip install scipy
!pip install statsmodels



In [None]:
import os
from pathlib import Path

import pandas as pd

# Directory that holds the dataset. The default is the original author's
# location; set the HYPERTENSION_DATA_DIR environment variable to run the
# notebook on another machine without editing the code.
DATA_DIR = Path(os.environ.get('HYPERTENSION_DATA_DIR', '/Users/shahadaleissa/hyper_code/Dataset'))

# Load the raw hypertension dataset.
df = pd.read_csv(DATA_DIR / 'hypertension_data.csv')

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
raw_summary = df.describe()
raw_summary

Unnamed: 0,Patient_Number,Blood_Pressure_Abnormality,Level_of_Hemoglobin,Genetic_Pedigree_Coefficient,Age,BMI,Sex,Pregnancy,Smoking,Physical_activity,salt_content_in_the_diet,alcohol_consumption_per_day,Level_of_Stress,Chronic_kidney_disease,Adrenal_and_thyroid_disorders
count,2000.0,2000.0,2000.0,1908.0,2000.0,2000.0,2000.0,442.0,2000.0,2000.0,2000.0,1758.0,2000.0,2000.0,2000.0
mean,1000.5,0.4935,11.710035,0.494817,46.5585,30.0815,0.496,0.450226,0.5095,25254.4245,24926.097,251.008532,2.0125,0.505,0.4435
std,577.494589,0.500083,2.186701,0.291736,17.107832,11.761208,0.500109,0.49808,0.500035,14015.439623,14211.692586,143.651884,0.823822,0.5001,0.496922
min,1.0,0.0,8.1,0.0,18.0,10.0,0.0,0.0,0.0,628.0,22.0,0.0,1.0,0.0,0.0
25%,500.75,0.0,10.1475,0.24,32.0,20.0,0.0,0.0,0.0,13605.75,13151.75,126.25,1.0,0.0,0.0
50%,1000.5,0.0,11.33,0.49,46.0,30.0,0.0,0.0,1.0,25353.0,25046.5,250.0,2.0,1.0,0.0
75%,1500.25,1.0,12.945,0.74,62.0,40.0,1.0,1.0,1.0,37382.25,36839.75,377.75,3.0,1.0,1.0
max,2000.0,1.0,17.56,1.0,75.0,50.0,1.0,1.0,1.0,49980.0,49976.0,499.0,3.0,1.0,1.0


In [None]:
# Number of non-null observations in each column; a column whose count is
# lower than the number of rows contains missing values.
non_null_per_column = df.count()
non_null_per_column

Patient_Number                   2000
Blood_Pressure_Abnormality       2000
Level_of_Hemoglobin              2000
Genetic_Pedigree_Coefficient     1908
Age                              2000
BMI                              2000
Sex                              2000
Pregnancy                         442
Smoking                          2000
Physical_activity                2000
salt_content_in_the_diet         2000
alcohol_consumption_per_day      1758
Level_of_Stress                  2000
Chronic_kidney_disease           2000
Adrenal_and_thyroid_disorders    2000
dtype: int64

In [None]:
# Number of missing values in each column.
missing_per_column = df.isna().sum()
missing_per_column

Patient_Number                      0
Blood_Pressure_Abnormality          0
Level_of_Hemoglobin                 0
Genetic_Pedigree_Coefficient       92
Age                                 0
BMI                                 0
Sex                                 0
Pregnancy                        1558
Smoking                             0
Physical_activity                   0
salt_content_in_the_diet            0
alcohol_consumption_per_day       242
Level_of_Stress                     0
Chronic_kidney_disease              0
Adrenal_and_thyroid_disorders       0
dtype: int64

In [None]:
# Check the dtypes of the two partially-missing columns that are imputed further below.
# Both columns are selected in a single expression because a notebook cell only
# displays the value of its last expression; two separate lines would hide the first.
df[['Genetic_Pedigree_Coefficient', 'alcohol_consumption_per_day']].dtypes


dtype('float64')

In [None]:
# Drop the Pregnancy column since most of its values are missing.
# Re-assigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it after the column is gone no longer raises KeyError.
df = df.drop(columns='Pregnancy', errors='ignore')

<h2>Imputing null values with KNN</h2>

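`n_neighbors` (k) is the number of neighbors the imputer looks at when filling a missing value. A low k increases the influence of noise and makes the results less generalizable. A high k, on the other hand, tends to blur the local effects, which are exactly what we are looking for. It is also recommended to take an odd k for binary classes to avoid ties; note that this advice applies to KNN classification, whereas the imputer used here averages the neighbors' values, so ties are not an issue for it.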

In [None]:
from sklearn.impute import KNNImputer

# The two numeric columns whose missing values are filled in.
selected_columns = [
    'Genetic_Pedigree_Coefficient',
    'alcohol_consumption_per_day',
]

# Each missing value is estimated from the 3 nearest rows.
# NOTE(review): only the two selected columns are passed to the imputer, so
# neighbours are found from these two features alone, and the features are on
# very different scales (roughly 0-1 vs 0-499) -- confirm this is intended.
imputer = KNNImputer(n_neighbors=3)
columns_with_gaps = df.loc[:, selected_columns]
imputed_data = imputer.fit_transform(columns_with_gaps)

In [None]:
# Wrap the imputed values in a frame that carries df's own index, so the
# index-based join below aligns row-for-row even when df's index is not 0..n-1
# (a length mismatch now fails loudly instead of producing misaligned rows).
imputed_df = pd.DataFrame(imputed_data, columns=selected_columns, index=df.index)

# Replace the original, partially-missing columns with their imputed versions.
# The imputed columns end up as the right-most columns of the frame.
df = df.drop(columns=selected_columns).join(imputed_df)

# Check that no missing values remain.
df.isnull().sum()


Patient_Number                   0
Blood_Pressure_Abnormality       0
Level_of_Hemoglobin              0
Age                              0
BMI                              0
Sex                              0
Smoking                          0
Physical_activity                0
salt_content_in_the_diet         0
Level_of_Stress                  0
Chronic_kidney_disease           0
Adrenal_and_thyroid_disorders    0
Genetic_Pedigree_Coefficient     0
alcohol_consumption_per_day      0
dtype: int64

In [None]:
# Drop Patient_Number (not useful for the analysis) and rename the target
# column Blood_Pressure_Abnormality to Class.
# Both steps return new frames and tolerate already-applied changes, so the
# cell can be re-run without raising KeyError.
df = (
    df
    .drop(columns='Patient_Number', errors='ignore')
    .rename(columns={'Blood_Pressure_Abnormality': 'Class'})
)
df.head()

Unnamed: 0,Class,Level_of_Hemoglobin,Age,BMI,Sex,Smoking,Physical_activity,salt_content_in_the_diet,Level_of_Stress,Chronic_kidney_disease,Adrenal_and_thyroid_disorders,Genetic_Pedigree_Coefficient,alcohol_consumption_per_day
0,1,11.28,34,23,1,0,45961,48071,2,1,1,0.9,336.333333
1,0,9.75,54,33,1,0,26106,25333,3,0,0,0.23,205.0
2,1,10.79,70,49,0,0,9995,29465,2,1,0,0.91,67.0
3,0,11.0,71,50,0,0,10635,7439,1,1,0,0.43,242.0
4,1,14.17,52,19,0,0,15619,49644,2,0,0,0.83,397.0


In [None]:
# Summary statistics of the dataset after cleaning and imputation.
cleaned_summary = df.describe()
cleaned_summary

Unnamed: 0,Class,Level_of_Hemoglobin,Age,BMI,Sex,Smoking,Physical_activity,salt_content_in_the_diet,Level_of_Stress,Chronic_kidney_disease,Adrenal_and_thyroid_disorders,Genetic_Pedigree_Coefficient,alcohol_consumption_per_day
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,0.4935,11.710035,46.5585,30.0815,0.496,0.5095,25254.4245,24926.097,2.0125,0.505,0.4435,0.49391,252.167214
std,0.500083,2.186701,17.107832,11.761208,0.500109,0.500035,14015.439623,14211.692586,0.823822,0.5001,0.496922,0.286772,137.943008
min,0.0,8.1,18.0,10.0,0.0,0.0,628.0,22.0,1.0,0.0,0.0,0.0,0.0
25%,0.0,10.1475,32.0,20.0,0.0,0.0,13605.75,13151.75,1.0,0.0,0.0,0.25,139.0
50%,0.0,11.33,46.0,30.0,0.0,1.0,25353.0,25046.5,2.0,1.0,0.0,0.49,253.0
75%,1.0,12.945,62.0,40.0,1.0,1.0,37382.25,36839.75,3.0,1.0,1.0,0.73,370.0
max,1.0,17.56,75.0,50.0,1.0,1.0,49980.0,49976.0,3.0,1.0,1.0,1.0,499.0


In [None]:
# Final look at the cleaned DataFrame before it is written to disk.
df

In [35]:
import os
from pathlib import Path

# Save the cleaned data without the row index.
# The target directory defaults to the original author's dataset folder and can
# be redirected with the HYPERTENSION_DATA_DIR environment variable.
output_dir = Path(os.environ.get('HYPERTENSION_DATA_DIR', '/Users/shahadaleissa/hyper_code/Dataset'))
df.to_csv(output_dir / 'cleaned_hypertension_data.csv', index=False)