In [1]:
#Importing The important libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
#Importing the Hypertension Dataset
df = pd.read_csv("c:/users/sakshi yadav/Downloads/hypertension_dataset.csv")

In [4]:
#Shape of the dataset
df.shape

(1985, 11)

In [5]:
#Print the dataset
df.head()

Unnamed: 0,Age,Salt_Intake,Stress_Score,BP_History,Sleep_Duration,BMI,Medication,Family_History,Exercise_Level,Smoking_Status,Has_Hypertension
0,69,8.0,9,Normal,6.4,25.8,,Yes,Low,Non-Smoker,Yes
1,32,11.7,10,Normal,5.4,23.4,,No,Low,Non-Smoker,No
2,78,9.5,3,Normal,7.1,18.7,,No,Moderate,Non-Smoker,No
3,38,10.0,10,Hypertension,4.2,22.1,ACE Inhibitor,No,Low,Non-Smoker,Yes
4,41,9.8,1,Prehypertension,5.8,16.2,Other,No,Moderate,Non-Smoker,No


In [6]:
#Checking the null values of the dataset
df.isnull().sum()

Age                   0
Salt_Intake           0
Stress_Score          0
BP_History            0
Sleep_Duration        0
BMI                   0
Medication          799
Family_History        0
Exercise_Level        0
Smoking_Status        0
Has_Hypertension      0
dtype: int64

In [7]:
#Checking the dtypes of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1985 entries, 0 to 1984
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Age               1985 non-null   int64  
 1   Salt_Intake       1985 non-null   float64
 2   Stress_Score      1985 non-null   int64  
 3   BP_History        1985 non-null   object 
 4   Sleep_Duration    1985 non-null   float64
 5   BMI               1985 non-null   float64
 6   Medication        1186 non-null   object 
 7   Family_History    1985 non-null   object 
 8   Exercise_Level    1985 non-null   object 
 9   Smoking_Status    1985 non-null   object 
 10  Has_Hypertension  1985 non-null   object 
dtypes: float64(3), int64(2), object(6)
memory usage: 170.7+ KB


In [8]:
#Columns of this dataset
df.columns

Index(['Age', 'Salt_Intake', 'Stress_Score', 'BP_History', 'Sleep_Duration',
       'BMI', 'Medication', 'Family_History', 'Exercise_Level',
       'Smoking_Status', 'Has_Hypertension'],
      dtype='object')

In [9]:
#Printing the unique value of this columns
df.Medication.unique()

array([nan, 'ACE Inhibitor', 'Other', 'Beta Blocker', 'Diuretic'],
      dtype=object)

In [10]:
#Filling the mode for this medication columns to fill the nan value
mode = df['Medication'].mode()[0]

In [11]:
df['Medication'].fillna(value=mode, inplace=True)

In [12]:
df.isnull().sum()

Age                 0
Salt_Intake         0
Stress_Score        0
BP_History          0
Sleep_Duration      0
BMI                 0
Medication          0
Family_History      0
Exercise_Level      0
Smoking_Status      0
Has_Hypertension    0
dtype: int64

In [13]:
#Checking the Duplicates
df.duplicated().sum()

np.int64(0)

In [14]:
df.shape

(1985, 11)

In [15]:
df.head()

Unnamed: 0,Age,Salt_Intake,Stress_Score,BP_History,Sleep_Duration,BMI,Medication,Family_History,Exercise_Level,Smoking_Status,Has_Hypertension
0,69,8.0,9,Normal,6.4,25.8,Beta Blocker,Yes,Low,Non-Smoker,Yes
1,32,11.7,10,Normal,5.4,23.4,Beta Blocker,No,Low,Non-Smoker,No
2,78,9.5,3,Normal,7.1,18.7,Beta Blocker,No,Moderate,Non-Smoker,No
3,38,10.0,10,Hypertension,4.2,22.1,ACE Inhibitor,No,Low,Non-Smoker,Yes
4,41,9.8,1,Prehypertension,5.8,16.2,Other,No,Moderate,Non-Smoker,No


In [16]:
df.columns

Index(['Age', 'Salt_Intake', 'Stress_Score', 'BP_History', 'Sleep_Duration',
       'BMI', 'Medication', 'Family_History', 'Exercise_Level',
       'Smoking_Status', 'Has_Hypertension'],
      dtype='object')

In [17]:
df.BP_History.unique()

array(['Normal', 'Hypertension', 'Prehypertension'], dtype=object)

In [18]:
df.Medication.unique()

array(['Beta Blocker', 'ACE Inhibitor', 'Other', 'Diuretic'], dtype=object)

In [19]:
df.Exercise_Level.unique()

array(['Low', 'Moderate', 'High'], dtype=object)

In [20]:
#Encoding the Columns of Catergorical data
bp = {'Normal':0, 'Hypertension':1, 'Prehypertension':2}
md = {'Beta Blocker':0, 'ACE Inhibitor':1, 'Other':2, 'Diuretic':3}
exl = {'Low':0, 'Moderate':1, 'High':2}
def bpEncode(val):
    return bp[val]
def mdEncode(val):
    return md[val]
def exlEncode(val):
    return exl[val]

In [21]:
df.BP_History = df.BP_History.map(bpEncode)

In [22]:
df.Medication = df.Medication.map(mdEncode)

In [23]:
df.Exercise_Level = df.Exercise_Level.map(exlEncode)

In [24]:
#Encode binary categorical columns (Yes/No or Smoker/Non-Smoker)
binCol = ['Family_History', 'Smoking_Status', 'Has_Hypertension']
for col in binCol:
    df[col] = df[col].map({'Yes':1, 'No':0, 'Smoker':1, 'Non-Smoker':0})

In [25]:
df.head()

Unnamed: 0,Age,Salt_Intake,Stress_Score,BP_History,Sleep_Duration,BMI,Medication,Family_History,Exercise_Level,Smoking_Status,Has_Hypertension
0,69,8.0,9,0,6.4,25.8,0,1,0,0,1
1,32,11.7,10,0,5.4,23.4,0,0,0,0,0
2,78,9.5,3,0,7.1,18.7,0,0,1,0,0
3,38,10.0,10,1,4.2,22.1,1,0,0,0,1
4,41,9.8,1,2,5.8,16.2,2,0,1,0,0


In [26]:
#Separate features and target
X = df.drop('Has_Hypertension', axis=1)
y = df['Has_Hypertension']

In [27]:
#Train_test split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [28]:
#Standardization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [29]:
#Classification Models
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'KNN': KNeighborsClassifier(),
    'DecisionTree': DecisionTreeClassifier(),
    'RandomForest': RandomForestClassifier(),
    'SVC': SVC(),
    'NaiveBayes': GaussianNB(),
    'GradientBoosting': GradientBoostingClassifier(),
    'XGBoost': XGBClassifier(verbosity=0),
    'LightGBM': LGBMClassifier(verbose=-1),
    'MultilayerPerception': MLPClassifier(max_iter=500)
}

In [30]:
# Train and evalute each model
print("Accuracy Scores With Standardization:\n")
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name}: Accuracy = {round(acc * 100, 2)}%")

Accuracy Scores With Standardization:

LogisticRegression: Accuracy = 72.04%
KNN: Accuracy = 82.12%
DecisionTree: Accuracy = 92.7%
RandomForest: Accuracy = 94.21%
SVC: Accuracy = 87.66%
NaiveBayes: Accuracy = 78.59%
GradientBoosting: Accuracy = 97.98%
XGBoost: Accuracy = 51.89%
LightGBM: Accuracy = 98.24%
MultilayerPerception: Accuracy = 90.43%


In [33]:
# Normalization
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [34]:
#Train and evaluate each model
print("Accuracy Scores with Normalization:\n")
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name}: Accuracy = {round(acc * 100, 2)}%")

Accuracy Scores with Normalization:

LogisticRegression: Accuracy = 71.79%
KNN: Accuracy = 86.15%
DecisionTree: Accuracy = 93.2%
RandomForest: Accuracy = 95.72%
SVC: Accuracy = 87.91%
NaiveBayes: Accuracy = 78.84%
GradientBoosting: Accuracy = 98.49%
XGBoost: Accuracy = 98.24%
LightGBM: Accuracy = 98.49%
MultilayerPerception: Accuracy = 88.66%
