In [1]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [2]:
# Load and preprocess the dataset
data = pd.read_csv('heart_disease_uci.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


### Data Preprocessing

In [4]:
# Drop columns with low relevance (e.g., 'id', 'dataset') and high null values (e.g., 'thal', 'ca', slope)
data_cleaned = data.drop(columns=[ 'dataset', 'thal', 'ca', 'slope'], errors='ignore')

In [5]:
data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   cp        920 non-null    object 
 4   trestbps  861 non-null    float64
 5   chol      890 non-null    float64
 6   fbs       830 non-null    object 
 7   restecg   918 non-null    object 
 8   thalch    865 non-null    float64
 9   exang     865 non-null    object 
 10  oldpeak   858 non-null    float64
 11  num       920 non-null    int64  
dtypes: float64(4), int64(3), object(5)
memory usage: 86.4+ KB


In [6]:
# separating the categorical and numerical columns
categorical_features = ['sex','cp','fbs', 'restecg', 'exang']
numeric_features = ['age','trestbps', 'chol', 'oldpeak', 'thalch']


#### One hot encoding

In [8]:
# Initialize an empty list to store the one-hot encoded columns
encoded_columns = []

# Loop through each categorical column, apply one-hot encoding, and handle NaN values
for col in categorical_features:
    # Perform one-hot encoding on the current column
    encoded_cat_df = pd.get_dummies(data_cleaned[col], prefix=col, dummy_na=False, drop_first=True)    
    
    # Manually set NaN in the one-hot encoded columns where the original value was NaN
    encoded_cat_df[data_cleaned[col].isna()] = np.nan
    # Append the encoded DataFrame to the list of encoded columns
    encoded_columns.append(encoded_cat_df)

# Concatenate all encoded columns back into a single DataFrame
data_cat_encoded = pd.concat(encoded_columns, axis=1)

# Display the final DataFrame
print(data_cat_encoded);

     sex_Male  cp_atypical angina  cp_non-anginal  cp_typical angina fbs_True  \
0        True               False           False               True     True   
1        True               False           False              False    False   
2        True               False           False              False    False   
3        True               False            True              False    False   
4       False                True           False              False    False   
..        ...                 ...             ...                ...      ...   
915     False               False           False              False     True   
916      True               False           False               True    False   
917      True               False           False              False     True   
918      True               False           False              False     True   
919      True                True           False              False    False   

    restecg_normal restecg_

  encoded_cat_df[data_cleaned[col].isna()] = np.nan
  encoded_cat_df[data_cleaned[col].isna()] = np.nan
  encoded_cat_df[data_cleaned[col].isna()] = np.nan
  encoded_cat_df[data_cleaned[col].isna()] = np.nan


In [9]:
data_encoded = pd.concat([data_cat_encoded, data_cleaned[numeric_features]], axis=1)

In [10]:
print(data_encoded.info())
data_encoded.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   sex_Male                  920 non-null    bool   
 1   cp_atypical angina        920 non-null    bool   
 2   cp_non-anginal            920 non-null    bool   
 3   cp_typical angina         920 non-null    bool   
 4   fbs_True                  830 non-null    object 
 5   restecg_normal            918 non-null    object 
 6   restecg_st-t abnormality  918 non-null    object 
 7   exang_True                865 non-null    object 
 8   age                       920 non-null    int64  
 9   trestbps                  861 non-null    float64
 10  chol                      890 non-null    float64
 11  oldpeak                   858 non-null    float64
 12  thalch                    865 non-null    float64
dtypes: bool(4), float64(4), int64(1), object(4)
memory usage: 68.4+ K

Unnamed: 0,sex_Male,cp_atypical angina,cp_non-anginal,cp_typical angina,fbs_True,restecg_normal,restecg_st-t abnormality,exang_True,age,trestbps,chol,oldpeak,thalch
0,True,False,False,True,True,False,False,False,63,145.0,233.0,2.3,150.0
1,True,False,False,False,False,False,False,True,67,160.0,286.0,1.5,108.0
2,True,False,False,False,False,False,False,True,67,120.0,229.0,2.6,129.0
3,True,False,True,False,False,True,False,False,37,130.0,250.0,3.5,187.0
4,False,True,False,False,False,False,False,False,41,130.0,204.0,1.4,172.0


#### Scaling

In [12]:
# Replace `0` with NaN for specific columns (chol and trestbps only)
columns_to_replace_zeros = ['chol', 'trestbps']
data_encoded[columns_to_replace_zeros] = data_encoded[columns_to_replace_zeros].replace(0, np.nan)

In [13]:
# Identify numerical columns and apply MinMaxScaler BEFORE imputation
numerical_cols = data_encoded.select_dtypes(include=['float64', 'int64']).columns
data_encoded_scaled=data_encoded.copy()
scaler = MinMaxScaler()
data_encoded_scaled[numerical_cols] = scaler.fit_transform(data_encoded_scaled[numerical_cols])

In [14]:
data_encoded_scaled.head()

Unnamed: 0,sex_Male,cp_atypical angina,cp_non-anginal,cp_typical angina,fbs_True,restecg_normal,restecg_st-t abnormality,exang_True,age,trestbps,chol,oldpeak,thalch
0,True,False,False,True,True,False,False,False,0.714286,0.541667,0.285714,0.556818,0.633803
1,True,False,False,False,False,False,False,True,0.795918,0.666667,0.388031,0.465909,0.338028
2,True,False,False,False,False,False,False,True,0.795918,0.333333,0.277992,0.590909,0.485915
3,True,False,True,False,False,True,False,False,0.183673,0.416667,0.318533,0.693182,0.894366
4,False,True,False,False,False,False,False,False,0.265306,0.416667,0.22973,0.454545,0.788732


#### KNN imputation

In [16]:
# Step 3: Handle missing values using KNN Imputer
imputer = KNNImputer(n_neighbors=5)
data_cleaned_imputed = pd.DataFrame(imputer.fit_transform(data_encoded_scaled), columns=data_encoded.columns)

# Step 4: Ensure categorical columns are converted back to integers (if necessary)
categorical_cols = [col for col in data_encoded.columns if col not in numerical_cols]
for col in categorical_cols:
    data_cleaned_imputed[col] = data_cleaned_imputed[col].round().astype(int)

data_cleaned_imputed.head()

Unnamed: 0,sex_Male,cp_atypical angina,cp_non-anginal,cp_typical angina,fbs_True,restecg_normal,restecg_st-t abnormality,exang_True,age,trestbps,chol,oldpeak,thalch
0,1,0,0,1,1,0,0,0,0.714286,0.541667,0.285714,0.556818,0.633803
1,1,0,0,0,0,0,0,1,0.795918,0.666667,0.388031,0.465909,0.338028
2,1,0,0,0,0,0,0,1,0.795918,0.333333,0.277992,0.590909,0.485915
3,1,0,1,0,0,1,0,0,0.183673,0.416667,0.318533,0.693182,0.894366
4,0,1,0,0,0,0,0,0,0.265306,0.416667,0.22973,0.454545,0.788732


In [17]:
data_knn_imputed=data_cleaned_imputed.copy()
data_knn_imputed.head()

Unnamed: 0,sex_Male,cp_atypical angina,cp_non-anginal,cp_typical angina,fbs_True,restecg_normal,restecg_st-t abnormality,exang_True,age,trestbps,chol,oldpeak,thalch
0,1,0,0,1,1,0,0,0,0.714286,0.541667,0.285714,0.556818,0.633803
1,1,0,0,0,0,0,0,1,0.795918,0.666667,0.388031,0.465909,0.338028
2,1,0,0,0,0,0,0,1,0.795918,0.333333,0.277992,0.590909,0.485915
3,1,0,1,0,0,1,0,0,0.183673,0.416667,0.318533,0.693182,0.894366
4,0,1,0,0,0,0,0,0,0.265306,0.416667,0.22973,0.454545,0.788732


In [18]:
# Reversing the scaling to get the original value of the numerical columns
data_knn_imputed[numerical_cols]=scaler.inverse_transform(data_knn_imputed[numerical_cols])
data_knn_imputed.head()

Unnamed: 0,sex_Male,cp_atypical angina,cp_non-anginal,cp_typical angina,fbs_True,restecg_normal,restecg_st-t abnormality,exang_True,age,trestbps,chol,oldpeak,thalch
0,1,0,0,1,1,0,0,0,63.0,145.0,233.0,2.3,150.0
1,1,0,0,0,0,0,0,1,67.0,160.0,286.0,1.5,108.0
2,1,0,0,0,0,0,0,1,67.0,120.0,229.0,2.6,129.0
3,1,0,1,0,0,1,0,0,37.0,130.0,250.0,3.5,187.0
4,0,1,0,0,0,0,0,0,41.0,130.0,204.0,1.4,172.0


In [19]:
# Validate that there are no missing values in the dataset
missing_values_summary = data_knn_imputed.isnull().sum()

# Display columns with missing values (if any)
missing_values = missing_values_summary[missing_values_summary > 0]

if missing_values.empty:
    print("No missing values in the dataset.")
else:
    print("Missing values found in the following columns:")
    print(missing_values)

No missing values in the dataset.


In [20]:
# adding back the num column to separate the severity of the cases
data_df=pd.concat([data_knn_imputed, data_cleaned['num']], axis=1)
data_df.head()

Unnamed: 0,sex_Male,cp_atypical angina,cp_non-anginal,cp_typical angina,fbs_True,restecg_normal,restecg_st-t abnormality,exang_True,age,trestbps,chol,oldpeak,thalch,num
0,1,0,0,1,1,0,0,0,63.0,145.0,233.0,2.3,150.0,0
1,1,0,0,0,0,0,0,1,67.0,160.0,286.0,1.5,108.0,2
2,1,0,0,0,0,0,0,1,67.0,120.0,229.0,2.6,129.0,1
3,1,0,1,0,0,1,0,0,37.0,130.0,250.0,3.5,187.0,0
4,0,1,0,0,0,0,0,0,41.0,130.0,204.0,1.4,172.0,0


In [21]:
#if not os.path.exists('cate_encoded.csv'):
#    data_df.to_csv('cate_encoded.csv')

In [22]:
data_df[numerical_cols].describe()

Unnamed: 0,age,trestbps,chol,oldpeak,thalch
count,920.0,920.0,920.0,920.0,920.0
mean,53.51087,132.625652,246.29,0.894978,136.977826
std,9.424685,18.119003,53.139957,1.062393,25.417548
min,28.0,80.0,85.0,-2.6,60.0
25%,47.0,120.0,214.0,0.0,120.0
50%,54.0,130.0,241.0,0.6,138.7
75%,60.0,140.45,271.25,1.5,156.0
max,77.0,200.0,603.0,6.2,202.0


#### Outlier removal

In [24]:
# Function to remove outliers outside the range IQR*3

def remove_outliers_iqr(df, columns):
    # Initialize a DataFrame to hold rows without outliers
    df_no_outliers = df.copy()

    for col in columns:
        if df[col].dtype in ['float64', 'int64']:  # Only process numeric columns
            # Calculate Q1 (25th percentile) and Q3 (75th percentile)
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1

            # Calculate the lower and upper bounds using IQR * 3
            lower_bound = Q1 - 3 * IQR
            upper_bound = Q3 + 3 * IQR

            # Remove rows where values in the column are outside the bounds
            df_no_outliers = df_no_outliers[~((df_no_outliers[col] < lower_bound) | (df_no_outliers[col] > upper_bound))]

    return df_no_outliers

In [25]:
classes=['trestbps', 'chol', 'oldpeak', 'thalch']
data_noOutlier=remove_outliers_iqr(data_df, classes).reset_index(drop=True)

In [26]:
data_noOutlier.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 911 entries, 0 to 910
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   sex_Male                  911 non-null    int32  
 1   cp_atypical angina        911 non-null    int32  
 2   cp_non-anginal            911 non-null    int32  
 3   cp_typical angina         911 non-null    int32  
 4   fbs_True                  911 non-null    int32  
 5   restecg_normal            911 non-null    int32  
 6   restecg_st-t abnormality  911 non-null    int32  
 7   exang_True                911 non-null    int32  
 8   age                       911 non-null    float64
 9   trestbps                  911 non-null    float64
 10  chol                      911 non-null    float64
 11  oldpeak                   911 non-null    float64
 12  thalch                    911 non-null    float64
 13  num                       911 non-null    int64  
dtypes: float64

In [27]:
data_noOutlier[numerical_cols].describe()

Unnamed: 0,age,trestbps,chol,oldpeak,thalch
count,911.0,911.0,911.0,911.0,911.0
mean,53.531284,132.653787,244.045884,0.891965,137.043469
std,9.41234,18.148376,46.877189,1.051096,25.418093
min,28.0,80.0,85.0,-2.6,60.0
25%,47.0,120.0,214.0,0.0,120.0
50%,54.0,130.0,240.0,0.6,139.0
75%,60.0,140.5,270.0,1.5,156.0
max,77.0,200.0,417.0,5.6,202.0


### Association rules

In [29]:
# Step 1: Define binning thresholds for numerical columns
# Example thresholds; adjust based on domain knowledge & analysis
age_bins = [0, 40, 60, 100]
age_labels = ['<40', '40-60', '>60']

chol_bins = [0, 200, 240, 500]  
chol_labels = ['Normal', 'Borderline', 'High']

trestbps_bins = [0, 80,  130, 140, 300]  
trestbps_labels = ['Low', 'Normal', 'Elevated', 'High']

thalach_bins = [0, 100, 150, 200]  
thalach_labels = ['Low', 'Normal', 'High']

oldpeak_bins = [-3, 0, 2, 6]  
oldpeak_labels = ['No Depression', 'Mild', 'Severe']

In [30]:
df=data_noOutlier.copy()

In [31]:
df['age'] = pd.cut(df['age'], bins=age_bins, labels=age_labels, right=False)
df['chol'] = pd.cut(df['chol'], bins=chol_bins, labels=chol_labels, right=False)
df['trestbps'] = pd.cut(df['trestbps'], bins=trestbps_bins, labels=trestbps_labels, right=False)
df['thalch'] = pd.cut(df['thalch'], bins=thalach_bins, labels=thalach_labels, right=False)
df['oldpeak'] = pd.cut(df['oldpeak'], bins=oldpeak_bins, labels=oldpeak_labels, right=False)

In [32]:
df.head()

Unnamed: 0,sex_Male,cp_atypical angina,cp_non-anginal,cp_typical angina,fbs_True,restecg_normal,restecg_st-t abnormality,exang_True,age,trestbps,chol,oldpeak,thalch,num
0,1,0,0,1,1,0,0,0,>60,High,Borderline,Severe,High,0
1,1,0,0,0,0,0,0,1,>60,High,High,Mild,Normal,2
2,1,0,0,0,0,0,0,1,>60,Normal,Borderline,Severe,Normal,1
3,1,0,1,0,0,1,0,0,<40,Elevated,High,Severe,High,0
4,0,1,0,0,0,0,0,0,40-60,Elevated,Borderline,Mild,High,0


In [33]:
# one hot encoding the numerical bunned columns
binned_cols = ['age', 'chol', 'trestbps', 'thalch', 'oldpeak']
encoded_binned = pd.get_dummies(df[binned_cols], drop_first=False)

In [34]:
df_encoded = pd.concat([df.drop(columns=binned_cols), encoded_binned], axis=1).astype('int')

In [35]:
df_encoded.head()

Unnamed: 0,sex_Male,cp_atypical angina,cp_non-anginal,cp_typical angina,fbs_True,restecg_normal,restecg_st-t abnormality,exang_True,num,age_<40,...,trestbps_Low,trestbps_Normal,trestbps_Elevated,trestbps_High,thalch_Low,thalch_Normal,thalch_High,oldpeak_No Depression,oldpeak_Mild,oldpeak_Severe
0,1,0,0,1,1,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,1
1,1,0,0,0,0,0,0,1,2,0,...,0,0,0,1,0,1,0,0,1,0
2,1,0,0,0,0,0,0,1,1,0,...,0,1,0,0,0,1,0,0,0,1
3,1,0,1,0,0,1,0,0,0,1,...,0,0,1,0,0,0,1,0,0,1
4,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,1,0


In [36]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 911 entries, 0 to 910
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   sex_Male                  911 non-null    int32
 1   cp_atypical angina        911 non-null    int32
 2   cp_non-anginal            911 non-null    int32
 3   cp_typical angina         911 non-null    int32
 4   fbs_True                  911 non-null    int32
 5   restecg_normal            911 non-null    int32
 6   restecg_st-t abnormality  911 non-null    int32
 7   exang_True                911 non-null    int32
 8   num                       911 non-null    int32
 9   age_<40                   911 non-null    int32
 10  age_40-60                 911 non-null    int32
 11  age_>60                   911 non-null    int32
 12  chol_Normal               911 non-null    int32
 13  chol_Borderline           911 non-null    int32
 14  chol_High                 911 non-null    

In [37]:
# Re-introducing the dropped columns while one hot encoding the categorical columns previously
columns_to_add=['sex_Female','cp_asymptomatic','fbs_False','restecg_lv hypertrophy','exang_False']
categorical_features = ['sex','cp','fbs', 'restecg', 'exang']

In [38]:
df_cat_encoded=pd.DataFrame()
for col in columns_to_add:
    feature_name = col.split('_')[0]

    # Identify the one-hot encoded columns that are part of the same feature
    feature_columns = [col for col in df_encoded.columns if col.startswith(feature_name)]

    df_cat_encoded[col]=np.where(df_encoded[feature_columns].sum(axis=1) == 0, 1, 0)
    df_cat_encoded[feature_columns]=df_encoded[feature_columns]


In [39]:
df_cat_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 911 entries, 0 to 910
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   sex_Female                911 non-null    int32
 1   sex_Male                  911 non-null    int32
 2   cp_asymptomatic           911 non-null    int32
 3   cp_atypical angina        911 non-null    int32
 4   cp_non-anginal            911 non-null    int32
 5   cp_typical angina         911 non-null    int32
 6   fbs_False                 911 non-null    int32
 7   fbs_True                  911 non-null    int32
 8   restecg_lv hypertrophy    911 non-null    int32
 9   restecg_normal            911 non-null    int32
 10  restecg_st-t abnormality  911 non-null    int32
 11  exang_False               911 non-null    int32
 12  exang_True                911 non-null    int32
dtypes: int32(13)
memory usage: 46.4 KB


In [40]:
cols_in_df=df_cat_encoded.columns
num_cols_df=[col for col in df_encoded.columns if col not in cols_in_df]

In [41]:
cols_in_df

Index(['sex_Female', 'sex_Male', 'cp_asymptomatic', 'cp_atypical angina',
       'cp_non-anginal', 'cp_typical angina', 'fbs_False', 'fbs_True',
       'restecg_lv hypertrophy', 'restecg_normal', 'restecg_st-t abnormality',
       'exang_False', 'exang_True'],
      dtype='object')

In [42]:
num_cols_df

['num',
 'age_<40',
 'age_40-60',
 'age_>60',
 'chol_Normal',
 'chol_Borderline',
 'chol_High',
 'trestbps_Low',
 'trestbps_Normal',
 'trestbps_Elevated',
 'trestbps_High',
 'thalch_Low',
 'thalch_Normal',
 'thalch_High',
 'oldpeak_No Depression',
 'oldpeak_Mild',
 'oldpeak_Severe']

In [43]:
df_full_encoded=pd.DataFrame()
df_full_encoded[cols_in_df]=df_cat_encoded[cols_in_df]
df_full_encoded[num_cols_df]=df_encoded[num_cols_df]
df_full_encoded=df_full_encoded.drop(columns='num')
df_full_encoded['num']=df_encoded['num']
df_full_encoded.head()

Unnamed: 0,sex_Female,sex_Male,cp_asymptomatic,cp_atypical angina,cp_non-anginal,cp_typical angina,fbs_False,fbs_True,restecg_lv hypertrophy,restecg_normal,...,trestbps_Normal,trestbps_Elevated,trestbps_High,thalch_Low,thalch_Normal,thalch_High,oldpeak_No Depression,oldpeak_Mild,oldpeak_Severe,num
0,0,1,0,0,0,1,0,1,1,0,...,0,0,1,0,0,1,0,0,1,0
1,0,1,1,0,0,0,1,0,1,0,...,0,0,1,0,1,0,0,1,0,2
2,0,1,1,0,0,0,1,0,1,0,...,1,0,0,0,1,0,0,0,1,1
3,0,1,0,0,1,0,1,0,0,1,...,0,1,0,0,0,1,0,0,1,0
4,1,0,0,1,0,0,1,0,1,0,...,0,1,0,0,0,1,0,1,0,0


In [44]:
if not os.path.exists('df_full_encoded.csv'):
    df_full_encoded.to_csv('df_full_encoded.csv')

#### Applying Association Rules

In [46]:
df_asso_rules=df_full_encoded[df_full_encoded['num']>0].drop(columns='num').reset_index(drop=True)
df_asso_rules = df_asso_rules.drop(columns=['index'], errors='ignore')
df_asso_rules.head()


Unnamed: 0,sex_Female,sex_Male,cp_asymptomatic,cp_atypical angina,cp_non-anginal,cp_typical angina,fbs_False,fbs_True,restecg_lv hypertrophy,restecg_normal,...,trestbps_Low,trestbps_Normal,trestbps_Elevated,trestbps_High,thalch_Low,thalch_Normal,thalch_High,oldpeak_No Depression,oldpeak_Mild,oldpeak_Severe
0,0,1,1,0,0,0,1,0,1,0,...,0,0,0,1,0,1,0,0,1,0
1,0,1,1,0,0,0,1,0,1,0,...,0,1,0,0,0,1,0,0,0,1
2,1,0,1,0,0,0,1,0,1,0,...,0,0,0,1,0,0,1,0,0,1
3,0,1,1,0,0,0,1,0,1,0,...,0,0,1,0,0,1,0,0,1,0
4,0,1,1,0,0,0,0,1,1,0,...,0,0,0,1,0,0,1,0,0,1


#### Frequent Itemsets

In [48]:
# Step 3: Generate frequent itemsets using the Apriori algorithm
# Setting a minimum support threshold 
min_support = 0.3
frequent_itemsets = apriori(df_asso_rules, min_support=min_support, use_colnames=True)




In [49]:
print(frequent_itemsets)

      support                                           itemsets
0    0.902584                                         (sex_Male)
1    0.769384                                  (cp_asymptomatic)
2    0.791252                                        (fbs_False)
3    0.554672                                   (restecg_normal)
4    0.409543                                      (exang_False)
..        ...                                                ...
141  0.316103  (oldpeak_Mild, fbs_False, exang_True, cp_asymp...
142  0.318091  (oldpeak_Mild, fbs_False, age_40-60, cp_asympt...
143  0.332008  (oldpeak_Mild, fbs_False, cp_asymptomatic, tha...
144  0.306163  (sex_Male, thalch_Normal, fbs_False, cp_asympt...
145  0.300199  (sex_Male, thalch_Normal, oldpeak_Mild, fbs_Fa...

[146 rows x 2 columns]


In [50]:
frequent_items_indexed = frequent_itemsets.set_index('itemsets')
frequent_items_indexed

Unnamed: 0_level_0,support
itemsets,Unnamed: 1_level_1
(sex_Male),0.902584
(cp_asymptomatic),0.769384
(fbs_False),0.791252
(restecg_normal),0.554672
(exang_False),0.409543
...,...
"(oldpeak_Mild, fbs_False, exang_True, cp_asymptomatic)",0.316103
"(oldpeak_Mild, fbs_False, age_40-60, cp_asymptomatic)",0.318091
"(oldpeak_Mild, fbs_False, cp_asymptomatic, thalch_Normal)",0.332008
"(sex_Male, thalch_Normal, fbs_False, cp_asymptomatic, exang_True)",0.306163


In [51]:
#frequent_items_indexed.plot(kind='bar')

In [52]:
print(frequent_itemsets['itemsets'].apply(type).unique())

[<class 'frozenset'>]


#### Association Rules

In [54]:
# Step 4: Generate association rules
# Define minimum confidence threshold (e.g., 0.6 for 60% confidence)
min_confidence = 0.1
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence,num_itemsets = 5)

# Sort rules by lift
rules = rules.sort_values(by='lift', ascending=False)

In [55]:
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
822,"(sex_Male, exang_True)","(fbs_False, cp_asymptomatic, thalch_Normal)",0.536779,0.435388,0.306163,0.570370,1.310029,1.0,0.072456,1.314184,0.510897,0.459701,0.239071,0.636783
815,"(fbs_False, cp_asymptomatic, thalch_Normal)","(sex_Male, exang_True)",0.435388,0.536779,0.306163,0.703196,1.310029,1.0,0.072456,1.560697,0.419151,0.459701,0.359261,0.636783
750,"(thalch_Normal, fbs_False, cp_asymptomatic)",(exang_True),0.435388,0.590457,0.332008,0.762557,1.291469,1.0,0.074930,1.724805,0.399722,0.478510,0.420224,0.662423
759,(exang_True),"(thalch_Normal, fbs_False, cp_asymptomatic)",0.590457,0.435388,0.332008,0.562290,1.291469,1.0,0.074930,1.289922,0.551073,0.478510,0.224759,0.662423
833,(exang_True),"(sex_Male, fbs_False, cp_asymptomatic, thalch_...",0.590457,0.401590,0.306163,0.518519,1.291162,1.0,0.069041,1.242851,0.550624,0.446377,0.195398,0.640447
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
662,(oldpeak_Mild),"(sex_Male, fbs_False, exang_True)",0.795229,0.449304,0.333996,0.420000,0.934779,1.0,-0.023304,0.949476,-0.254138,0.366812,-0.053213,0.581681
52,(fbs_False),(trestbps_High),0.791252,0.413519,0.304175,0.384422,0.929636,1.0,-0.023023,0.952733,-0.266103,0.337748,-0.049612,0.560000
53,(trestbps_High),(fbs_False),0.413519,0.791252,0.304175,0.735577,0.929636,1.0,-0.023023,0.789445,-0.114305,0.337748,-0.266712,0.560000
246,(chol_High),"(sex_Male, thalch_Normal)",0.558648,0.626243,0.322068,0.576512,0.920590,1.0,-0.027782,0.882570,-0.163492,0.373272,-0.133054,0.545399


In [56]:
print("Association Rules:")
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])


Association Rules:
                                     antecedents  \
822                       (sex_Male, exang_True)   
815  (fbs_False, cp_asymptomatic, thalch_Normal)   
750  (thalch_Normal, fbs_False, cp_asymptomatic)   
759                                 (exang_True)   
833                                 (exang_True)   
..                                           ...   
662                               (oldpeak_Mild)   
52                                   (fbs_False)   
53                               (trestbps_High)   
246                                  (chol_High)   
243                    (sex_Male, thalch_Normal)   

                                           consequents   support  confidence  \
822        (fbs_False, cp_asymptomatic, thalch_Normal)  0.306163    0.570370   
815                             (sex_Male, exang_True)  0.306163    0.703196   
750                                       (exang_True)  0.332008    0.762557   
759        (thalch_Normal, fbs_False

In [57]:
rules_lift = association_rules(frequent_itemsets, metric="lift", min_threshold=1, num_itemsets = 5)

rules_lift

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(sex_Male),(exang_True),0.902584,0.590457,0.536779,0.594714,1.007209,1.0,0.003842,1.010502,0.073469,0.561331,0.010393,0.751902
1,(exang_True),(sex_Male),0.590457,0.902584,0.536779,0.909091,1.007209,1.0,0.003842,1.071571,0.017476,0.561331,0.066790,0.751902
2,(sex_Male),(age_40-60),0.902584,0.632207,0.576541,0.638767,1.010376,1.0,0.005921,1.018159,0.105419,0.601660,0.017835,0.775358
3,(age_40-60),(sex_Male),0.632207,0.902584,0.576541,0.911950,1.010376,1.0,0.005921,1.106362,0.027922,0.601660,0.096137,0.775358
4,(chol_Borderline),(sex_Male),0.339960,0.902584,0.316103,0.929825,1.030180,1.0,0.009261,1.388171,0.044385,0.341202,0.279628,0.640022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
651,"(fbs_False, thalch_Normal)","(sex_Male, oldpeak_Mild, cp_asymptomatic)",0.540755,0.548708,0.300199,0.555147,1.011735,1.0,0.003482,1.014475,0.025257,0.380353,0.014269,0.551124
652,"(fbs_False, cp_asymptomatic)","(sex_Male, oldpeak_Mild, thalch_Normal)",0.610338,0.483101,0.300199,0.491857,1.018123,1.0,0.005344,1.017230,0.045682,0.378446,0.016938,0.556628
653,(sex_Male),"(oldpeak_Mild, fbs_False, cp_asymptomatic, tha...",0.902584,0.332008,0.300199,0.332599,1.001781,1.0,0.000534,1.000886,0.018246,0.321277,0.000885,0.618395
654,(thalch_Normal),"(sex_Male, fbs_False, oldpeak_Mild, cp_asympto...",0.679920,0.427435,0.300199,0.441520,1.032953,1.0,0.009577,1.025220,0.099667,0.371921,0.024600,0.571923


In [127]:
# Function to run Apriori and association rules and save the results
# str is an optional parameter: if given a ny value, it will save the file with that name else it will only print the results

def assoc_rules(df,support,confidence,lift=1,str="none"):
    
    frequent_itemsets = apriori(df, min_support=support, use_colnames=True)
    frequent_items_indexed = frequent_itemsets.set_index('itemsets')
    if str!="none":
        if not os.path.exists(f"freq_items_{str}.csv"):
            frequent_items_indexed.to_csv(f"freq_items_{str}.csv")
    print("frequent_items_indexed\n") 
    print(frequent_items_indexed)
    min_confidence = 0.1
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=confidence,num_itemsets = 5)
    rules = rules.sort_values(by='confidence', ascending=False)
    if str!="none":
        if not os.path.exists(f"rules_{str}.csv"):
            rules.to_csv(f"rules_{str}.csv")
    print("\nAssociation rules:\n")
    print(rules)    
    #rules_lift = association_rules(frequent_itemsets, metric="lift", min_threshold=1, num_itemsets = 5)
    #print(pd.DataFrame(rules_lift))
    

In [129]:
# Applying the apriori and association rules on all records with any severity of heart disease i.e. 1-4
df_1_4=df_full_encoded[df_full_encoded['num']>0].drop(columns='num').reset_index(drop=True)
assoc_rules(df_1_4,0.3,0.1,str="num_1_4")

frequent_items_indexed

                                                     support
itemsets                                                    
(sex_Male)                                          0.902584
(cp_asymptomatic)                                   0.769384
(fbs_False)                                         0.791252
(restecg_normal)                                    0.554672
(exang_False)                                       0.409543
...                                                      ...
(oldpeak_Mild, fbs_False, exang_True, cp_asympt...  0.316103
(oldpeak_Mild, fbs_False, age_40-60, cp_asympto...  0.318091
(oldpeak_Mild, fbs_False, cp_asymptomatic, thal...  0.332008
(sex_Male, thalch_Normal, fbs_False, cp_asympto...  0.306163
(sex_Male, thalch_Normal, oldpeak_Mild, fbs_Fal...  0.300199

[146 rows x 1 columns]

Association rules:

                                  antecedents  \
17                          (trestbps_Normal)   
160              (trestbps_Normal, fbs_F



In [131]:
# Applying the apriori and association rules on all records with num=4 severity of heart disease
df_4=df_full_encoded[df_full_encoded['num']==4].drop(columns='num').reset_index(drop=True)
assoc_rules(df_4,0.3,0.1,str="num_4")


frequent_items_indexed

                                                     support
itemsets                                                    
(sex_Male)                                          0.928571
(cp_asymptomatic)                                   0.821429
(fbs_False)                                         0.785714
(restecg_lv hypertrophy)                            0.464286
(exang_False)                                       0.428571
...                                                      ...
(sex_Male, fbs_False, exang_True, thalch_Normal)    0.357143
(sex_Male, fbs_False, oldpeak_Severe, exang_True)   0.321429
(sex_Male, fbs_False, oldpeak_Severe, thalch_No...  0.357143
(fbs_False, exang_True, cp_asymptomatic, thalch...  0.321429
(sex_Male, thalch_Normal, fbs_False, cp_asympto...  0.321429

[117 rows x 1 columns]

Association rules:

                                          antecedents  \
280                             (age_>60, exang_True)   
337          (fbs_False,



In [133]:
# Applying the apriori and association rules on all records with num=3 severity of heart disease
df_3=df_full_encoded[df_full_encoded['num']==3].drop(columns='num').reset_index(drop=True)
assoc_rules(df_3,0.3,0.1,str="num_3")


frequent_items_indexed

                                                     support
itemsets                                                    
(sex_Male)                                          0.933962
(cp_asymptomatic)                                   0.773585
(fbs_False)                                         0.698113
(fbs_True)                                          0.301887
(restecg_normal)                                    0.471698
...                                                      ...
(oldpeak_Mild, fbs_False, exang_True, cp_asympt...  0.301887
(oldpeak_Mild, fbs_False, cp_asymptomatic, thal...  0.301887
(oldpeak_Mild, exang_True, cp_asymptomatic, tha...  0.301887
(oldpeak_Mild, fbs_False, exang_True, thalch_No...  0.311321
(sex_Male, thalch_Normal, fbs_False, cp_asympto...  0.330189

[128 rows x 1 columns]

Association rules:

                                    antecedents  \
460  (oldpeak_Mild, chol_High, cp_asymptomatic)   
150                 (restecg_normal,



In [135]:
# Applying the apriori and association rules on all records with num=2 severity of heart disease
df_2=df_full_encoded[df_full_encoded['num']==2].drop(columns='num').reset_index(drop=True)
assoc_rules(df_3,0.3,0.1,str="num_2")

frequent_items_indexed

                                                     support
itemsets                                                    
(sex_Male)                                          0.933962
(cp_asymptomatic)                                   0.773585
(fbs_False)                                         0.698113
(fbs_True)                                          0.301887
(restecg_normal)                                    0.471698
...                                                      ...
(oldpeak_Mild, fbs_False, exang_True, cp_asympt...  0.301887
(oldpeak_Mild, fbs_False, cp_asymptomatic, thal...  0.301887
(oldpeak_Mild, exang_True, cp_asymptomatic, tha...  0.301887
(oldpeak_Mild, fbs_False, exang_True, thalch_No...  0.311321
(sex_Male, thalch_Normal, fbs_False, cp_asympto...  0.330189

[128 rows x 1 columns]

Association rules:

                                    antecedents  \
460  (oldpeak_Mild, chol_High, cp_asymptomatic)   
150                 (restecg_normal,



In [137]:
# Applying the apriori and association rules on all records with num=1 severity of heart disease
df_1=df_full_encoded[df_full_encoded['num']==1].drop(columns='num').reset_index(drop=True)
assoc_rules(df_3,0.3,0.1,str="num_1")

frequent_items_indexed

                                                     support
itemsets                                                    
(sex_Male)                                          0.933962
(cp_asymptomatic)                                   0.773585
(fbs_False)                                         0.698113
(fbs_True)                                          0.301887
(restecg_normal)                                    0.471698
...                                                      ...
(oldpeak_Mild, fbs_False, exang_True, cp_asympt...  0.301887
(oldpeak_Mild, fbs_False, cp_asymptomatic, thal...  0.301887
(oldpeak_Mild, exang_True, cp_asymptomatic, tha...  0.301887
(oldpeak_Mild, fbs_False, exang_True, thalch_No...  0.311321
(sex_Male, thalch_Normal, fbs_False, cp_asympto...  0.330189

[128 rows x 1 columns]

Association rules:

                                    antecedents  \
460  (oldpeak_Mild, chol_High, cp_asymptomatic)   
150                 (restecg_normal,

