In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification

In [3]:
original_data = pd.read_csv('heart.csv')

In [5]:
original_data

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [7]:
train_data, test_data = train_test_split(original_data, test_size=0.3, random_state=42)

In [11]:
# Generate synthetic data
n_samples = test_data.shape[0]
n_features = train_data.shape[1] - 1  # excluding the target column

X_synthetic, y_synthetic = make_classification(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=n_features,
    n_redundant=0,
    n_clusters_per_class=2,
    random_state=42
)

In [13]:
synthetic_data = pd.DataFrame(X_synthetic, columns=train_data.columns[:-1])
synthetic_data['HeartDisease'] = y_synthetic

In [15]:
# Define mappings for categorical features
sex_mapping = {0: 'F', 1: 'M'}
chest_pain_mapping = {0: 'ATA', 1: 'NAP', 2: 'ASY', 3: 'TA'}
resting_ecg_mapping = {0: 'Normal', 1: 'ST', 2: 'LVH'}
exercise_angina_mapping = {0: 'N', 1: 'Y'}
st_slope_mapping = {0: 'Up', 1: 'Flat', 2: 'Down'}

In [17]:
# Apply the mappings to transform synthetic data
synthetic_data['Sex'] = synthetic_data['Sex'].apply(lambda x: sex_mapping[int(x > 0)])
synthetic_data['ChestPainType'] = synthetic_data['ChestPainType'].apply(lambda x: chest_pain_mapping[int(x) % 4])
synthetic_data['RestingECG'] = synthetic_data['RestingECG'].apply(lambda x: resting_ecg_mapping[int(x) % 3])
synthetic_data['ExerciseAngina'] = synthetic_data['ExerciseAngina'].apply(lambda x: exercise_angina_mapping[int(x > 0)])
synthetic_data['ST_Slope'] = synthetic_data['ST_Slope'].apply(lambda x: st_slope_mapping[int(x) % 3])

In [19]:
# Scale numeric features using the original training set statistics
numeric_features = ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']
for feature in numeric_features:
    mean = train_data[feature].mean()
    std = train_data[feature].std()
    synthetic_data[feature] = synthetic_data[feature] * std + mean


In [21]:
synthetic_data

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,81.328849,F,ATA,165.457443,527.677903,1.142467,LVH,57.548658,Y,-2.646878,Flat,1
1,62.339648,F,ATA,159.921926,0.318063,0.353954,ST,160.764652,N,-0.637790,Up,0
2,15.766583,F,ATA,145.449909,-1.031126,0.551223,LVH,126.346544,N,-1.624709,Up,0
3,46.256773,M,TA,177.246406,73.026292,-0.250564,Normal,222.184397,Y,4.258561,Up,0
4,61.947640,M,ATA,117.390513,-33.498358,0.077550,Normal,144.464874,Y,2.404094,Down,0
...,...,...,...,...,...,...,...,...,...,...,...,...
271,73.963296,F,TA,158.261951,240.563147,-0.154719,Normal,154.624174,Y,-2.053866,Flat,1
272,29.549197,M,TA,89.899902,138.834220,0.120172,Normal,149.312250,Y,2.345520,Flat,0
273,24.405942,M,NAP,90.080150,-341.599188,-0.139181,Normal,216.180977,Y,3.525936,Up,1
274,73.101409,F,NAP,113.682821,355.863230,0.008896,Normal,66.674632,Y,-2.679576,Flat,1


In [49]:
synthetic_data['Sex'] = synthetic_data['Sex'].astype('category')
synthetic_data['Sex'] = synthetic_data['Sex'].cat.codes

synthetic_data['ChestPainType'] = synthetic_data['ChestPainType'].astype('category')
synthetic_data['ChestPainType'] = synthetic_data['ChestPainType'].cat.codes

synthetic_data['RestingECG'] = synthetic_data['RestingECG'].astype('category')
synthetic_data['RestingECG'] = synthetic_data['RestingECG'].cat.codes

synthetic_data['ExerciseAngina'] = synthetic_data['ExerciseAngina'].astype('category')
synthetic_data['ExerciseAngina'] = synthetic_data['ExerciseAngina'].cat.codes

synthetic_data['ST_Slope'] = synthetic_data['ST_Slope'].astype('category')
synthetic_data['ST_Slope'] = synthetic_data['ST_Slope'].cat.codes

synthetic_data

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,81.328849,0,1,165.457443,527.677903,1.142467,0,57.548658,1,-2.646878,1,1
1,62.339648,0,1,159.921926,0.318063,0.353954,2,160.764652,0,-0.637790,2,0
2,15.766583,0,1,145.449909,-1.031126,0.551223,0,126.346544,0,-1.624709,2,0
3,46.256773,1,3,177.246406,73.026292,-0.250564,1,222.184397,1,4.258561,2,0
4,61.947640,1,1,117.390513,-33.498358,0.077550,1,144.464874,1,2.404094,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
271,73.963296,0,3,158.261951,240.563147,-0.154719,1,154.624174,1,-2.053866,1,1
272,29.549197,1,3,89.899902,138.834220,0.120172,1,149.312250,1,2.345520,1,0
273,24.405942,1,2,90.080150,-341.599188,-0.139181,1,216.180977,1,3.525936,2,1
274,73.101409,0,2,113.682821,355.863230,0.008896,1,66.674632,1,-2.679576,1,1


In [51]:
X = synthetic_data.drop(columns = 'HeartDisease')
y = synthetic_data['HeartDisease']

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 20)

In [55]:
from sklearn.linear_model import LogisticRegression

In [57]:
model = LogisticRegression(max_iter = 5000).fit(X_train, y_train)

In [59]:
model.predict(X_train)

array([0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1])

In [63]:
model.score(X_train, y_train)

0.7305699481865285

In [67]:
synthetic_data.to_csv('synthetic_heart_data.csv', index=False)