In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
data = pd.read_csv("/content/drive/MyDrive/heart_disease_uci.csv")

In [3]:
data

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,916,54,Female,VA Long Beach,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,917,62,Male,VA Long Beach,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,918,55,Male,VA Long Beach,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,919,58,Male,VA Long Beach,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


In [5]:
data.isnull().sum()

Unnamed: 0,0
id,0
age,0
sex,0
dataset,0
cp,0
trestbps,59
chol,30
fbs,90
restecg,2
thalch,55


In [6]:
data_new = data.drop(["thal","ca","slope"], axis=1)

In [7]:
data_new['fbs'].count()

np.int64(830)

In [8]:
numerical_cols_with_missing = ['trestbps', 'chol', 'thalch', 'oldpeak']
categorical_cols_with_missing = ['fbs', 'restecg', 'exang']

# Impute numerical columns with median
for col in numerical_cols_with_missing:
    if col in data_new.columns:
        median_val = data_new[col].median()
        data_new[col] = data_new[col].fillna(median_val)

# Impute categorical columns with mode
for col in categorical_cols_with_missing:
    if col in data_new.columns:
        mode_val = data_new[col].mode()[0] # .mode() returns a Series, take the first value
        data_new[col] = data_new[col].fillna(mode_val)

  data_new[col] = data_new[col].fillna(mode_val)


In [9]:
data_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  920 non-null    float64
 6   chol      920 non-null    float64
 7   fbs       920 non-null    bool   
 8   restecg   920 non-null    object 
 9   thalch    920 non-null    float64
 10  exang     920 non-null    bool   
 11  oldpeak   920 non-null    float64
 12  num       920 non-null    int64  
dtypes: bool(2), float64(4), int64(3), object(4)
memory usage: 81.0+ KB


In [10]:
# 0 -> Risk Yok (0), 1,2,3,4 -> Risk Var (1)
data_new['target'] = data_new['num'].apply(lambda x: 1 if x > 0 else 0)

# Artık 'num', 'id' ve 'dataset' sütunlarına ihtiyacımız yok, onları atalım
data_final = data_new.drop(['num', 'id', 'dataset'], axis=1)

In [11]:
# Kategorik sütunları belirle
categorical_cols = ['sex', 'cp', 'restecg']

# One-Hot Encoding uygula
data_final = pd.get_dummies(data_final, columns=categorical_cols, drop_first=True)

# Bool değerleri (True/False) 1 ve 0'a çevir
data_final = data_final.astype(int)

In [12]:
X = data_final.drop('target', axis=1)
y = data_final['target']

# %80 Eğitim, %20 Test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [14]:
# 5. Sonuçları Değerlendir
y_pred = rf_model.predict(X_test)
print(f"Random Forest Doğruluğu: %{accuracy_score(y_test, y_pred)*100:.2f}")
print("\nSınıflandırma Raporu:\n", classification_report(y_test, y_pred))

Random Forest Doğruluğu: %83.70

Sınıflandırma Raporu:
               precision    recall  f1-score   support

           0       0.78      0.84      0.81        75
           1       0.88      0.83      0.86       109

    accuracy                           0.84       184
   macro avg       0.83      0.84      0.83       184
weighted avg       0.84      0.84      0.84       184



In [15]:
import joblib

# Modeli kaydet
joblib.dump(rf_model, 'heart_rf_model.pkl')

# Tahmin yaparken sütun sırasının bozulmaması için özellik listesini kaydet
joblib.dump(X.columns.tolist(), 'features.pkl')

print("Model ve özellik listesi başarıyla kaydedildi!")

Model ve özellik listesi başarıyla kaydedildi!
