In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Notebook-wide matplotlib text styling: serif font at size 15.
plt.rcParams.update({'font.family': 'Serif', 'font.size': 15})

# Empty table that collects evaluation metrics; one column per score type.
score_columns = ['Accuracy Score', "Precision Score", "Recall Score", "F1 Score"]
results = pd.DataFrame([], columns=score_columns)

In [None]:
# Heart-disease sources 1, 2, 3 and 6 are stacked first and cleaned together.
df_heart_1 = pd.read_csv("./datasets/heart_disease/heart_disease_dataset_1.csv")
df_heart_2 = pd.read_csv("./datasets/heart_disease/heart_disease_dataset_2.csv")
df_heart_3 = pd.read_csv("./datasets/heart_disease/heart_disease_dataset_3.csv")
df_heart_6 = pd.read_csv("./datasets/heart_disease/heart_disease_dataset_6.csv")

# Abbreviated column names -> the descriptive names used by the second group.
short_to_long_names = {
    'cp': 'chest pain',
    'trestbps': 'resting bps',
    'chol': 'cholesterol',
    'fbs': 'fasting blood sugar',
    'restecg': 'resting ecg',
    'thalach': 'max heart rate',
    'exang': 'exercise angina',
    'target': 'disease',
}
data_1 = (
    pd.concat([df_heart_1, df_heart_2, df_heart_3, df_heart_6], ignore_index=True)
    # Discard 'thal', 'ca' and the 'Unnamed: 0' column.
    .drop(columns=['thal', 'ca', 'Unnamed: 0'])
    .rename(columns=short_to_long_names)
)

# Sources 4 and 5 form the second group; source 5 spells the blood-pressure
# column 'resting bp s', so it is aligned before stacking.
df_heart_4 = pd.read_csv("./datasets/heart_disease/heart_disease_dataset_4.csv")
df_heart_5 = pd.read_csv("./datasets/heart_disease/heart_disease_dataset_5.csv")
df_heart_5 = df_heart_5.rename(columns={'resting bp s': 'resting bps'})

data_2 = (
    pd.concat([df_heart_4, df_heart_5], ignore_index=True)
    .drop(columns=['Unnamed: 0'])
    .rename(columns={'ST slope': 'slope', 'target': 'disease', 'chest pain type': 'chest pain'})
)

# Single heart-disease frame built from both groups.
df_heart_di = pd.concat([data_1, data_2], ignore_index=True)

# Replace the 0/1 target codes with readable class labels
# (any other value becomes NaN under Series.map).
mapping = {0: 'non-heart disease', 1: 'heart disease'}
df_heart_di['disease'] = df_heart_di['disease'].map(mapping)
df_heart_di


In [None]:
# Load the five diabetes source files.
df_diabetes_1 = pd.read_csv("./datasets/diabetes/diabetes_dataset_1.csv")
df_diabetes_2 = pd.read_csv("./datasets/diabetes/diabetes_dataset_2.csv")
df_diabetes_3 = pd.read_csv("./datasets/diabetes/diabetes_dataset_3.csv")
df_diabetes_4 = pd.read_csv("./datasets/diabetes/diabetes_dataset_4.csv")
df_diabetes_5 = pd.read_csv("./datasets/diabetes/diabetes_dataset_5.csv")

# Source 5 calls its label column 'Diagnosis'; rename it to 'Outcome' so it
# lines up with the column renamed to 'disease' below.
# rename(..., inplace=True) returns None, so assigning that result would replace
# df_diabetes_5 with None and pd.concat would silently skip the dataset.
# Assign the renamed copy instead.
df_diabetes_5 = df_diabetes_5.rename(columns={'Diagnosis': 'Outcome'})

# Stack all five sources and normalise the shared column names.
df_diabetes = pd.concat(
    [df_diabetes_1, df_diabetes_2, df_diabetes_3, df_diabetes_4, df_diabetes_5],
    ignore_index=True,
)
df_diabetes = df_diabetes.rename(columns={'Outcome': 'disease', 'Age': 'age'})

# Replace the 0/1 target codes with readable class labels
# (any other value becomes NaN under Series.map).
mapping = {0: 'non-diabetics', 1: 'diabetes'}
df_diabetes['disease'] = df_diabetes['disease'].map(mapping)
df_diabetes

In [None]:
# Source 3: keep only the listed columns and align their names with the
# other anemia sources. The rename is chained so a fresh frame is produced
# instead of mutating a column selection in place.
selected_columns = ['GENDER', 'HGB','MCV', 'MCH', 'MCHC', 'HGB_Anemia_Class']
df_anemia_3 = (
    pd.read_csv("./datasets/anemia/anemia_dataset_3.csv")[selected_columns]
    .rename(columns={'GENDER': 'sex', 'HGB': 'hemoglobin', 'HGB_Anemia_Class': 'disease'})
)
# The former mid-cell expression `df_anemia_3[df_anemia_3['disease'] == 1]`
# was removed: its result was neither assigned nor displayed.

# Sources 1 and 2 are stacked and renamed together.
df_anemia_1 = pd.read_csv("./datasets/anemia/anemia_dataset_1.csv")
df_anemia_2 = pd.read_csv("./datasets/anemia/anemia_dataset_2.csv")
data1_2 = (
    pd.concat([df_anemia_1, df_anemia_2], ignore_index=True)
    .rename(columns={'Gender': 'sex', 'Hemoglobin': 'hemoglobin', 'Result': 'disease'})
)

# Source 4 only needs its label column renamed.
df_anemia_4 = pd.read_csv("./datasets/anemia/anemia_dataset_4.csv")
df_anemia_4 = df_anemia_4.rename(columns={'Outcome': 'disease'})

# Single anemia frame built from all sources.
df_anemia_di = pd.concat([df_anemia_3, data1_2, df_anemia_4], ignore_index=True)

# Replace the 0/1 target codes with readable class labels
# (any other value becomes NaN under Series.map).
mapping = {0: 'non-hgb-anemia', 1: 'hgb_anemia'}
df_anemia_di['disease'] = df_anemia_di['disease'].map(mapping)
df_anemia_di

In [None]:
# df_kidney_7 = pd.read_csv("./datasets/chronic_kidney/kidney_disease_8.csv")
# df_kidney_7

In [None]:
# Stack the heart, diabetes and anemia frames into one table; columns that a
# frame does not have are filled with NaN by pd.concat.
condition_frames = [df_heart_di, df_diabetes, df_anemia_di]
data = pd.concat(condition_frames, ignore_index=True)
data

In [None]:
# Bar chart of the number of rows per 'disease' label, with each bar annotated
# by its count.
counts = data['disease'].value_counts()
palette = sns.color_palette()
fig, axes = plt.subplots(figsize=(10,6))
counts.plot.bar(ax=axes, color=palette)
axes.set_xlabel('diseases', labelpad=10)
for bars in axes.containers:
    axes.bar_label(bars)
plt.show()

In [None]:

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import mean_absolute_error, make_scorer, precision_score, f1_score, confusion_matrix, classification_report,accuracy_score, log_loss, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

In [None]:
def standardization(xtrain, xtest):
    """Standardize train and test features with statistics from the train set.

    A ``StandardScaler`` is fitted on ``xtrain`` only and then applied to both
    inputs, so the test data never influences the fitted mean or variance.

    Args:
        xtrain: Training feature matrix used to fit the scaler.
        xtest: Test feature matrix transformed with the fitted scaler.

    Returns:
        A tuple ``(scaled_train, scaled_test)`` of transformed arrays.
    """
    fitted_scaler = StandardScaler().fit(xtrain)
    return fitted_scaler.transform(xtrain), fitted_scaler.transform(xtest)

In [None]:
# label=LabelEncoder()
# data['Disease']=label.fit_transform(data['Disease'])
# Separate the class label from the predictors.
disease = data['disease']
features = data.drop(columns='disease')

# 80/20 shuffled split, stratified on the label so both parts keep the same
# class proportions; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    features,
    disease,
    test_size=0.2,
    shuffle=True,
    random_state=42,
    stratify=disease,
)

In [None]:
# Mean imputation: the per-column means are learned from the training split
# only and then reused for the test split.
imputer = SimpleImputer(strategy='mean').fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Random forest with 100 trees and a fixed seed, trained on the imputed data.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_imputed, Y_train)

# Predictions for the held-out split.
y_pred = rf_classifier.predict(X_test_imputed)

# Per-class report followed by the overall accuracy.
rf_report = classification_report(Y_test, y_pred)
rf_accuracy = accuracy_score(Y_test, y_pred)
print("Classification Report:\n", rf_report)
print('accuracy', rf_accuracy)

In [None]:
# Importances reported by the fitted random forest.
feature_importances = rf_classifier.feature_importances_

# Positions ordered from most to least important.
sorted_indices = np.flip(np.argsort(feature_importances))
# NOTE(review): names are taken from X_train.columns while the forest was
# fitted on the imputed array; this assumes the imputer kept every column in
# the same order — confirm.
sorted_features = X_train.columns[sorted_indices]
sorted_importances = feature_importances[sorted_indices]

# One bar per feature, labelled with the feature name.
importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
importance_ax.bar(
    range(len(sorted_importances)),
    sorted_importances,
    tick_label=sorted_features,
)
plt.xticks(rotation=90)
importance_ax.set_xlabel('Features')
importance_ax.set_ylabel('Feature Importance')
importance_ax.set_title('Feature Importance Plot')
plt.show()

In [None]:
# importan_features = ['Mean Corpuscular Hemoglobin','White Blood Cells','Hematocrit','Red Blood Cells','HDL Cholesterol',
                    #  'ALT','Troponin','Mean Corpuscular Hemoglobin Concentration','AST','Mean Corpuscular Volume','HbA1c','LDL Cholesterol','Cholesterol']

In [None]:
# results.loc['LogisticRegression Model'] = [test_accuracy, precision_test_score, recall_test_score, f1_test_score]
# results