In [1]:
import pandas as pd

# Load the raw heart-disease data set.
df = pd.read_csv('../data/heart.csv')

# Replace the abbreviated column names with descriptive ones in one pass.
column_names = {
    'cp': 'ChestPainType',
    'trestbps': 'RestingBP',
    'chol': 'Cholesterol',
    'fbs': 'FastingBS',
    'restecg': 'RestingECG',
    'mhra': 'MaxHR',
    'exang': 'ExerciseAngina',
    'slope': 'ST_Slope',
    'ca': 'coronaryArtery',
    'thal': 'Thalassemia',
}
df = df.rename(columns=column_names)

# Translate the integer codes of each categorical column into labels.
# Series.map turns any code that is missing from its dict into NaN.
# NOTE(review): the encoded output recorded later in this notebook shows a
# Thalassemia code of 3 although only three labels are mapped here, which
# suggests the raw column holds a code outside 1-3 that becomes NaN — confirm.
value_labels = {
    'sex': {1: 'Male', 0: 'Female'},
    # FastingBS is written as the strings '121' (code 1) and '120' (code 0),
    # presumably standing for "above" / "not above" 120 mg/dl — TODO confirm.
    'FastingBS': {1: '121', 0: '120'},
    'RestingECG': {0: 'Normal', 1: 'ST-T wave abnormality', 2: 'Left ventricular hypertrophy'},
    'ST_Slope': {0: 'Up-sloping', 1: 'Flat', 2: 'Down-sloping'},
    'Thalassemia': {1: 'Normal', 2: 'Fixed Defect', 3: 'Reversible Defect'},
    'ChestPainType': {0: 'Typical Angina', 1: 'Atypical Angina', 2: 'Non-Anginal Pain', 3: 'Asymptomatic'},
}
for column, labels in value_labels.items():
    df[column] = df[column].map(labels)

# Write the relabelled frame to a new CSV file (relative to the working directory).
df.to_csv('heart_data_converted.csv', index=False)

print("Conversion complete. Saved to 'heart_data_converted.csv'.")


Conversion complete. Saved to 'heart_data_converted.csv'.


In [2]:
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder

# Load the relabelled data.
df = pd.read_csv('../notebooks/heart_data_converted.csv')

# Columns read back as text (object dtype) are the ones that get integer codes.
df_cat = df.select_dtypes(include=['object'])

# Fit a separate LabelEncoder per categorical column and keep every fitted
# encoder, keyed by column name, so the same label -> code mapping can be reused.
label_encoders = {}
encoded_columns = {}
for col in df_cat.columns:
    le = LabelEncoder()
    encoded_columns[col] = le.fit_transform(df_cat[col])
    label_encoders[col] = le
df_encoded = pd.DataFrame(encoded_columns, index=df.index)

# Persist the fitted encoders.
with open('../encoders/label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)
print(label_encoders)

# Swap the text columns for their label-encoded versions. The encoded columns
# are appended after the remaining ones (this is label encoding, not one-hot).
df = df.drop(columns=df_cat.columns)
df = pd.concat([df, df_encoded], axis=1)

print(df.head(2))
df.to_csv('../data/encoded.csv',index=False)


{'sex': LabelEncoder(), 'ChestPainType': LabelEncoder(), 'RestingECG': LabelEncoder(), 'ST_Slope': LabelEncoder(), 'Thalassemia': LabelEncoder()}
   age  RestingBP  Cholesterol  FastingBS  MaxHR  ExerciseAngina  oldpeak  \
0   52        125          212        120    168               0      1.0   
1   53        140          203        121    155               1      3.1   

   coronaryArtery  target  sex  ChestPainType  RestingECG  ST_Slope  \
0               2       0    1              3           2         0   
1               0       0    1              3           1         2   

   Thalassemia  
0            2  
1            2  


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import joblib
import pickle

# Load the label-encoded data set.
df = pd.read_csv('../data/encoded.csv')

# Separate the predictors from the 'target' label column.
X = df.drop(columns=['target'])
y = df['target']

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(X_test.iloc[0:5])

# Train a random forest on the training portion.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Persist the trained model. Only the model is written by this cell.
joblib.dump(model, '../models/random_forest_model.pkl')

print("Model saved successfully.")


     age  RestingBP  Cholesterol  FastingBS  MaxHR  ExerciseAngina  oldpeak  \
527   62        124          209        120    163               0      0.0   
359   53        128          216        120    115               0      0.0   
447   55        160          289        120    145               1      0.8   
31    50        120          244        120    162               0      1.1   
621   48        130          256        121    150               1      0.0   

     coronaryArtery  sex  ChestPainType  RestingECG  ST_Slope  Thalassemia  
527               0    0              3           2         0            0  
359               0    0              2           1         0            3  
447               1    1              3           1         1            2  
31                0    0              1           2         0            0  
621               2    1              3           1         0            2  
Classification Report:
              precision    recall  f1-sc

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import joblib
import pickle

# Load the label-encoded data set.
df = pd.read_csv('../data/encoded.csv')

# Separate the predictors from the 'target' label column.
X = df.drop(columns=['target'])
y = df['target']

# 4-fold cross-validation with shuffling; the fixed seed makes the folds repeatable.
kf = KFold(n_splits=4, shuffle=True, random_state=42)

# Accuracy of each fold, in fold order.
accuracy_scores = []

# NOTE(review): the recorded fold scores are close to 1.00; check the data for
# duplicated rows that could land in both the train and test folds — TODO confirm.
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # A fresh model per fold, used only to estimate performance.
    fold_model = RandomForestClassifier(random_state=42)
    fold_model.fit(X_train, y_train)

    # Score the fold on the rows it did not see during training.
    y_pred = fold_model.predict(X_test)
    accuracy = (y_pred == y_test).mean()
    accuracy_scores.append(accuracy)

    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Mean accuracy over all folds.
average_accuracy = sum(accuracy_scores) / len(accuracy_scores)
print(f"Average Accuracy across {kf.n_splits} folds: {average_accuracy:.2f}")

# Cross-validation only estimates performance. The model that gets persisted is
# refitted on every row instead of being whichever fold happened to train last.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)
joblib.dump(model, '../models/random_forest_model.pkl')
print("Model saved successfully.")


Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       132
           1       1.00      0.98      0.99       125

    accuracy                           0.99       257
   macro avg       0.99      0.99      0.99       257
weighted avg       0.99      0.99      0.99       257

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       122
           1       1.00      1.00      1.00       134

    accuracy                           1.00       256
   macro avg       1.00      1.00      1.00       256
weighted avg       1.00      1.00      1.00       256

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       122
           1       1.00      1.00      1.00       134

    accuracy                           1.00       256
   macro avg       1.00      1.00      1.00       256
weigh

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import joblib
import jellyfish
import numpy as np

# Load the label-encoded data set from '../data/encoded.csv'
# (the path the encoding cell of this notebook writes with to_csv).
df = pd.read_csv('../data/encoded.csv')

def compute_similarity(column, reference='typical angina'):
    """Score every value of a Series by its similarity to a reference string.

    The score is ``1 - levenshtein_distance / max(len(value), len(reference))``,
    so identical strings score 1.0 and the score shrinks as more edits are needed.

    Parameters
    ----------
    column : pandas.Series
        Values to score. Non-string values are compared via ``str(value)``.
    reference : str, default 'typical angina'
        String every value is compared against.

    Returns
    -------
    pandas.Series
        One score per input value, on the same index. Missing values
        (None / NaN) yield NaN instead of raising.
    """
    def _score(value):
        # Missing cells have no text to compare; propagate them as NaN.
        if pd.isna(value):
            return np.nan
        text = str(value)
        longest = max(len(text), len(reference))
        # Two empty strings are identical; this also avoids dividing by zero.
        if longest == 0:
            return 1.0
        return 1 - (jellyfish.levenshtein_distance(text, reference) / longest)

    return column.apply(_score)

# compute_similarity is not applied in this cell: the features are used exactly
# as they were loaded from the encoded CSV.

# Separate the predictors from the 'target' label column.
X = df.drop(columns=['target'])
y = df['target']

# 4-fold cross-validation with shuffling; the fixed seed makes the folds repeatable.
kf = KFold(n_splits=4, shuffle=True, random_state=42)

# Accuracy of each fold, in fold order.
accuracy_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # A fresh model per fold, used only to estimate performance.
    fold_model = RandomForestClassifier(random_state=42)
    fold_model.fit(X_train, y_train)

    # Score the fold on the rows it did not see during training.
    y_pred = fold_model.predict(X_test)
    accuracy = (y_pred == y_test).mean()
    accuracy_scores.append(accuracy)

    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Mean accuracy over all folds.
average_accuracy = np.mean(accuracy_scores)
print(f"Average Accuracy across {kf.n_splits} folds: {average_accuracy:.2f}")

# Cross-validation only estimates performance. The model that gets persisted is
# refitted on every row instead of being whichever fold happened to train last.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)
joblib.dump(model, '../models/random_forest_model.pkl')
print("Model saved successfully.")


Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       132
           1       1.00      0.98      0.99       125

    accuracy                           0.99       257
   macro avg       0.99      0.99      0.99       257
weighted avg       0.99      0.99      0.99       257

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       122
           1       1.00      1.00      1.00       134

    accuracy                           1.00       256
   macro avg       1.00      1.00      1.00       256
weighted avg       1.00      1.00      1.00       256

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       122
           1       1.00      1.00      1.00       134

    accuracy                           1.00       256
   macro avg       1.00      1.00      1.00       256
weigh

In [10]:
pip install jellyfish

Collecting jellyfish
  Downloading jellyfish-1.1.0-cp311-none-win_amd64.whl.metadata (2.6 kB)
Downloading jellyfish-1.1.0-cp311-none-win_amd64.whl (207 kB)
   ---------------------------------------- 0.0/207.3 kB ? eta -:--:--
   ------- -------------------------------- 41.0/207.3 kB 1.9 MB/s eta 0:00:01
   ---------------------------------------  204.8/207.3 kB 2.5 MB/s eta 0:00:01
   ---------------------------------------- 207.3/207.3 kB 2.5 MB/s eta 0:00:00
Installing collected packages: jellyfish
Successfully installed jellyfish-1.1.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip
