# Make the dataset features numerical (suitable for a Logistic Regression model)

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the raw per-hospital datasets (when running in Colab, upload them first)
hospital_1_df = pd.read_csv('old_csv/HOSPITAL_1.csv')
hospital_2_df = pd.read_csv('old_csv/HOSPITAL_2.csv')

# The 17 predictor columns used for the target column "dialysis_encoded"
selected_features = [
    'Age of the patient', 'Blood pressure (mm/Hg)', 'Albumin in urine', 'Sugar in urine',
    'Random blood glucose level (mg/dl)', 'Body Mass Index (BMI)', 'Physical activity level',
    'Duration of diabetes mellitus (years)', 'Duration of hypertension (years)', 'Cystatin C level',
    'C-reactive protein (CRP) level', 'Interleukin-6 (IL-6) level', 'Red blood cells in urine',
    'Pus cells in urine', 'Pus cell clumps in urine', 'Bacteria in urine', 'Pedal edema (yes/no)'
]

# Keep only the predictors plus the target column.
# .copy() gives each selection its own data: these frames are modified column
# by column in the encoding step further down, and assigning into a bare
# column selection of another DataFrame triggers pandas' SettingWithCopyWarning.
hospital_1_selected = hospital_1_df[selected_features + ['dialysis_encoded']].copy()
hospital_2_selected = hospital_2_df[selected_features + ['dialysis_encoded']].copy()

# Module-level LabelEncoder, re-fitted for every categorical column it is applied to
encoder = LabelEncoder()

# Function to convert categorical columns to numeric
def convert_categorical_to_numeric(df):
    """Return a copy of *df* with every object-dtype column label-encoded.

    Each object-dtype column is cast to ``str`` and passed through
    ``encoder.fit_transform``; columns of any other dtype are returned as-is.
    The encoder is re-fitted for every column and every call, so integer
    codes produced by separate calls are not guaranteed to line up.

    The input frame is not modified: the encoding is applied to a copy, which
    avoids silently changing the caller's data (and the SettingWithCopyWarning
    raised when *df* is itself a selection from another DataFrame).

    NOTE(review): because of the ``astype(str)`` cast, missing values are
    encoded like any other label instead of staying missing -- confirm this
    is the intended treatment.

    Args:
        df: DataFrame whose object-dtype columns should be encoded.

    Returns:
        A new DataFrame with the same columns as *df*, object columns
        replaced by their integer label codes.
    """
    converted = df.copy()
    for col in converted.select_dtypes(include=['object']).columns:
        converted[col] = encoder.fit_transform(converted[col].astype(str))
    return converted

# Label-encode the categorical columns of each hospital's selected data
hospital_1_selected_numeric, hospital_2_selected_numeric = (
    convert_categorical_to_numeric(selected)
    for selected in (hospital_1_selected, hospital_2_selected)
)

# Write each encoded dataset to its own CSV file (row index omitted)
for _frame, _csv_name in (
    (hospital_1_selected_numeric, 'HOSPITAL_1_transformed_with_target.csv'),
    (hospital_2_selected_numeric, 'HOSPITAL_2_transformed_with_target.csv'),
):
    _frame.to_csv(_csv_name, index=False)

# Offer each transformed CSV as a download (requires the google.colab package)
from google.colab import files
for _csv_name in (
    'HOSPITAL_1_transformed_with_target.csv',
    'HOSPITAL_2_transformed_with_target.csv',
):
    files.download(_csv_name)


Project Exhibition

# Step 1: Import Necessary Libraries

In [None]:
# For data handling
import pandas as pd
import numpy as np

# For model training
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# For encoding categorical data
from sklearn.preprocessing import LabelEncoder


# Step 2: Load Datasets and Preprocess Data

In [None]:
# Read the transformed CSV of each hospital
hospital_1 = pd.read_csv('new_csv/HOSPITAL_1_transformed_with_target.csv')
hospital_2 = pd.read_csv('new_csv/HOSPITAL_2_transformed_with_target.csv')

# Hospital 1: target column, then every remaining column as the feature matrix
y_1 = hospital_1['dialysis_encoded']
X_1 = hospital_1.drop('dialysis_encoded', axis=1)

# Hospital 2: same split
y_2 = hospital_2['dialysis_encoded']
X_2 = hospital_2.drop('dialysis_encoded', axis=1)

# Step 3: Train Logistic Regression Model for Each Hospital

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Initialize Logistic Regression models for both hospitals.
# max_iter is a very high cap on solver iterations.
# NOTE(review): the features are not scaled here, which is presumably why such
# a high cap was chosen -- confirm.
model_1 = LogisticRegression(max_iter=1000000)
model_2 = LogisticRegression(max_iter=1000000)

# Split each hospital's data BEFORE imputing, so the imputation means are
# learned from the training rows only. Fitting the imputer on the full dataset
# would leak test-set statistics into training and bias the evaluation.
# (Same test_size / random_state as before, so the row split is unchanged.)
X_train_1_raw, X_test_1_raw, y_train_1, y_test_1 = train_test_split(
    X_1, y_1, test_size=0.2, random_state=42)
X_train_2_raw, X_test_2_raw, y_train_2, y_test_2 = train_test_split(
    X_2, y_2, test_size=0.2, random_state=42)

# One mean-imputer per hospital, so each keeps the statistics of its own
# training data instead of being overwritten by the other hospital's fit
imputer_1 = SimpleImputer(strategy='mean')
imputer_2 = SimpleImputer(strategy='mean')

# Learn the column means on the training rows, then apply them to the test rows
X_train_1 = imputer_1.fit_transform(X_train_1_raw)
X_test_1 = imputer_1.transform(X_test_1_raw)
X_train_2 = imputer_2.fit_transform(X_train_2_raw)
X_test_2 = imputer_2.transform(X_test_2_raw)

# Fully imputed feature matrices, kept under their original names in case a
# later cell uses them; they are filled with the training-set means
X_1_imputed = imputer_1.transform(X_1)
X_2_imputed = imputer_2.transform(X_2)

# Backward-compatible alias: the single shared imputer used previously ended
# up fitted on hospital 2's data
imputer = imputer_2

# Train the models
model_1.fit(X_train_1, y_train_1)
model_2.fit(X_train_2, y_train_2)

# Predict on the held-out test rows
y_pred_1 = model_1.predict(X_test_1)
y_pred_2 = model_2.predict(X_test_2)


# Step 4: Evaluate Models for Each Hospital

In [None]:
# Report test-set accuracy, then the confusion matrix, for each hospital
from sklearn.metrics import accuracy_score

accuracy_1 = accuracy_score(y_test_1, y_pred_1)
print(f"Accuracy for Hospital 1: {accuracy_1}")
accuracy_2 = accuracy_score(y_test_2, y_pred_2)
print(f"Accuracy for Hospital 2: {accuracy_2}")

# Hospital 1: true labels vs. predicted labels
conf_matrix_1 = confusion_matrix(y_test_1, y_pred_1)
print(f"Confusion Matrix for Hospital 1: \n{conf_matrix_1}")

# Hospital 2: true labels vs. predicted labels
conf_matrix_2 = confusion_matrix(y_test_2, y_pred_2)
print(f"Confusion Matrix for Hospital 2: \n{conf_matrix_2}")