## Good Health And Well-being 👩‍⚕️🫀💉💊🩸
"This project aims to develop an AI-driven solution to identify underserved areas by analyzing socioeconomic and healthcare infrastructure data. By pinpointing areas with limited healthcare resources, we can recommend resource allocation strategies to improve accessibility, ensuring equitable healthcare access and promoting overall community well-being."

#Importing the libraries

In [10]:
import numpy as np #to work with arrays
import pandas as pd #to work with data
import matplotlib.pyplot as plt #to visualize the data

## Importing the dataset

In [11]:
data= pd.read_csv('expanded_healthcare_accessibility_data.csv')# Load dataset


##Data Preprocessing
Encode categorical columns

In [12]:
#Here we implement StandardScaler that is used to standardize numerical features by removing the mean and scaling to unit variance.
# LabelEncoder is used to convert categorical labels (text labels) into numerical values.
from sklearn.preprocessing import StandardScaler, LabelEncoder
for column in ['region_name', 'country']:
    data[column] = LabelEncoder().fit_transform(data[column])


# Scale numerical features for uniformity

In [13]:
scaler = StandardScaler()
numerical_features = ['num_hospitals', 'num_clinics', 'hospital_capacity_per_1000','clinic_capacity_per_1000', 'healthcare_access_index',
                      'healthcare_quality_index', 'population_density','median_income', 'education_index', 'unemployment_rate']
data[numerical_features] = scaler.fit_transform(data[numerical_features])

**Statistics and Visualizations for Exploratory Data Analysis (EDA)**

In [None]:
print("Dataset Info:")
data.info()
print("\nFirst Few Rows of the Dataset:")
print(data.head())

In [None]:
# Summary Statistics
print("\nSummary Statistics:")
print(data.describe())

In [8]:
# Visualizing Distribution of Each Numerical Feature
numerical_features = data.select_dtypes(include=[np.number]).columns


In [None]:
# Plot distribution for each numerical feature
import seaborn as sns
for feature in numerical_features:
    plt.figure(figsize=(8, 4))
    sns.histplot(data[feature], kde=True)
    plt.title(f'Distribution of {feature}')
    plt.show()

In [None]:
# Pairplot for Numerical Features
sns.pairplot(data[numerical_features])
plt.suptitle("Pairplot of Numerical Features", y=1.02)
plt.show()

In [None]:
# Correlation Heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(data[numerical_features].corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Heatmap of Numerical Features")
plt.show()

In [None]:
# Boxplot to Detect Outliers in Each Numerical Feature
for feature in numerical_features:
    plt.figure(figsize=(8, 4))
    sns.boxplot(x=data[feature])
    plt.title(f'Boxplot of {feature}')
    plt.show()

**Define underserved threshold based on the 25th percentile of healthcare_access_index**

In [22]:
threshold = data['healthcare_access_index'].quantile(0.25)
data['underserved'] = (data['healthcare_access_index'] < threshold).astype(int)


In [23]:
# Features and target variable
X = data[numerical_features]  # Use scaled numerical features as predictors
y = data['underserved']

#Splitting the dataset into the Training set and Test set

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Logistic regression model

In [25]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

In [26]:
from sklearn.metrics import accuracy_score, classification_report# Initial Evaluation
y_pred = model.predict(X_test)
print("Initial Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Initial Accuracy: 0.98
              precision    recall  f1-score   support

           0       1.00      0.97      0.99        74
           1       0.93      1.00      0.96        26

    accuracy                           0.98       100
   macro avg       0.96      0.99      0.97       100
weighted avg       0.98      0.98      0.98       100



**Active learning loop to iteratively improve model by querying uncertain samples.**
    

In [27]:
def active_learning(model, X_train, y_train, X_pool, y_pool, X_test, y_test, n_iterations=10, batch_size=10):

    # Ensure that X_pool and y_pool are DataFrames or Series
    X_train = pd.DataFrame(X_train)
    y_train = pd.Series(y_train)
    X_pool = pd.DataFrame(X_pool)
    y_pool = pd.Series(y_pool)
    X_test = pd.DataFrame(X_test)
    y_test = pd.Series(y_test)

    for i in range(n_iterations):
        # Predict probabilities on the pool set
        probs = model.predict_proba(X_pool)[:, 1]
        uncertainty = np.abs(probs - 0.5)  # uncertainty for binary classification

        # Select samples with highest uncertainty
        uncertain_indices = uncertainty.argsort()[:batch_size]
        X_selected, y_selected = X_pool.iloc[uncertain_indices], y_pool.iloc[uncertain_indices]

        # Add these to the training set
        X_train = pd.concat([X_train, X_selected], ignore_index=True)
        y_train = pd.concat([y_train, y_selected], ignore_index=True)

        # Remove selected samples from pool
        X_pool = X_pool.drop(X_selected.index).reset_index(drop=True)
        y_pool = y_pool.drop(y_selected.index).reset_index(drop=True)
        model.fit(X_train, y_train)# Retrain model

        # Evaluate on test set
        y_test_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_test_pred)
        print(f"Iteration {i + 1}, Test Accuracy: {accuracy}")
        print(confusion_matrix(y_test, y_test_pred))

    return model

In [28]:
# Split remaining data as pool for active learning
X_pool, y_pool = X_train[100:], y_train[100:]
X_train, y_train = X_train[:100], y_train[:100]


In [29]:
# Re-initialize model for active learning process
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)


In [30]:
from sklearn.metrics import accuracy_score, classification_report
y_final_pred = model.predict(X_test)
print("Final Accuracy:", accuracy_score(y_test, y_final_pred))
print(classification_report(y_test, y_final_pred))

Final Accuracy: 0.93
              precision    recall  f1-score   support

           0       0.97      0.93      0.95        74
           1       0.83      0.92      0.87        26

    accuracy                           0.93       100
   macro avg       0.90      0.93      0.91       100
weighted avg       0.93      0.93      0.93       100

