In [2]:
import pandas as pd
import numpy as np
import random

In [9]:
# function to generate realistic random data

def generate_realistic_data(num_rows):
    np.random.seed(0)
    random.seed(0)
    
    data = {
        'Patient_ID': range(1, num_rows +1),
        'Age': np.random.randint(20,85, size= num_rows), 
        'Ethnicity': np.random.choice(['Asian', 'Hispanic', 'White', 'Black'], size=num_rows),
        'Socioeconomic_Status': np.random.choice(['Low', 'Middle', 'High'], size=num_rows),
        'Geographic_Location': np.random.choice(['Urban', 'Rural'], size=num_rows),
        'Insurance_Status': np.random.choice(['Yes', 'No'], size=num_rows),
        'Family_Medical_History': np.random.choice(['Hypertension', 'Asthma', 'Heart Disease', 'Diabetes', 'None'], size=num_rows),
        'Past_Medical_Condition': np.random.choice(['Diabetes', 'Asthma', 'None', 'Hypertension'], size=num_rows),
        'Previous_Surgeries': np.random.choice(['Yes', 'No'], size=num_rows),
        'Chronic_Illness_History': np.random.choice(['Yes', 'No'], size=num_rows),
        'Allergies': np.random.choice(['None', 'Penicillin', 'Sulfa', 'Dust'], size=num_rows),
        'Dietary_Habits': np.random.choice(['High-carb diet', 'Balanced diet', 'Low-carb diet', 'Vegan diet', 'High-fat diet'], size=num_rows),
        'Physical_Activity': np.random.choice(['Regular', 'Light exercise', 'Sedentary', 'Moderate', 'High'], size=num_rows),
        'Sleep_Patterns': np.random.choice(['Good', 'Normal', 'Poor'], size=num_rows),
        'Substance_Use': np.random.choice(['Alcohol', 'Smoking', 'None'], size=num_rows),
        'Stress_Levels': np.random.choice(['Low', 'Moderate', 'High'], size=num_rows),
        'Medication_Type': np.random.choice(['Metformin', 'Lisinopril', 'Aspirin', 'Albuterol', 'Insulin', None], size=num_rows),
        'Dosage': np.random.choice(['500mg', '20mg', '100mg', '90mcg', '30u', '40u', None], size=num_rows),
        'Adherence': np.random.randint(50,100, size=num_rows),
        'Side_Effects': np.random.choice(['None', 'Nausea', 'Headache', 'Dizziness', 'Weight Gain'], size=num_rows),
        'Vital_Signs': [f'{random.randint(100,1500)}/{random.randint(60,95)}' for _ in range(num_rows)],
        'Lab_Test_Results': np.random.choice(['Normal', 'Glucose: 140', 'Cholesterol: 200', 'Glucose: 160', 'Glucose: 180', 'Glucose: 150', 'Glucose: 170'], size=num_rows),
        'Imaging_Results': np.random.choice(['Normal', 'Elevated'], size=num_rows),
        'Symptoms': np.random.choice(['None', 'Fatigue', 'Cough', 'Chest Pain', 'Shortness of Breath', 'Dizziness'], size=num_rows),
        'Complications': np.random.choice(['None', 'Respiratory', 'Foot Ulcers', 'None'], size=num_rows),
        'Engagement_Level': np.random.choice(['Low', 'Medium', 'High'], size=num_rows),
        'Compliance': np.random.randint(50, 100, size=num_rows),
        'Care_Coordination': np.random.choice(['Poor', 'Fair', 'Good', 'Excellent'], size=num_rows),
        'Use_of_Apps': np.random.choice(['Yes', 'No'], size=num_rows),
        'Mental_Health': np.random.choice(['Poor', 'Fair', 'Good', 'Excellent'], size=num_rows),
        'Social_Support': np.random.choice(['Low', 'Moderate', 'High'], size=num_rows),
        'Quality_of_Life': np.random.choice(['Poor', 'Fair', 'Good', 'Excellent'], size=num_rows),
        'Cost_of_Treatment': np.random.randint(500, 2000, size=num_rows),

    }
    
    df = pd.DataFrame(data)
    
    # Introduce null values randomly
    
    for col in df.columns:
        if col != 'Patient_ID':
            
            df.loc[df.sample(frac=0.1).index, col] = None
        
    # Introduce outliers
    
    outlier_indices = np.random.choice(df.index, size=int(0.05 * num_rows), replace=False)
    df.loc[outlier_indices, 'Cost_of_Treatment'] = df['Cost_of_Treatment'] * 10  # Extreme cost of outliers
    
    return df

# Generate the data
num_rows = 5000
df = generate_realistic_data(num_rows)


In [10]:
df.to_csv("C:\Mentorship\Healthcare_data.csv", index=False)

In [11]:
df.isnull().sum()

Patient_ID                    0
Age                         500
Ethnicity                   500
Socioeconomic_Status        500
Geographic_Location         500
Insurance_Status            500
Family_Medical_History      500
Past_Medical_Condition      500
Previous_Surgeries          500
Chronic_Illness_History     500
Allergies                   500
Dietary_Habits              500
Physical_Activity           500
Sleep_Patterns              500
Substance_Use               500
Stress_Levels               500
Medication_Type            1212
Dosage                     1181
Adherence                   500
Side_Effects                500
Vital_Signs                 500
Lab_Test_Results            500
Imaging_Results             500
Symptoms                    500
Complications               500
Engagement_Level            500
Compliance                  500
Care_Coordination           500
Use_of_Apps                 500
Mental_Health               500
Social_Support              500
Quality_

In [5]:
df

Unnamed: 0,Patient_ID,Age,Ethnicity,Socioeconomic_Status,Geographic_Location,Insurance_Status,Family_Medical_History,Past_Medical_Condition,Previous_Surgeries,Chronic_Illness_History,...,Symptoms,Complications,Engagement_Level,Compliance,Care_Coordination,Use_of_Apps,Mental_Health,Social_Support,Quality_of_Life,Cost_of_Treatment
0,1.0,64.0,,High,Rural,Yes,Hypertension,Diabetes,No,Yes,...,,,Low,94.0,Poor,Yes,Poor,High,Good,801.0
1,2.0,67.0,Black,Low,Urban,Yes,Heart Disease,Asthma,Yes,No,...,,Respiratory,High,98.0,Poor,,Good,,Excellent,666.0
2,3.0,,White,Low,Urban,Yes,Hypertension,,Yes,,...,Dizziness,,High,83.0,Fair,Yes,Fair,Moderate,Excellent,772.0
3,4.0,29.0,Black,High,Urban,Yes,Diabetes,Diabetes,Yes,No,...,,,,58.0,Fair,No,Excellent,Moderate,,515.0
4,5.0,,Asian,Low,Urban,Yes,Hypertension,Hypertension,Yes,No,...,Shortness of Breath,Foot Ulcers,High,,Good,Yes,Fair,Low,Fair,537.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996.0,70.0,Asian,Middle,Urban,,Asthma,Diabetes,Yes,No,...,Shortness of Breath,Respiratory,Medium,,Poor,Yes,Poor,Moderate,Good,1723.0
996,997.0,73.0,Hispanic,Middle,,No,,Asthma,Yes,Yes,...,Chest Pain,,Low,99.0,Excellent,,Good,Low,Poor,1939.0
997,998.0,30.0,White,High,Rural,Yes,Diabetes,Hypertension,Yes,No,...,,Respiratory,,74.0,,No,Poor,Low,Poor,
998,999.0,63.0,Black,Middle,Urban,No,Heart Disease,Hypertension,No,No,...,Chest Pain,,Medium,78.0,Fair,No,Good,Moderate,Fair,11530.0


In [11]:
# set random seed for reproducibility

np.random.seed(42)

# number of rows

num_rows = 1000

# generating data sample

data1 = {
    'PatientID': np.arange(1, num_rows + 1),
    'DiabetesType': np.random.choice(['Type 1', 'Type 2', 'Prediabetes'], num_rows),
    'Age': np.random.randint(20,85, num_rows),
    'Ethnicity': np.random.choice(['Asian', 'Hispanic', 'White', 'Black'], num_rows),
    'AnnualMedicalExpenditures': np.random.normal(loc=8000, scale=3000, size=num_rows).round(2),
    'DirectMedicalCosts': np.random.normal(loc=2500, scale=2000, size=num_rows).round(2),
    'IndirectCosts': np.random.normal(loc=2000, scale=1500, size=num_rows).round(2),
    'EmploymentStatus': np.random.choice(['Employed', 'Unemployed', 'Retired'], num_rows),
    'HealthcareUtilization': np.random.randint(1, 12, num_rows),  # Number of visits per year
    'InsulinCosts': np.random.normal(loc=1500, scale=500, size=num_rows).round(2),
    'InsulinCosts': np.random.normal(loc=1500, scale=500, size=num_rows).round(2),
    'ProductivityLosses': np.random.normal(loc=1000, scale=700, size=num_rows).round(2)

}

# Adding some noise and outliers

for col in ['AnnualMedicalExpenditures', 'DirectMedicalCosts', 'IndirectCosts', 'InsulinCosts', 'ProductivityLosses']:
    noise_indices = np.random.choice(num_rows, size=int(num_rows * 0.05), replace=False)
    data1[col] = np.array(data1[col]) # convert to numpy array for easy modification
    data1[col][noise_indices] *=np.random.uniform(1.5, 3.0, size=len(noise_indices))
    
# create data frame

df1 = pd.DataFrame(data1)

# introducing some null values randomly

null_indices = np.random.choice(num_rows, size=int(num_rows * 0.03), replace=False)
df1.loc[null_indices, 'AnnualMedicalExpenditures'] = np.nan
df1.loc[null_indices, 'DirectMedicalCosts'] = np.nan



In [12]:
df1.head()

Unnamed: 0,PatientID,DiabetesType,Age,Ethnicity,AnnualMedicalExpenditures,DirectMedicalCosts,IndirectCosts,EmploymentStatus,HealthcareUtilization,InsulinCosts,ProductivityLosses
0,1,Prediabetes,22,Asian,4826.85,436.53,3138.64,Unemployed,10,1768.79,-205.79
1,2,Type 1,50,Asian,7876.95,4726.72,442.87,Unemployed,8,1776.5,1085.79
2,3,Prediabetes,59,Asian,8788.703521,2570.25,3438.08,Employed,3,2624.72,1316.47
3,4,Prediabetes,55,White,11521.18,282.01,2512.86,Employed,8,1046.19,73.47
4,5,Type 1,43,White,5414.24,-1002.74,1867.13,Employed,6,2084.08,1994.27


In [1]:
# download cleaned data from SQL 

In [2]:
!pip install pandas mysql-connector-python

INFO: pip is looking at multiple versions of pandas to determine which version is compatible with other requirements. This could take a while.


ERROR: Could not find a version that satisfies the requirement protobuf<=3.20.1,>=3.11.0 (from mysql-connector-python) (from versions: none)
ERROR: No matching distribution found for protobuf<=3.20.1,>=3.11.0


In [1]:
import pandas as pd
import mysql.connector

In [2]:
from mysql.connector import errors

In [3]:
conn = mysql.connector.connect(
host='localhost',
user='root',
password='Ravi@123',
database='Healthcare')

query = 'select * from healthcare_data'
df = pd.read_sql(query, conn)
df.head()



Unnamed: 0,Patient_ID,Age,Ethnicity,Socioeconomic_Status,Geographic_Location,Insurance_Status,Family_Medical_History,Past_Medical_Condition,Previous_Surgeries,Allergies,...,Substance_Use,Stress_Levels,Medication_Type,Dosage,Side_Effects,Symptoms,Mental_Health,Social_Support,Quality_of_Life,Cost_of_Treatment
0,1,46.636,White,High,Urban,No,Diabetes,,No,Dust,...,Alcohol,Low,Albuterol,100mg,Headache,Chest Pain,Good,Low,Poor,1441.0
1,2,67.0,Asian,High,Urban,No,Hypertension,Hypertension,No,Dust,...,,Low,Albuterol,20mg,Nausea,Cough,Good,Low,Poor,1995.0
2,3,84.0,Asian,Low,Rural,Yes,Diabetes,,No,,...,,Moderate,Lisinopril,40u,Nausea,Fatigue,Fair,Moderate,Poor,618.0
3,4,29.0,Asian,Middle,Urban,No,Asthma,Diabetes,No,Penicillin,...,Smoking,High,Insulin,100mg,Nausea,Fatigue,Fair,High,Poor,1033.0
4,5,41.0,Hispanic,Low,Urban,No,Diabetes,,No,,...,Smoking,Low,Metformin,90mcg,,Fatigue,Excellent,High,Fair,1637.0


In [15]:
df.to_csv("C:/Mentorship/New_Healthcare_data.csv", index=False)

In [4]:
df.columns

Index(['Patient_ID', 'Age', 'Ethnicity', 'Socioeconomic_Status',
       'Geographic_Location', 'Insurance_Status', 'Family_Medical_History',
       'Past_Medical_Condition', 'Previous_Surgeries', 'Allergies',
       'Dietary_Habits', 'Physical_Activity', 'Sleep_Patterns',
       'Substance_Use', 'Stress_Levels', 'Medication_Type', 'Dosage',
       'Side_Effects', 'Symptoms', 'Mental_Health', 'Social_Support',
       'Quality_of_Life', 'Cost_of_Treatment'],
      dtype='object')