In [2]:
import pandas as pd

# Load the dataset. The path points at a zipped CSV and is handed to
# read_csv as-is (pandas infers the compression from the file extension by default).
# NOTE(review): the file name is spelled "recordd" — confirm it matches the file on disk.
file_path = "healthcare_recordd.csv.zip"
df = pd.read_csv(file_path)

# Display basic information: column names, non-null counts, dtypes and memory usage
df.info()

# Show the first few rows (kept as the last expression so the notebook renders it)
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55500 entries, 0 to 55499
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Name                55500 non-null  object 
 1   Age                 55500 non-null  int64  
 2   Gender              55500 non-null  object 
 3   Blood Type          55500 non-null  object 
 4   Medical Condition   55500 non-null  object 
 5   Date of Admission   55500 non-null  object 
 6   Doctor              55500 non-null  object 
 7   Hospital            55500 non-null  object 
 8   Insurance Provider  55500 non-null  object 
 9   Billing Amount      55500 non-null  float64
 10  Room Number         55500 non-null  int64  
 11  Admission Type      55500 non-null  object 
 12  Discharge Date      55500 non-null  object 
 13  Medication          55500 non-null  object 
 14  Test Results        55500 non-null  object 
dtypes: float64(1), int64(2), object(12)
memory usage: 6.4+ MB

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Bobby JacksOn,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal
1,LesLie TErRy,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,DaNnY sMitH,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal
3,andrEw waTtS,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.78241,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,adrIENNE bEll,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal


In [3]:
def clean_name(name):
    """Return ``name`` with consistent capitalisation and single spaces.

    Each whitespace-separated word is capitalised (first letter upper-case,
    remaining letters lower-case). Hyphenated words are capitalised part by
    part, so "smith-JONES" becomes "Smith-Jones" instead of "Smith-jones".
    Leading, trailing and repeated whitespace is dropped.

    Parameters
    ----------
    name : str or any
        Raw name. Non-string values (e.g. None or NaN from a missing cell)
        are returned unchanged rather than raising.

    Returns
    -------
    str or any
        The cleaned name, or the original object when it is not a string.
    """
    # Missing cells arrive as None/float('nan'); leave them for the caller.
    if not isinstance(name, str):
        return name
    return ' '.join(
        # str.capitalize() lower-cases everything after the first character,
        # so apply it to each hyphen-separated part individually.
        '-'.join(part.capitalize() for part in word.split('-'))
        for word in name.split()
    )

# Element-wise clean-up of the raw names; the original 'Name' column is kept.
df['Name_Cleaned'] = df['Name'].map(clean_name)


In [4]:
# Lookup table: lower-cased raw spelling -> canonical condition name.
# Misspellings map to the corrected term; correct spellings map to themselves.
condition_map = {
    "diabtes": "diabetes",                                # typo
    "obesity": "obesity",
    "cancer": "cancer",
    "hypertension": "hypertension",
    "hypertnsion": "hypertension",                        # typo
    "cardio vascular disease": "cardiovascular disease",  # split word
    "cardio-vascular disease": "cardiovascular disease",  # hyphenated
    "asthama": "asthma",                                  # typo
    "asthma": "asthma",
    "diabetes": "diabetes",
}


In [5]:
import re

def normalize_condition(text):
    """Normalise a free-text medical condition to its canonical spelling.

    The text is lower-cased, every character other than a-z, whitespace and
    '-' is removed, whitespace runs are collapsed to a single space and the
    ends are trimmed. The result is then looked up in ``condition_map``.

    Parameters
    ----------
    text : str or any
        Raw condition label. Non-string values (e.g. None or NaN) are
        returned unchanged rather than raising.

    Returns
    -------
    str or any
        The mapped canonical name, or the cleaned text when it has no entry
        in ``condition_map``.
    """
    if not isinstance(text, str):
        return text
    text = re.sub(r'[^a-z\s-]', '', text.lower())
    # Trim/collapse AFTER stripping characters: removing e.g. digits can leave
    # trailing or doubled spaces that would otherwise defeat the lookup.
    text = ' '.join(text.split())
    return condition_map.get(text, text)  # If not in dict, return cleaned version

# Rule-based normalisation, stored next to the untouched source column.
df['Medical_Condition_Cleaned'] = df['Medical Condition'].map(normalize_condition)


In [6]:
import pandas as pd

# Tiny hand-written training set: one (noisy input, canonical label) pair per row.
train_data = pd.DataFrame(
    [
        ('diabtes', 'diabetes'),
        ('obesity', 'obesity'),
        ('hypertnsion', 'hypertension'),
        ('cancer', 'cancer'),
        ('asthama', 'asthma'),
        ('cardio vascular disease', 'cardiovascular disease'),
    ],
    columns=['input', 'label'],
)


In [7]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Text classifier that maps a (possibly misspelled) condition to its canonical label.
#
# The vectorizer works on character n-grams inside word boundaries instead of
# whole-word tokens. With word tokens, any string not seen verbatim during
# training (e.g. the correct spelling 'diabetes' when only 'diabtes' was
# trained on) has no known features, becomes an all-zero vector, and every such
# input receives the same arbitrary label. Character n-grams let a typo and its
# correct spelling share most of their features.
#
# Caveat: the classifier can only emit labels present in train_data['label'];
# a condition outside that set is still forced into one of those classes.
model = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 4))),
    ('clf', LogisticRegression())
])

model.fit(train_data['input'], train_data['label'])


In [8]:
# Lower-case the raw conditions (the training inputs are lower-case) and store the model's label.
df['Medical_Condition_ML_Predicted'] = model.predict(df['Medical Condition'].str.lower())


In [9]:
# Keep each raw column beside its cleaned / predicted counterpart for easy comparison,
# then write the result to disk without the index.
df_cleaned = df.loc[:, [
    'Name',
    'Name_Cleaned',
    'Medical Condition',
    'Medical_Condition_Cleaned',
    'Medical_Condition_ML_Predicted',
]]
df_cleaned.to_csv(path_or_buf="cleaned_healthcare_records.csv", index=False)
print("✅ Cleaned dataset saved as cleaned_healthcare_records.csv")


✅ Cleaned dataset saved as cleaned_healthcare_records.csv
