In [9]:
import pandas as pd

# Load datasets
medical_df = pd.read_csv("medical_records.csv")
patient_df = pd.read_csv("patient_details.csv")

print(medical_df.head())
print(patient_df.head())


  record_id patient_id     diagnosis      bp  sugar_level  visit_cost
0      R101       P013  Hypertension  120/80        140.0        2000
1      R102       P105      Diabetes  120/80          NaN        2500
2      R103       P098  Hypertension    high        140.0        3000
3      R104       P038        Asthma  140/90          NaN        3000
4      R105       P040        Asthma  130/85          NaN        1500
  patient_id          name   age gender     city admission_date height_cm
0       P001           NaN  25.0      F   mumbai     15/01/2024       180
1       P002  Rahul Sharma  60.0      F    Delhi     2024/01/16       170
2       P003    Neha Singh  45.0      M   Mumbai     2024/01/16       150
3       P004    Neha Singh  60.0   Male  kolkata     15/01/2024       170
4       P005    Neha Singh  25.0   Male  kolkata     2024-01-15       180


In [10]:
import pandas as pd

medical_df = pd.read_csv("medical_records.csv")
patient_df = pd.read_csv("patient_details.csv")

medical_df['bp'] = medical_df['bp'].replace('high', pd.NA)

medical_df[['systolic_bp', 'diastolic_bp']] = medical_df['bp'].str.split('/', expand=True)

medical_df['systolic_bp'] = pd.to_numeric(medical_df['systolic_bp'], errors='coerce')
medical_df['diastolic_bp'] = pd.to_numeric(medical_df['diastolic_bp'], errors='coerce')

medical_df['sugar_level'] = pd.to_numeric(medical_df['sugar_level'], errors='coerce')
medical_df['sugar_level'].fillna(medical_df['sugar_level'].mean(), inplace=True)

medical_df.drop(columns=['bp'], inplace=True)


patient_df['name'].fillna("Unknown", inplace=True)

patient_df['gender'] = patient_df['gender'].replace({
    'M': 'Male',
    'F': 'Female',
    'male': 'Male',
    'female': 'Female'
})

patient_df['city'] = patient_df['city'].str.title()


patient_df['admission_date'] = pd.to_datetime(
    patient_df['admission_date'],
    errors='coerce',
    dayfirst=True
)


final_df = pd.merge(
    medical_df,
    patient_df,
    on='patient_id',
    how='left'
)


final_df.to_csv("final_merged_dataset.csv", index=False)


print("Data cleaned and merged successfully!")
print(final_df.head())
print(final_df.info())


Data cleaned and merged successfully!
  record_id patient_id     diagnosis  sugar_level  visit_cost  systolic_bp  \
0      R101       P013  Hypertension   140.000000        2000        120.0   
1      R102       P105      Diabetes   141.058824        2500        120.0   
2      R103       P098  Hypertension   140.000000        3000          NaN   
3      R104       P038        Asthma   141.058824        3000        140.0   
4      R105       P040        Asthma   141.058824        1500        130.0   

   diastolic_bp        name   age  gender     city admission_date height_cm  
0          80.0   John Paul  25.0  Female    Delhi     2024-01-15       abc  
1          80.0   John Paul   NaN  Female   Mumbai     2024-01-15       170  
2           NaN   John Paul  60.0  Female  Kolkata            NaT       170  
3          90.0     Unknown  60.0    Male   Mumbai            NaT       170  
4          85.0  Neha Singh   NaN  Female  Chennai            NaT       150  
<class 'pandas.core.frame

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  medical_df['sugar_level'].fillna(medical_df['sugar_level'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  patient_df['name'].fillna("Unknown", inplace=True)


In [14]:
print(medical_df.columns)



Index(['record_id', 'patient_id', 'diagnosis', 'sugar_level', 'visit_cost',
       'systolic_bp', 'diastolic_bp'],
      dtype='object')


In [12]:
# Fill missing names
patient_df['name'] = patient_df['name'].fillna("Unknown")

# Standardize gender
patient_df['gender'] = patient_df['gender'].replace({
    'M': 'Male',
    'F': 'Female',
    'male': 'Male',
    'female': 'Female'
})

# Standardize city names
patient_df['city'] = patient_df['city'].str.title()

# Convert admission_date to datetime
patient_df['admission_date'] = pd.to_datetime(
    patient_df['admission_date'],
    errors='coerce',
    dayfirst=True
)
