# Healthcare Patient Records
### Problem Statement:
- Clean and explore patient data: handle missing age/gender values, calculate bill statistics, and analyze diagnosis patterns over time.
- Key techniques: `.dropna()`, `.replace()`, `.mean()`, `.apply()`, `.astype()`, `np.select()`, `np.isnan()`

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw patient records. read_csv already returns a fresh DataFrame,
# so no defensive copy is needed before cleaning it.
df = pd.read_csv('healthcare_patients.csv')

# Preview the first rows instead of dumping the whole frame.
df.head(10)

Unnamed: 0,Patient_ID,Age,Gender,Admission_Date,Symptoms,Diagnosis,Bill_Amount
0,5001,55.0,,2024-01-01,,Malaria,
1,5002,55.0,Female,2024-01-03,Fever,Flu,
2,5003,55.0,,2024-01-05,Fatigue,COVID-19,9000.0
3,5004,45.0,Male,2024-01-07,,Typhoid,7500.0
4,5005,55.0,Male,2024-01-09,Nausea,Flu,9000.0
5,5006,55.0,Male,2024-01-11,,Typhoid,5000.0
6,5007,,Male,2024-01-13,Nausea,COVID-19,11000.0
7,5008,55.0,,2024-01-15,Fever,Malaria,11000.0
8,5009,45.0,,2024-01-17,,Malaria,9000.0
9,5010,,Male,2024-01-19,Fatigue,Flu,5000.0


In [4]:
# Summarise column dtypes and non-null counts to see which columns need cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Patient_ID      50 non-null     int64  
 1   Age             40 non-null     float64
 2   Gender          32 non-null     object 
 3   Admission_Date  50 non-null     object 
 4   Symptoms        37 non-null     object 
 5   Diagnosis       50 non-null     object 
 6   Bill_Amount     39 non-null     float64
dtypes: float64(2), int64(1), object(4)
memory usage: 2.9+ KB


In [5]:
# Count missing values per column before any imputation.
df.isna().sum()

Patient_ID         0
Age               10
Gender            18
Admission_Date     0
Symptoms          13
Diagnosis          0
Bill_Amount       11
dtype: int64

In [6]:
# Frequency of each recorded age (missing ages are left out of the tally).
df["Age"].value_counts()

Age
55.0    13
25.0    10
45.0     9
35.0     8
Name: count, dtype: int64

In [7]:
# Mean of the recorded ages; this is the value used to impute missing ages.
df["Age"].mean()

np.float64(41.25)

In [11]:
# Replace missing ages with the mean age rounded to a whole number.
df["Age"] = df["Age"].fillna(df["Age"].mean().round())

In [12]:
# Preview the first rows after age imputation rather than dumping the whole frame.
df.head(10)

Unnamed: 0,Patient_ID,Age,Gender,Admission_Date,Symptoms,Diagnosis,Bill_Amount
0,5001,55.0,,2024-01-01,,Malaria,
1,5002,55.0,Female,2024-01-03,Fever,Flu,
2,5003,55.0,,2024-01-05,Fatigue,COVID-19,9000.0
3,5004,45.0,Male,2024-01-07,,Typhoid,7500.0
4,5005,55.0,Male,2024-01-09,Nausea,Flu,9000.0
5,5006,55.0,Male,2024-01-11,,Typhoid,5000.0
6,5007,41.0,Male,2024-01-13,Nausea,COVID-19,11000.0
7,5008,55.0,,2024-01-15,Fever,Malaria,11000.0
8,5009,45.0,,2024-01-17,,Malaria,9000.0
9,5010,41.0,Male,2024-01-19,Fatigue,Flu,5000.0


In [13]:
# Re-check missing values: Age should now report zero.
df.isna().sum()

Patient_ID         0
Age                0
Gender            18
Admission_Date     0
Symptoms          13
Diagnosis          0
Bill_Amount       11
dtype: int64

In [14]:
# Distribution of the recorded gender labels (missing entries are not counted).
df["Gender"].value_counts()

Gender
Female    16
Male      16
Name: count, dtype: int64

In [15]:
# Missing Gender values will be replaced at random with 'Male' or 'Female'.
# First collect the index labels of the rows whose Gender is missing.
null_index = df.index[df['Gender'].isna()]

In [17]:
# Fill every missing Gender in a single vectorised assignment, drawing
# 'Male'/'Female' uniformly at random. A seeded Generator makes the imputation
# reproducible across kernel restarts.
gender_rng = np.random.default_rng(42)

# Recompute the mask here instead of relying on `null_index`, which a later
# cell reassigns; this keeps the cell safe to re-run in any order.
gender_missing = df['Gender'].isna()
df.loc[gender_missing, 'Gender'] = gender_rng.choice(
    ['Male', 'Female'], size=int(gender_missing.sum())
)

In [18]:
# Preview the first rows after gender imputation rather than dumping the whole frame.
df.head(10)

Unnamed: 0,Patient_ID,Age,Gender,Admission_Date,Symptoms,Diagnosis,Bill_Amount
0,5001,55.0,Female,2024-01-01,,Malaria,
1,5002,55.0,Female,2024-01-03,Fever,Flu,
2,5003,55.0,Female,2024-01-05,Fatigue,COVID-19,9000.0
3,5004,45.0,Male,2024-01-07,,Typhoid,7500.0
4,5005,55.0,Male,2024-01-09,Nausea,Flu,9000.0
5,5006,55.0,Male,2024-01-11,,Typhoid,5000.0
6,5007,41.0,Male,2024-01-13,Nausea,COVID-19,11000.0
7,5008,55.0,Female,2024-01-15,Fever,Malaria,11000.0
8,5009,45.0,Female,2024-01-17,,Malaria,9000.0
9,5010,41.0,Male,2024-01-19,Fatigue,Flu,5000.0


In [19]:
# Re-check missing values: Gender should now report zero.
df.isna().sum()

Patient_ID         0
Age                0
Gender             0
Admission_Date     0
Symptoms          13
Diagnosis          0
Bill_Amount       11
dtype: int64

In [20]:
# Distribution of the recorded symptom labels (missing entries are not counted).
df["Symptoms"].value_counts()

Symptoms
Cough      11
Fatigue    10
Fever       8
Nausea      8
Name: count, dtype: int64

In [24]:
# Distinct symptom labels that are actually present (missing rows filtered out first).
df.loc[df['Symptoms'].notna(), 'Symptoms'].unique()

array(['Fever', 'Fatigue', 'Nausea', 'Cough'], dtype=object)

In [25]:
# Impute missing Symptoms by sampling from the symptom labels observed in the
# data. Each observed label is equally likely (observed frequencies are not
# used as weights). A seeded Generator keeps the result reproducible.
symptom_rng = np.random.default_rng(43)

null_index = df.index[df['Symptoms'].isna()]
observed_symptoms = df['Symptoms'].dropna().unique()
df.loc[null_index, 'Symptoms'] = symptom_rng.choice(observed_symptoms, size=len(null_index))

In [26]:
# Preview the first rows after symptom imputation rather than dumping the whole frame.
df.head(10)

Unnamed: 0,Patient_ID,Age,Gender,Admission_Date,Symptoms,Diagnosis,Bill_Amount
0,5001,55.0,Female,2024-01-01,Cough,Malaria,
1,5002,55.0,Female,2024-01-03,Fever,Flu,
2,5003,55.0,Female,2024-01-05,Fatigue,COVID-19,9000.0
3,5004,45.0,Male,2024-01-07,Nausea,Typhoid,7500.0
4,5005,55.0,Male,2024-01-09,Nausea,Flu,9000.0
5,5006,55.0,Male,2024-01-11,Fever,Typhoid,5000.0
6,5007,41.0,Male,2024-01-13,Nausea,COVID-19,11000.0
7,5008,55.0,Female,2024-01-15,Fever,Malaria,11000.0
8,5009,45.0,Female,2024-01-17,Nausea,Malaria,9000.0
9,5010,41.0,Male,2024-01-19,Fatigue,Flu,5000.0


In [27]:
# Re-check missing values: only Bill_Amount should still have gaps.
df.isna().sum()

Patient_ID         0
Age                0
Gender             0
Admission_Date     0
Symptoms           0
Diagnosis          0
Bill_Amount       11
dtype: int64

In [28]:
# Replace missing bill amounts with the mean bill rounded to a whole number.
df["Bill_Amount"] = df["Bill_Amount"].fillna(df["Bill_Amount"].mean().round())

In [30]:
# Final check: every column should now report zero missing values.
df.isna().sum()

Patient_ID        0
Age               0
Gender            0
Admission_Date    0
Symptoms          0
Diagnosis         0
Bill_Amount       0
dtype: int64

In [31]:
# Preview the first rows after bill imputation rather than dumping the whole frame.
df.head(10)

Unnamed: 0,Patient_ID,Age,Gender,Admission_Date,Symptoms,Diagnosis,Bill_Amount
0,5001,55.0,Female,2024-01-01,Cough,Malaria,7731.0
1,5002,55.0,Female,2024-01-03,Fever,Flu,7731.0
2,5003,55.0,Female,2024-01-05,Fatigue,COVID-19,9000.0
3,5004,45.0,Male,2024-01-07,Nausea,Typhoid,7500.0
4,5005,55.0,Male,2024-01-09,Nausea,Flu,9000.0
5,5006,55.0,Male,2024-01-11,Fever,Typhoid,5000.0
6,5007,41.0,Male,2024-01-13,Nausea,COVID-19,11000.0
7,5008,55.0,Female,2024-01-15,Fever,Malaria,11000.0
8,5009,45.0,Female,2024-01-17,Nausea,Malaria,9000.0
9,5010,41.0,Male,2024-01-19,Fatigue,Flu,5000.0


In [33]:
# Age has no missing values left, so it can be stored as whole numbers.
df = df.astype({"Age": int})

In [34]:
# Preview the first rows after the integer cast rather than dumping the whole frame.
df.head(10)

Unnamed: 0,Patient_ID,Age,Gender,Admission_Date,Symptoms,Diagnosis,Bill_Amount
0,5001,55,Female,2024-01-01,Cough,Malaria,7731.0
1,5002,55,Female,2024-01-03,Fever,Flu,7731.0
2,5003,55,Female,2024-01-05,Fatigue,COVID-19,9000.0
3,5004,45,Male,2024-01-07,Nausea,Typhoid,7500.0
4,5005,55,Male,2024-01-09,Nausea,Flu,9000.0
5,5006,55,Male,2024-01-11,Fever,Typhoid,5000.0
6,5007,41,Male,2024-01-13,Nausea,COVID-19,11000.0
7,5008,55,Female,2024-01-15,Fever,Malaria,11000.0
8,5009,45,Female,2024-01-17,Nausea,Malaria,9000.0
9,5010,41,Male,2024-01-19,Fatigue,Flu,5000.0


In [35]:
# Persist the cleaned records; the row index is omitted from the file.
df.to_csv(path_or_buf="cleaned_hospital_data.csv", index=False)