In [1]:
import json
import pandas as pd

# Load the JSON data
file_path = '/content/DataEngineeringQ2.json'
with open(file_path, 'r') as file:
    data = json.load(file)

# Extract patientDetails
patient_details = [entry['patientDetails'] for entry in data]

data


[{'_id': 'T6hf3rb5',
  'appointmentId': '40d2-9c9f',
  'patientDetails': {'_id': 'T6hb630b3',
   'firstName': 'Css',
   'lastName': '',
   'emailId': '',
   'gender': '',
   'alternateContact': '',
   'birthDate': None},
  'phoneNumber': '96686896670',
  'consultationData': {'adviceTemplates': [],
   'advices': [],
   'attachments': [],
   'chiefComplaints': [],
   'customOne': [],
   'customThree': [],
   'customTwo': [],
   'disease': [],
   'doctorNotes': '',
   'emergencyInstructions': [],
   'emergencyInstructionsTemplate': [],
   'emrTemplates': [],
   'examinationNote': [],
   'findings': [],
   'investigationInstructions': [],
   'investigationTemplates': [],
   'investigations': [],
   'isBalicAppointment': False,
   'isQuickPrescription': False,
   'labTest': [],
   'languageCode': 'en',
   'medicineTemplates': [],
   'medicines': [{'medicineId': '619404',
     'medicineName': 'A',
     'medicineNameStrengthType': '',
     'frequency': '1-0-1',
     'duration': '90',
     'du

In [2]:
# Convert to DataFrame
df = pd.json_normalize(patient_details)
df

Unnamed: 0,_id,firstName,lastName,emailId,gender,alternateContact,birthDate,phrId,customId,chat._id,chat.channelId
0,T6hb630b3,Css,,,,,,,,,
1,T6h33b300,Lokesh,,,M,,1996-05-16T18:30:00.000Z,,,,
2,TjhB4373,Shila,Das,,,,,63b5hvy614d5,,,
3,6df4R5b,Bhavika,Ben Panchal,,F,,1988-04-24T14:30:00.000Z,644nig7y,,,
4,lK9hy06a,Raghu Viju,,,,,,,,,
5,kI9d5c6,Dinesh Kumar,,,M,,1983-05-16T18:30:00.000Z,,3FPEUCW8HACJ,09hUb5c7,
6,i9R324b7,Lalit,Sankhwal,,M,,2003-06-27T14:30:00.000Z,67Yt2b97,,,
7,T6h41b5ad3d8002ad9c3c4,Ravi,,,,,,,8AV4ICI5I389,T6h41b5ad3d8002ad9c3c5,
8,T9Iy347e,Dinesh,,,M,,1983-05-16T18:30:00.000Z,,0308MSAHYR8K,T9IsT37f,
9,T4Rb1a2,Sanjay,,,M,,1998-05-16T18:30:00.000Z,,8S7A3D4K4WZH,7Yd9b1a3,


In [3]:
missing_percentages = df[['firstName', 'lastName', 'birthDate']].isna().mean() * 100

missing_percentages += (df[['firstName', 'lastName', 'birthDate']] == '').mean() * 100

missing_percentages = missing_percentages.round(2)

missing_percentages

firstName     0.00
lastName     70.97
birthDate    32.26
dtype: float64

In [4]:
# Calculate the mode of the gender column
gender_mode = df['gender'].mode()[0]

df['gender'].fillna(gender_mode, inplace=True)
df['gender'].replace('', gender_mode, inplace=True)

# Calculate the percentage of female gender
female_percentage = (df['gender'] == 'F').mean() * 100

female_percentage = round(female_percentage, 2)

female_percentage

32.26

In [15]:
from datetime import datetime

def calculate_age(birthdate):
    if pd.isna(birthdate) or birthdate == '':
        return None
    birthdate = datetime.strptime(birthdate, '%Y-%m-%dT%H:%M:%S.%fZ')
    today = datetime.today()
    return today.year - birthdate.year - ((today.month, today.day) < (birthdate.month, birthdate.day))

df['age'] = df['birthDate'].apply(calculate_age)

df['age']

0      NaN
1     28.0
2      NaN
3     36.0
4      NaN
5     41.0
6     21.0
7      NaN
8     41.0
9     26.0
10    36.0
11    53.0
12    54.0
13    26.0
14     NaN
15    27.0
16    52.0
17    23.0
18     NaN
19     NaN
20    22.0
21    25.0
22    28.0
23     NaN
24    32.0
25     NaN
26     NaN
27    25.0
28    52.0
29    44.0
30    32.0
Name: age, dtype: float64

In [6]:
def age_group(age):
    if age is None:
        return None
    if age <= 12:
        return 'Child'
    elif age <= 19:
        return 'Teen'
    elif age <= 59:
        return 'Adult'
    else:
        return 'Senior'

df['ageGroup'] = df['age'].apply(age_group)

adult_count = (df['ageGroup'] == 'Adult').sum()

print(adult_count)

21


In [9]:
# Extract the number of medicines prescribed for each entry
medicines_count = [len(entry['consultationData']['medicines']) for entry in data]

average_medicines = sum(medicines_count) / len(medicines_count)
average_medicines = round(average_medicines, 2)

average_medicines

2.13

In [13]:
from collections import Counter

# Extract all prescribed medicines
all_medicines = []
for entry in data:
    for medicine in entry['consultationData']['medicines']:
        all_medicines.append(medicine['medicineName'])


all_medicines


['A',
 'B',
 'A',
 'C',
 'B',
 'A',
 'C',
 'B',
 'D',
 'A',
 'A',
 'D',
 'E',
 'C',
 'D',
 'E',
 'A',
 'E',
 'E',
 'E',
 'A',
 'D',
 'A',
 'B',
 'D',
 'B',
 'D',
 'C',
 'C',
 'A',
 'A',
 'C',
 'B',
 'A',
 'B',
 'C',
 'D',
 'D',
 'D',
 'D',
 'B',
 'A',
 'G',
 'B',
 'D',
 'A',
 'B',
 'C',
 'D',
 'A',
 'E',
 'C',
 'C',
 'D',
 'A',
 'C',
 'C',
 'D',
 'A',
 'B',
 'C',
 'D',
 'A',
 'D',
 'D',
 'B']

In [14]:
medicine_counts = Counter(all_medicines)

# Find the 3rd most frequent medicine
third_most_common = medicine_counts.most_common(3)[2][0]

print(third_most_common)


C


In [16]:

# Extract all prescribed medicines with their status
all_medicines_with_status = []
for entry in data:
    for medicine in entry['consultationData']['medicines']:
        all_medicines_with_status.append((medicine['medicineName'], medicine['isActive']))

all_medicines_with_status

[('A', True),
 ('B', False),
 ('A', True),
 ('C', False),
 ('B', True),
 ('A', False),
 ('C', True),
 ('B', True),
 ('D', True),
 ('A', True),
 ('A', True),
 ('D', False),
 ('E', True),
 ('C', False),
 ('D', True),
 ('E', True),
 ('A', True),
 ('E', True),
 ('E', False),
 ('E', True),
 ('A', True),
 ('D', False),
 ('A', True),
 ('B', True),
 ('D', False),
 ('B', True),
 ('D', True),
 ('C', True),
 ('C', True),
 ('A', True),
 ('A', True),
 ('C', False),
 ('B', True),
 ('A', False),
 ('B', True),
 ('C', True),
 ('D', True),
 ('D', False),
 ('D', False),
 ('D', False),
 ('B', True),
 ('A', True),
 ('G', False),
 ('B', True),
 ('D', True),
 ('A', True),
 ('B', True),
 ('C', True),
 ('D', True),
 ('A', True),
 ('E', True),
 ('C', False),
 ('C', False),
 ('D', False),
 ('A', False),
 ('C', True),
 ('C', False),
 ('D', True),
 ('A', True),
 ('B', True),
 ('C', True),
 ('D', True),
 ('A', True),
 ('D', True),
 ('D', False),
 ('B', True)]

In [17]:
status_counts = Counter(status for _, status in all_medicines_with_status)

# Calculate percentages
total_medicines = len(all_medicines_with_status)
active_percentage = (status_counts[True] / total_medicines) * 100
inactive_percentage = (status_counts[False] / total_medicines) * 100

print("Percentage of active medicines: {:.2f}%".format(active_percentage))
print("Percentage of inactive medicines: {:.2f}%".format(inactive_percentage))


Percentage of active medicines: 69.70%
Percentage of inactive medicines: 30.30%


In [23]:
all_phone_numbers = []
for entry in data:
    for phone in entry['patientDetails']['_id']:
        all_phone_numbers.append(phone['phoneNumber'])

all_phone_numbers

TypeError: string indices must be integers

[]