In [28]:
import json
import pandas as pd

# Load the raw records from disk.
file_path = 'Data.json'
with open(file_path, 'r') as file:
    data = json.load(file)

# Keep only the three identity fields from each record's "patientDetails"
# object; a record without that key yields a row of None values.
fields = ("firstName", "lastName", "birthDate")
patient_details = [
    {field: record.get("patientDetails", {}).get(field) for field in fields}
    for record in data
]

# Empty strings are counted as missing, exactly like None.
df = pd.DataFrame(patient_details).replace("", pd.NA)

# Fraction of missing cells per column, expressed in percent.
missing_percentage = df.isna().mean() * 100

# Report the three percentages in field order, two decimals each.
result = ", ".join(f"{missing_percentage[field]:.2f}" for field in fields)
print(result)

Percentage of missing values:
firstName     0.000000
lastName      0.000000
DOB          32.258065
dtype: float64
lastName: 70.97


In [12]:
# Summary statistics for `df`. This cell does not build `df` itself; it relies on a
# frame left in the kernel by another cell.
# NOTE(review): the recorded output lists a `DOB` column that no visible cell creates —
# confirm which cell produced this `df` before trusting the numbers.
df.describe()

Unnamed: 0,firstName,lastName,DOB
count,31,31.0,21
unique,30,9.0,18
top,Jay,,1996-05-16T18:30:00.000Z
freq,2,22.0,2


In [13]:
# Display `df` as the cell output. `df` is not defined in this cell; it comes from
# kernel state set by another cell.
# NOTE(review): the recorded output shows patient names and birth dates — check
# whether this is real personal data before sharing the notebook.
df

Unnamed: 0,firstName,lastName,DOB
0,Css,,
1,Lokesh,,1996-05-16T18:30:00.000Z
2,Shila,Das,
3,Bhavika,Ben Panchal,1988-04-24T14:30:00.000Z
4,Raghu Viju,,
5,Dinesh Kumar,,1983-05-16T18:30:00.000Z
6,Lalit,Sankhwal,2003-06-27T14:30:00.000Z
7,Ravi,,
8,Dinesh,,1983-05-16T18:30:00.000Z
9,Sanjay,,1998-05-16T18:30:00.000Z


In [19]:
import json
import pandas as pd

# Read the JSON records from disk.
with open('DataEngineeringQ2.json', 'r') as json_file:
    json_data = json.load(json_file)

# Turn the records into a table and write it out as CSV without the row index.
csv_output_path = 'DataEngineeringQ2.csv'
data_frame = pd.json_normalize(json_data)
data_frame.to_csv(csv_output_path, index=False)

print(f"JSON data has been successfully converted to CSV and stored at {csv_output_path}")


JSON data successfully converted to CSV and saved at DataEngineeringQ2.csv


In [14]:
import json
import pandas as pd

with open('DataEngineeringQ2.json', 'r') as json_file:
    records = json.load(json_file)

# One row per record holding the patient's gender (None when absent).
# `or {}` also covers records whose "patientDetails" is explicitly null.
gender_data = [
    {"sex": (entry.get("patientDetails") or {}).get("gender")}
    for entry in records
]

# Naming the column up front keeps 'sex' present even for an empty dataset.
data_frame = pd.DataFrame(gender_data, columns=["sex"])

# Impute missing genders with the most frequent value. mode() is empty when no
# gender is present at all, so guard before indexing it.
# NOTE(review): empty-string genders are NOT treated as missing here, unlike the
# missing-value cell that replaces "" with pd.NA — confirm which is intended.
gender_modes = data_frame['sex'].mode()
most_common_gender = gender_modes.iloc[0] if not gender_modes.empty else None
if most_common_gender is not None:
    # Assign the result back instead of calling fillna(inplace=True) on the column
    # selection: the in-place form is deprecated and does not update the frame
    # when pandas Copy-on-Write is enabled.
    data_frame['sex'] = data_frame['sex'].fillna(most_common_gender)

total_entries = len(data_frame)
female_count = (data_frame['sex'] == 'F').sum()
# A share of zero records is undefined; report NaN rather than dividing by zero.
female_percentage = (female_count / total_entries) * 100 if total_entries else float("nan")

print(f"Percentage of females after imputation: {female_percentage:.2f}%")


Percentage of females after imputation: 32.26%


In [17]:
import json
import pandas as pd
from datetime import datetime

with open('DataEngineeringQ2.json', 'r') as json_file:
    json_data = json.load(json_file)

# Collect each patient's raw birth date (None when the record has none).
patient_info = [
    {"birthDate": entry.get("patientDetails", {}).get("birthDate")}
    for entry in json_data
]
data_frame = pd.DataFrame(patient_info)

# Values that cannot be parsed as dates are coerced to NaT.
data_frame['birthDate'] = pd.to_datetime(data_frame['birthDate'], errors='coerce')

# Age is the plain difference of calendar years (month and day are ignored),
# measured against the year in which the cell is run.
current_year = datetime.now().year
birth_year = data_frame['birthDate'].dt.year
data_frame['age'] = current_year - birth_year

def determine_age_group(age):
    """Map an age in years to an age-group label.

    Args:
        age: Age in years (int or float). Missing values (None/NaN/pd.NA) are allowed.

    Returns:
        'Child' for ages below 13, 'Teen' for 13 up to (not including) 20,
        'Adult' for 20 up to (not including) 60, 'Senior' for 60 and above,
        or None when the age is missing.
    """
    if pd.isna(age):
        return None
    # Half-open upper bounds leave no gaps between groups, so fractional ages
    # such as 12.5 or 59.5 are not misfiled as 'Senior'. Whole-number ages are
    # classified exactly as before.
    if age < 13:
        return 'Child'
    if age < 20:
        return 'Teen'
    if age < 60:
        return 'Adult'
    return 'Senior'

# Label every row with its age group; rows with a missing age get None.
age_groups = data_frame['age'].apply(determine_age_group)
data_frame['ageGroup'] = age_groups

# Count the rows labelled 'Adult'.
adult_count = data_frame['ageGroup'].eq('Adult').sum()

print(f"Number of adults: {adult_count}")


Count of Adults: 21


In [21]:
import json
import pandas as pd

with open('DataEngineeringQ2.json', 'r') as json_file:
    records = json.load(json_file)

# Number of medicines per consultation. The `or` fallbacks treat an explicit
# null "consultationData" / "medicines" the same as a missing key (0 medicines)
# instead of raising.
medication_count_list = [
    len((record.get("consultationData") or {}).get("medicines") or [])
    for record in records
]

# The mean of zero consultations is undefined; report NaN instead of raising
# ZeroDivisionError on an empty dataset.
if medication_count_list:
    average_medications = sum(medication_count_list) / len(medication_count_list)
else:
    average_medications = float("nan")

print(f"Average number of medications prescribed: {average_medications:.2f}")


Average number of medicines prescribed: 2.13


In [22]:
import json
import pandas as pd
from collections import Counter

with open('DataEngineeringQ2.json', 'r') as json_file:
    json_data = json.load(json_file)

# Gather every prescribed medicine name across all consultations, skipping
# entries whose name is missing or empty.
medication_list = []
for entry in json_data:
    prescribed = entry.get("consultationData", {}).get("medicines", [])
    names = (item.get("medicineName") for item in prescribed)
    medication_list.extend(name for name in names if name)

# Tally the names and rank them from most to least frequent.
medicine_frequency = Counter(medication_list)
top_medicines = medicine_frequency.most_common()

# The third-ranked entry only exists when at least three distinct names were seen.
if len(top_medicines) < 3:
    print("There are less than 3 unique medicines in the dataset.")
else:
    third_most_common = top_medicines[2]
    print(f"The 3rd most common medicine prescribed is: {third_most_common[0]} with {third_most_common[1]} occurrences.")


The 3rd most frequently prescribed medicine is: C with 13 prescriptions.


In [24]:
import json
import pandas as pd


with open('DataEngineeringQ2.json', 'r') as file:
    data = json.load(file)

# Tally prescribed medicines by their "isActive" flag.
active_count = 0
inactive_count = 0

for record in data:
    # `or` fallbacks: a null "consultationData" / "medicines" counts as no
    # medicines instead of raising.
    medicines = (record.get("consultationData") or {}).get("medicines") or []
    for medicine in medicines:
        # A missing or falsy "isActive" is counted as inactive.
        if medicine.get("isActive", False):
            active_count += 1
        else:
            inactive_count += 1

total_medicines = active_count + inactive_count

# With no medicines at all the shares are undefined; report NaN instead of
# raising ZeroDivisionError.
if total_medicines:
    active_percentage = (active_count / total_medicines) * 100
    inactive_percentage = (inactive_count / total_medicines) * 100
else:
    active_percentage = inactive_percentage = float("nan")

print(f"Active Medicines: {active_percentage:.2f}%")
print(f"Inactive Medicines: {inactive_percentage:.2f}%")

Active Medicines: 69.70%
Inactive Medicines: 30.30%


In [25]:
import json
import pandas as pd


with open('DataEngineeringQ2.json', 'r') as file:
    data = json.load(file)

# One phone number per record; a record without the key contributes "".
phone_numbers = [record.get("phoneNumber", "") for record in data]

df = pd.DataFrame({'phoneNumber': phone_numbers})


def is_valid_mobile(number):
    """Return True when `number` is a valid Indian mobile number.

    A number is valid when, after removing an optional '+91' or '91' country
    prefix, exactly 10 ASCII digits remain and their value lies between
    6000000000 and 9999999999 inclusive.

    Args:
        number: The phone number as a string. Any non-string value (for example
            None or NaN from a null field) is reported as invalid.

    Returns:
        bool: True for a valid number, False otherwise.
    """
    if not isinstance(number, str):
        return False

    # Treat '+91' / '91' as a country code only when 10 characters follow it.
    # Stripping unconditionally would cut a bare 10-digit number that itself
    # starts with 91 (e.g. '9123456789') down to 8 digits and reject it.
    if number.startswith('+91') and len(number) == 13:
        number = number[3:]
    elif number.startswith('91') and len(number) == 12:
        number = number[2:]

    # Check for ASCII digits explicitly: str.isdigit() also accepts characters
    # such as '²', which int() would then reject with ValueError.
    if len(number) != 10 or any(ch not in '0123456789' for ch in number):
        return False
    return 6000000000 <= int(number) <= 9999999999


# Run the validator over every phone number and store the boolean result.
validity_flags = df['phoneNumber'].apply(is_valid_mobile)
df['isValidMobile'] = validity_flags

# Summing the boolean column counts the rows flagged True.
valid_count = df['isValidMobile'].sum()

print(f"Number of valid phone numbers: {valid_count}")

Number of valid phone numbers: 18


In [27]:
import json
import pandas as pd
from datetime import datetime


with open('DataEngineeringQ2.json', 'r') as file:
    data = json.load(file)

# Per consultation: the patient's raw birth date and the number of medicines
# prescribed (0 when the record lists none).
records = [
    {
        "birthDate": record.get("patientDetails", {}).get("birthDate"),
        "medicineCount": len(record.get("consultationData", {}).get("medicines", [])),
    }
    for record in data
]
df = pd.DataFrame(records)

# Unparseable or missing dates become NaT; age is the plain calendar-year
# difference against the year in which the cell is run.
df['birthDate'] = pd.to_datetime(df['birthDate'], errors='coerce')
current_year = datetime.now().year
df['age'] = current_year - df['birthDate'].dt.year

# Pearson correlation between the two columns, read from the 2x2 matrix by label.
correlation_matrix = df[['medicineCount', 'age']].corr(method='pearson')
correlation = correlation_matrix.loc['medicineCount', 'age']

print(f"Pearson correlation between number of prescribed medicines and patient's age: {correlation:.2f}")

Pearson correlation between number of prescribed medicines and patient's age: -0.21
