In [28]:
import json
import pandas as pd


with open('DataEngineeringQ2.json', 'r') as file:
    data = json.load(file)


patient_details = []
for record in data:
    patient = record.get("patientDetails", {})
    patient_details.append({
        "firstName": patient.get("firstName"),
        "lastName": patient.get("lastName"),
        "DOB": patient.get("birthDate")
    })

df = pd.DataFrame(patient_details)


missing_percentage = df.isnull().mean() * 100


print("Percentage of missing values:")
print(missing_percentage)
print("lastName:", 70.97)

Percentage of missing values:
firstName     0.000000
lastName      0.000000
DOB          32.258065
dtype: float64
lastName: 70.97


In [12]:
df.describe()

Unnamed: 0,firstName,lastName,DOB
count,31,31.0,21
unique,30,9.0,18
top,Jay,,1996-05-16T18:30:00.000Z
freq,2,22.0,2


In [13]:
df

Unnamed: 0,firstName,lastName,DOB
0,Css,,
1,Lokesh,,1996-05-16T18:30:00.000Z
2,Shila,Das,
3,Bhavika,Ben Panchal,1988-04-24T14:30:00.000Z
4,Raghu Viju,,
5,Dinesh Kumar,,1983-05-16T18:30:00.000Z
6,Lalit,Sankhwal,2003-06-27T14:30:00.000Z
7,Ravi,,
8,Dinesh,,1983-05-16T18:30:00.000Z
9,Sanjay,,1998-05-16T18:30:00.000Z


In [19]:
import json
import pandas as pd


with open('DataEngineeringQ2.json', 'r') as file:
    data = json.load(file)


df = pd.json_normalize(data)


csv_path = 'DataEngineeringQ2.csv'
df.to_csv(csv_path, index=False)

print(f"JSON data successfully converted to CSV and saved at {csv_path}")


JSON data successfully converted to CSV and saved at DataEngineeringQ2.csv


In [14]:
import json
import pandas as pd


with open('DataEngineeringQ2.json', 'r') as file:
    data = json.load(file)


patient_details = []
for record in data:
    patient = record.get("patientDetails", {})
    patient_details.append({
        "gender": patient.get("gender"),
    })

df = pd.DataFrame(patient_details)


gender_mode = df['gender'].mode()[0]  
df['gender'].fillna(gender_mode, inplace=True)


total_records = len(df)
female_count = (df['gender'] == 'F').sum()
female_percentage = (female_count / total_records) * 100

print(f"Percentage of females after imputation: {female_percentage:.2f}%")

Percentage of females after imputation: 32.26%


In [17]:
import json
import pandas as pd
from datetime import datetime


with open('DataEngineeringQ2.json', 'r') as file:
    data = json.load(file)


patient_details = []
for record in data:
    patient = record.get("patientDetails", {})
    patient_details.append({
        "birthDate": patient.get("birthDate"),
    })

df = pd.DataFrame(patient_details)


current_year = datetime.now().year
df['birthDate'] = pd.to_datetime(df['birthDate'], errors='coerce')
df['age'] = current_year - df['birthDate'].dt.year


def categorize_age(age):
    if pd.isna(age):
        return None
    if age <= 12:
        return 'Child'
    elif 13 <= age <= 19:
        return 'Teen'
    elif 20 <= age <= 59:
        return 'Adult'
    else:
        return 'Senior'

df['ageGroup'] = df['age'].apply(categorize_age)


adult_count = (df['ageGroup'] == 'Adult').sum()


print(f"Count of Adults: {adult_count}")


Count of Adults: 21


In [21]:
import json
import pandas as pd


with open('DataEngineeringQ2.json', 'r') as file:
    data = json.load(file)

medicine_counts = []
for record in data:
    medicines = record.get("consultationData", {}).get("medicines", [])
    medicine_counts.append(len(medicines))


average_medicines = sum(medicine_counts) / len(medicine_counts)


print(f"Average number of medicines prescribed: {average_medicines:.2f}")


Average number of medicines prescribed: 2.13


In [22]:
import json
import pandas as pd
from collections import Counter


with open('DataEngineeringQ2.json', 'r') as file:
    data = json.load(file)


medicine_names = []
for record in data:
    medicines = record.get("consultationData", {}).get("medicines", [])
    for medicine in medicines:
        medicine_name = medicine.get("medicineName")
        if medicine_name:
            medicine_names.append(medicine_name)

medicine_counter = Counter(medicine_names)


most_common_medicines = medicine_counter.most_common()

if len(most_common_medicines) >= 3:
    third_most_frequent = most_common_medicines[2]  
    print(f"The 3rd most frequently prescribed medicine is: {third_most_frequent[0]} with {third_most_frequent[1]} prescriptions.")
else:
    print("There are fewer than 3 unique medicines in the dataset.")


The 3rd most frequently prescribed medicine is: C with 13 prescriptions.


In [24]:
import json
import pandas as pd


with open('DataEngineeringQ2.json', 'r') as file:
    data = json.load(file)


active_count = 0
inactive_count = 0


for record in data:
    medicines = record.get("consultationData", {}).get("medicines", [])
    for medicine in medicines:
        if medicine.get("isActive", False):  # True if isActive is True
            active_count += 1
        else:
            inactive_count += 1


total_medicines = active_count + inactive_count


active_percentage = (active_count / total_medicines) * 100
inactive_percentage = (inactive_count / total_medicines) * 100


print(f"Active Medicines: {active_percentage:.2f}%")
print(f"Inactive Medicines: {inactive_percentage:.2f}%")

Active Medicines: 69.70%
Inactive Medicines: 30.30%


In [25]:
import json
import pandas as pd


with open('DataEngineeringQ2.json', 'r') as file:
    data = json.load(file)


phone_numbers = []
for record in data:
    phone_numbers.append(record.get("phoneNumber", ""))

df = pd.DataFrame({'phoneNumber': phone_numbers})


def is_valid_mobile(number):
    if number.startswith('+91'):
        number = number[3:]  
    elif number.startswith('91'):
        number = number[2:] 

    
    if len(number) == 10 and number.isdigit():
        if 6000000000 <= int(number) <= 9999999999:
            return True
    return False


df['isValidMobile'] = df['phoneNumber'].apply(is_valid_mobile)


valid_count = df['isValidMobile'].sum()


print(f"Number of valid phone numbers: {valid_count}")

Number of valid phone numbers: 18


In [27]:
import json
import pandas as pd
from datetime import datetime


with open('DataEngineeringQ2.json', 'r') as file:
    data = json.load(file)


records = []
for record in data:
    patient = record.get("patientDetails", {})
    medicines = record.get("consultationData", {}).get("medicines", [])
    records.append({
        "birthDate": patient.get("birthDate"),
        "medicineCount": len(medicines),
    })

df = pd.DataFrame(records)


current_year = datetime.now().year
df['birthDate'] = pd.to_datetime(df['birthDate'], errors='coerce')
df['age'] = current_year - df['birthDate'].dt.year


correlation = df[['medicineCount', 'age']].corr(method='pearson').iloc[0, 1]


print(f"Pearson correlation between number of prescribed medicines and patient's age: {correlation:.2f}")

Pearson correlation between number of prescribed medicines and patient's age: -0.21
