In [1]:
import json
import pandas as pd

def _is_nonempty_dict_section(entry, section):
    """Return True when ``entry[section]`` exists and is a non-empty dict."""
    section_value = entry.get(section)
    return isinstance(section_value, dict) and len(section_value) > 0


def _has_active_ingredient(entry):
    """Return True when "Active Ingredients" is a dict with a non-empty string value.

    Anything that is not a dict (missing key, list, string, null) counts as
    invalid instead of raising, and non-string values inside the dict
    (null, NaN, numbers, nested lists) are ignored.
    """
    ingredients = entry.get("Active Ingredients")
    if not isinstance(ingredients, dict):
        return False
    # Only non-empty strings qualify, so no separate missing-value test is
    # needed: None/NaN already fail the isinstance check.
    return any(isinstance(v, str) and v != "" for v in ingredients.values())


def analyze_json_file(filepath):
    """Summarise which top-level entries of a JSON file are fully populated.

    The file must contain a JSON object whose values are themselves objects.
    An entry is counted as valid for
      * "Active Ingredients" when that section is a dict holding at least one
        non-empty string value,
      * "Disease Information" when that section is a non-empty dict,
      * "Additional Information" when that section is a non-empty dict.
    Top-level values that are not dicts are skipped entirely.

    The per-criterion counts are printed to stdout.

    Args:
        filepath: Path of the JSON file to analyse.

    Returns:
        A list of the top-level keys (in file order) that satisfy all three
        criteria, or ``None`` (after printing a message) when the top level
        of the file is not a JSON object.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8 by specification; do not rely on the locale default.
    with open(filepath, 'r', encoding='utf-8') as file:
        data = json.load(file)

    if not isinstance(data, dict):
        print("The top level of the JSON file is not a dictionary.")
        return

    level_1_keys_count = len(data)
    print("Number of keys in level 1 dictionary:", level_1_keys_count)

    # Initialize counters and the list for keys meeting all criteria
    counts = {
        "active_ingredients": 0,
        "disease_information": 0,
        "additional_information": 0,
        "all_criteria": 0
    }
    all_criteria_keys = []

    for key, value in data.items():
        if not isinstance(value, dict):
            continue

        ai_valid = _has_active_ingredient(value)
        di_valid = _is_nonempty_dict_section(value, "Disease Information")
        ai_info_valid = _is_nonempty_dict_section(value, "Additional Information")

        # Update counters (each flag is a bool, counted as 0 or 1)
        counts["active_ingredients"] += int(ai_valid)
        counts["disease_information"] += int(di_valid)
        counts["additional_information"] += int(ai_info_valid)

        # Check if all criteria are met
        if ai_valid and di_valid and ai_info_valid:
            counts["all_criteria"] += 1
            all_criteria_keys.append(key)

    # Print results
    print("Number of keys with valid 'Active Ingredients':", counts["active_ingredients"])
    print("Number of keys with valid 'Disease Information':", counts["disease_information"])
    print("Number of keys with valid 'Additional Information':", counts["additional_information"])
    print("Number of keys satisfying all three criteria:", counts["all_criteria"])

    return all_criteria_keys

In [2]:
# Run the analysis on the scraped drug file. The call prints its summary
# counts and returns the keys that met all three criteria.
good_keys = analyze_json_file('scraper/data/drug_info.json')
# Separator between the printed summary and the displayed list below.
print('---')
# Bare last expression: the notebook renders the full list as the cell output.
good_keys

Number of keys in level 1 dictionary: 7878
Number of keys with valid 'Active Ingredients': 7283
Number of keys with valid 'Disease Information': 4212
Number of keys with valid 'Additional Information': 4283
Number of keys satisfying all three criteria: 3763
---


['A-HYDROCORT',
 'A-METHAPRED',
 'ABACAVIR SULFATE',
 'ABACAVIR SULFATE; LAMIVUDINE',
 'ABACAVIR; DOLUTEGRAVIR; LAMIVUDINE',
 'ABACAVIR; LAMIVUDINE',
 'ABILIFY',
 'ABILIFY ASIMTUFII',
 'ABIRATERONE ACETATE',
 'ABITREXATE',
 'ABLAVAR',
 'ABRAXANE',
 'ABREVA',
 'ABSORICA',
 'ABSORICA LD',
 'ABSTRAL',
 'ACALABRUTINIB',
 'ACAMPROSATE CALCIUM',
 'ACARBOSE',
 'ACCOLATE',
 'ACCRUFER',
 'ACCUNEB',
 'ACCUPRIL',
 'ACCUTANE',
 'ACEBUTOLOL HYDROCHLORIDE',
 'ACEON',
 'ACEPHEN',
 'ACETADOTE',
 'ACETAMINOPHEN',
 'ACETAMINOPHEN; OXYCODONE HYDROCHLORIDE',
 'ACETASOL',
 'ACETAZOLAMIDE',
 'ACETAZOLAMIDE SODIUM',
 'ACETIC ACID',
 'ACETYLCYSTEINE',
 'ACHROMYCIN',
 'ACHROMYCIN V',
 'ACIPHEX',
 'ACIPHEX SPRINKLE',
 'ACITRETIN',
 'ACLOVATE',
 'ACTH',
 'ACTHAR',
 'ACTHAR GEL',
 'ACTHREL',
 'ACTICLATE',
 'ACTICLATE CAP',
 'ACTICORT',
 'ACTIGALL',
 'ACTIQ',
 'ACTISITE',
 'ACTONEL',
 'ACTOS',
 'ACULAR',
 'ACULAR LS',
 'ACUVAIL',
 'ACYCLOVIR',
 'ACYCLOVIR SODIUM',
 'ACZONE',
 'ADALAT',
 'ADALAT CC',
 'ADAPALENE',


In [3]:
# Print the first 1500 qualifying keys (indices 0-1499) as one list repr.
print(good_keys[:1500])

['A-HYDROCORT', 'A-METHAPRED', 'ABACAVIR SULFATE', 'ABACAVIR SULFATE; LAMIVUDINE', 'ABACAVIR; DOLUTEGRAVIR; LAMIVUDINE', 'ABACAVIR; LAMIVUDINE', 'ABILIFY', 'ABILIFY ASIMTUFII', 'ABIRATERONE ACETATE', 'ABITREXATE', 'ABLAVAR', 'ABRAXANE', 'ABREVA', 'ABSORICA', 'ABSORICA LD', 'ABSTRAL', 'ACALABRUTINIB', 'ACAMPROSATE CALCIUM', 'ACARBOSE', 'ACCOLATE', 'ACCRUFER', 'ACCUNEB', 'ACCUPRIL', 'ACCUTANE', 'ACEBUTOLOL HYDROCHLORIDE', 'ACEON', 'ACEPHEN', 'ACETADOTE', 'ACETAMINOPHEN', 'ACETAMINOPHEN; OXYCODONE HYDROCHLORIDE', 'ACETASOL', 'ACETAZOLAMIDE', 'ACETAZOLAMIDE SODIUM', 'ACETIC ACID', 'ACETYLCYSTEINE', 'ACHROMYCIN', 'ACHROMYCIN V', 'ACIPHEX', 'ACIPHEX SPRINKLE', 'ACITRETIN', 'ACLOVATE', 'ACTH', 'ACTHAR', 'ACTHAR GEL', 'ACTHREL', 'ACTICLATE', 'ACTICLATE CAP', 'ACTICORT', 'ACTIGALL', 'ACTIQ', 'ACTISITE', 'ACTONEL', 'ACTOS', 'ACULAR', 'ACULAR LS', 'ACUVAIL', 'ACYCLOVIR', 'ACYCLOVIR SODIUM', 'ACZONE', 'ADALAT', 'ADALAT CC', 'ADAPALENE', 'ADASUVE', 'ADCIRCA', 'ADDYI', 'ADEFOVIR DIPIVOXIL', 'ADEMPAS

In [6]:
# Print the next slice of qualifying keys (indices 1500-2999).
print(good_keys[1500:3000])

['HALAVEN', 'HALCINONIDE', 'HALDOL', 'HALOBETASOL PROPIONATE', 'HALOBETASOL PROPIONATE; TAZAROTENE', 'HALOG', 'HALOG-E', 'HALOPERIDOL', 'HALOPERIDOL DECANOATE', 'HALOPERIDOL LACTATE', 'HALOTESTIN', 'HALOTHANE', 'HEATHER', 'HECTOROL', 'HELICOSOL', 'HEMABATE', 'HEMADY', 'HEMANGEOL', 'HEPARIN LOCK FLUSH', 'HEPARIN SODIUM', 'HEPSERA', 'HER STYLE', 'HERPLEX', 'HETLIOZ', 'HETLIOZ LQ', 'HETRAZAN', 'HEXA-BETALIN', 'HEXA-GERM', 'HEXASCRUB', 'HIBICLENS', 'HISERPIA', 'HISPRIL', 'HISTAMINE PHOSPHATE', 'HMS', 'HORIZANT', 'HUMATIN', 'HUMULIN R KWIKPEN', 'HYCAMTIN', 'HYDASE', 'HYDELTRA-TBA', 'HYDELTRASOL', 'HYDERGINE', 'HYDERGINE LC', 'HYDRALAZINE HYDROCHLORIDE', 'HYDRAMINE', 'HYDREA', 'HYDROCHLOROTHIAZIDE', 'HYDROCHLOROTHIAZIDE; IRBESARTAN', 'HYDROCHLOROTHIAZIDE; VALSARTAN', 'HYDROCODONE', 'HYDROCORTISONE', 'HYDROCORTISONE ACETATE', 'HYDROCORTISONE BUTYRATE', 'HYDROCORTISONE SODIUM PHOSPHATE', 'HYDROCORTISONE SODIUM SUCCINATE', 'HYDROCORTISONE VALERATE', 'HYDRODIURIL', 'HYDROFLUMETHIAZIDE', 'HYDROGE

In [7]:
# Print the remaining qualifying keys, from index 3000 to the end.
print(good_keys[3000:])

['SEROMYCIN', 'SEROPHENE', 'SEROQUEL', 'SEROQUEL XR', 'SERPANRAY', 'SERTRALINE HYDROCHLORIDE', 'SERZONE', 'SESQUIENT', 'SEVELAMER CARBONATE', 'SEVELAMER HYDROCHLORIDE', 'SEVOFLURANE', 'SEYSARA', 'SEZABY', 'SIGNIFOR', 'SIKLOS', 'SILDENAFIL CITRATE', 'SILENOR', 'SILODOSIN', 'SILPHEN', 'SILVADENE', 'SIMVASTATIN', 'SINCALIDE', 'SINEQUAN', 'SINGULAIR', 'SINUVA', 'SIROLIMUS', 'SIRTURO', 'SITAGLIPTIN PHOSPHATE', 'SITAGLIPTIN PHOSPHATE; METFORMIN HYDROCHLORIDE', 'SITAGLIPTIN; METFORMIN HYDROCHLORIDE', 'SITAVIG', 'SKELID', 'SKLICE', 'SKYCLARYS', 'SKYLA', 'SLO-BID', 'SLOW-K', 'SLYND', 'SOAANZ', 'SODIUM ACETATE', 'SODIUM AMINOSALICYLATE', 'SODIUM BICARBONATE', 'SODIUM FERRIC GLUCONATE COMPLEX IN SUCROSE', 'SODIUM NITRITE', 'SODIUM NITROPRUSSIDE', 'SODIUM OXYBATE', 'SODIUM PHENYLBUTYRATE', 'SODIUM SULFACETAMIDE', 'SODIUM SULFATE; POTASSIUM SULFATE; MAGNESIUM SULFATE', 'SODIUM TETRADECYL SULFATE', 'SODIUM THIOSULFATE', 'SOFOSBUVIR', 'SOGROYA', 'SOHONOS', 'SOJOURN', 'SOLARAZE', 'SOLIFENACIN SUCCINAT

In [None]:
import json

def count_unique_contents(file_path):
    """Count the distinct message ``content`` values in a JSON Lines file.

    Every non-blank line is parsed as a JSON object with a ``'messages'``
    list; the ``'content'`` of each message is collected into a set.
    Blank or whitespace-only lines are skipped rather than passed to the
    JSON parser.

    Args:
        file_path: Path of the JSON Lines file to scan.

    Returns:
        The number of distinct ``content`` values across all messages.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``'messages'`` or a message lacks
            ``'content'``.
        TypeError: If a ``content`` value is not hashable (e.g. a list).
    """
    unique_contents = set()

    # JSON text is UTF-8 by specification; do not rely on the locale default.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # json.loads raises on empty input, so skip blank lines.
            if not line.strip():
                continue
            data = json.loads(line)
            for message in data['messages']:
                unique_contents.add(message['content'])

    return len(unique_contents)

: 