In [44]:
import pandas as pd
import numpy as np
import csv
from itertools import groupby, islice
from multiprocessing import cpu_count, Pool
import gc

def get_count_and_sum(drug_name, drugs_group):
    """Get Unique Count and Total Sum for a particular drug
    
    Args:
        drug_name: Name of the drug.
        drugs_group: Collection of records(dict) for the given drug_name
    Returns:
        A dict type with 'total_cost' (Total cost of that drug) and 'num_prescriber' (Number of unique prescribers)
    """

    unique_count = 0 # tracks unique count
    total_sum = 0 # tracks total sum
    
    null_count = 0
    total_count = 0
    
    unique_ids = set() # hashset for efficiency, maintains unique set of records

    for record in drugs_group:
        
        total_count += 1
        try:
            dc = float(record["drug_cost"]) if '.' in record["drug_cost"] else int(record["drug_cost"])
        
        except ValueError as e: # Handling NULL Values or Any Non Number value
            dc = 0
            null_count += 1
        
        total_sum += dc
        
        rid = record["id"]

        if rid not in unique_ids: # check if id of prescriber is unique
            unique_ids.add(rid)
            unique_count += 1

    avg_sum = total_sum // total_count
    
    total_sum = total_sum + avg_sum * null_count # NULL VALUES REPLACED WITH AVERAGE
    
    new_record = {"drug_name": drug_name,
                  "num_prescriber": unique_count,
                  "total_cost": total_sum}
    
    return new_record


def groupby_count_and_sum(data_path):
    """Reads a csv from the given path, groups by drug_name and returns unique count and total cost for each drug.
    
    Args:
        data_path: String path to csv (Example: "../input/de_cc_data.txt")
    Returns:
        A List type, where each element is a dictionary of format {"drug_name": NAME OF THE DRUG, 
                                                                   "num_prescriber": UNIQUE PRESCRIBER COUNT,                  
                                                                   "total_cost": TOTAL COST OF THAT DRUG}
    """
    new_records = []
    print("..reading input from {}".format(data_path))
    with open(data_path, newline='') as in_file:
        
        records = csv.DictReader(in_file) # Reader that maps the information read into a dict
        
        # Sort records based on drug_name, required for python groupby
        print("Sorting..")
        drug_sorted_records = sorted(records, key = lambda r: r["drug_name"])
        
        # Groups based on drugname
        print("Grouping..")
        drugs_key_group = groupby(drug_sorted_records, key = lambda r: r["drug_name"])
        del records, drug_sorted_records; gc.collect()    
        
        # iterate over groups
        print("Creating desired records..")
        for drug_name, drugs_group in drugs_key_group:
            result = get_count_and_sum(drug_name, drugs_group)
            new_records.append(result)
        
        # free some memory, remove useless data
        del drugs_key_group; gc.collect()
    
    return new_records


def main(input_filepath):
    """Reads I/P, count unique, sum cost, Writes O/P"""
    
    # get desired result of unique counts and total cost from data in the file
    new_records = groupby_count_and_sum(input_filepath)
    
    # stable sort in reverse order on total_cost, names are already sorted
    new_records = sorted(new_records, key = lambda r: r["total_cost"], reverse=True)
    
    return pd.DataFrame(new_records)

In [19]:
def test(df1, df2):
    return np.all(df1.num_prescriber.values == df2.num_prescriber.values), np.all(df1.total_cost.values == df2.total_cost.values)

In [2]:
data_path = "../insight_testsuite/tests/test/input/itcont.txt"

it_path = "../insight_testsuite/tests/test_2/input/itcont.txt"
top_path = "../insight_testsuite/tests/test_2/output/top_cost_drug.txt"

In [3]:
df = pd.read_csv(data_path)

In [4]:
sample = df.sample(1000)
del df; gc.collect()

7

In [17]:
sample.to_csv(it_path, index=False)

In [41]:
sample = pd.read_csv(it_path)

In [42]:
sample['drug_cost'] = sample['drug_cost'].astype('int32')
sample = sample.reset_index(drop=True)
sample.head()

ValueError: Cannot convert non-finite values (NA or inf) to integer

In [None]:
sample = pd.read_csv("")

In [13]:
out = pd.DataFrame()
ndf = sample.groupby(by=["drug_name"]).id.nunique()
out['num_prescriber'] = ndf
out['drug_name'] = ndf.index
out.head()

Unnamed: 0_level_0,num_prescriber,drug_name
drug_name,Unnamed: 1_level_1,Unnamed: 2_level_1
ABILIFY,1,ABILIFY
ACETAMINOPHEN-CODEINE,1,ACETAMINOPHEN-CODEINE
ACYCLOVIR,1,ACYCLOVIR
ADVAIR DISKUS,3,ADVAIR DISKUS
AGGRENOX,1,AGGRENOX


In [14]:
costs = sample.groupby(by=["drug_name"]).agg({"drug_cost":"sum"})
costs.head()

Unnamed: 0_level_0,drug_cost
drug_name,Unnamed: 1_level_1
ABILIFY,51341
ACETAMINOPHEN-CODEINE,215
ACYCLOVIR,339
ADVAIR DISKUS,36316
AGGRENOX,5085


In [15]:
out = pd.concat([costs, out], sort=True, axis=1) \
                .reset_index(drop=True) \
                .rename(columns={"drug_cost":"total_cost"}) \
                .sort_values("drug_name", ascending=True) \
                .sort_values("total_cost", ascending=False, kind="mergesort")
out.head()

Unnamed: 0,total_cost,num_prescriber,drug_name
116,511098,1,ENBREL
186,175204,3,LATUDA
288,154569,3,RESTASIS
83,152177,6,CRESTOR
184,107632,5,LANTUS SOLOSTAR


In [48]:
manual = main(it_path)
manual.head()

..reading input from ../insight_testsuite/tests/test_2/input/itcont.txt
Sorting..
Grouping..
Creating desired records..


Unnamed: 0,drug_name,num_prescriber,total_cost
0,ENBREL,1,511098.0
1,LATUDA,3,175204.0
2,CRESTOR,6,167440.0
3,RESTASIS,3,154569.0
4,LANTUS SOLOSTAR,5,119576.0


In [35]:
manual.sort_values("num_prescriber", ascending=False)

Unnamed: 0,drug_name,num_prescriber,total_cost
20,ATORVASTATIN CALCIUM,15,43974
59,AMLODIPINE BESYLATE,15,13950
48,GABAPENTIN,14,18576
95,ATENOLOL,13,6417
21,METOPROLOL SUCCINATE,13,41334
58,SIMVASTATIN,12,14107
79,PANTOPRAZOLE SODIUM,12,8911
83,METOPROLOL TARTRATE,12,8182
51,LOSARTAN POTASSIUM,12,17201
33,LEVOTHYROXINE SODIUM,12,26793


In [21]:
test(out, manual)

(True, True)

In [None]:
'LATUDA', 'RESTASIS', 'CRESTOR', 'LANTUS SOLOSTAR'

In [30]:
sample.loc[604, 'drug_cost'] = np.NaN
sample.loc[(sample.drug_name == "CRESTOR") & (sample.id == 1962581157)]

Unnamed: 0,id,prescriber_last_name,prescriber_first_name,drug_name,drug_cost
604,1962581157,TALTON-WILLIAMSON,STEPHANIE,CRESTOR,


In [46]:
sample.loc[209, 'drug_cost'] = 'wrong_value'
sample.loc[(sample.drug_name == "LANTUS SOLOSTAR")  & (sample.id == 1871519645)]

Unnamed: 0,id,prescriber_last_name,prescriber_first_name,drug_name,drug_cost
209,1871519645,WONG,CYNTHIA,LANTUS SOLOSTAR,wrong_value


In [36]:
sample.loc[(sample.drug_name == "ATORVASTATIN CALCIUM")]

Unnamed: 0,id,prescriber_last_name,prescriber_first_name,drug_name,drug_cost
2,1043517386,AVERY,NATALIE,ATORVASTATIN CALCIUM,919.0
140,1003932955,TOWNSEND,JACOB,ATORVASTATIN CALCIUM,3800.0
200,1740506948,BUNCE,KATHERINE,ATORVASTATIN CALCIUM,1086.0
242,1447219415,MANOLACHE,PETRICA,ATORVASTATIN CALCIUM,3008.0
371,1962627653,MCNEELY,CYNTHIA,ATORVASTATIN CALCIUM,2598.0
540,1972571552,ROSENBAUM,LEWIS,ATORVASTATIN CALCIUM,4332.0
651,1851457477,CONSIGLIERE,GINO,ATORVASTATIN CALCIUM,6144.0
664,1154586840,GIFFORD,SUSAN,ATORVASTATIN CALCIUM,5279.0
719,1346232493,POLYCHRONOPOULOS,SOTERIOS,ATORVASTATIN CALCIUM,2111.0
769,1114050630,WILEMAN,ROCHELLE,ATORVASTATIN CALCIUM,455.0


In [39]:
sample.loc[2, "drug_cost"] = np.NaN
sample.loc[(sample.drug_name == "ATORVASTATIN CALCIUM")]

Unnamed: 0,id,prescriber_last_name,prescriber_first_name,drug_name,drug_cost
2,1043517386,AVERY,NATALIE,ATORVASTATIN CALCIUM,
140,1003932955,TOWNSEND,JACOB,ATORVASTATIN CALCIUM,3800.0
200,1740506948,BUNCE,KATHERINE,ATORVASTATIN CALCIUM,1086.0
242,1447219415,MANOLACHE,PETRICA,ATORVASTATIN CALCIUM,3008.0
371,1962627653,MCNEELY,CYNTHIA,ATORVASTATIN CALCIUM,2598.0
540,1972571552,ROSENBAUM,LEWIS,ATORVASTATIN CALCIUM,4332.0
651,1851457477,CONSIGLIERE,GINO,ATORVASTATIN CALCIUM,6144.0
664,1154586840,GIFFORD,SUSAN,ATORVASTATIN CALCIUM,5279.0
719,1346232493,POLYCHRONOPOULOS,SOTERIOS,ATORVASTATIN CALCIUM,
769,1114050630,WILEMAN,ROCHELLE,ATORVASTATIN CALCIUM,455.0


In [47]:
sample.to_csv(it_path, index=False)

In [2]:
import argparse

parser = argparse.ArgumentParser(description='Pharmacy Counting Problem: Input and Output Paths..')
parser.add_argument('input_filepath', metavar='INPUT',
                    type=str, help='path to input csv file')
parser.add_argument('output_filepath', metavar='OUTPUT', 
                    type=str, help='path to input csv file')

args = parser.parse_args(['BAR', 'FOO'])

args.input_filepath

'BAR'

In [4]:
sample = df.sample(10000000)
sample.to_csv("../input/sample.csv", index=False)

In [9]:
del df, sample; gc.collect()

0

In [2]:
cpus = cpu_count()
cpus

8

In [3]:
#data_path = "../input/de_cc_data.txt"
data_path = "../input/sample.csv"

In [4]:
cols = [ 'id', 'prescriber_last_name', 'prescriber_first_name', 'drug_name', 'drug_cost', ]
new_cols = ['drug_name', 'num_prescriber', 'total_cost']

In [3]:
things = [("animal", "bear"), ("animal", "duck"), ("animal", "tiger"),
          ("plant", "cactus"),
          ("vehicle", "speed boat"), ("vehicle", "school bus"),
         ("drink", "juice"), ("drink", "shake"),
         ("food", "chicken"), ("food", "paneer"),
         ("number", "1"), ("number", "2"),
         ("game", "cricket"), ("game", "football"),]


key_group = groupby(things, lambda x: x[0])
key_group = grouper(2,  key_group)

for groups in key_group:
    print([(k, list(g)) for k, g in groups])


[('animal', [('animal', 'bear'), ('animal', 'duck'), ('animal', 'tiger')]), ('plant', [('plant', 'cactus')])]
[('vehicle', [('vehicle', 'speed boat'), ('vehicle', 'school bus')]), ('drink', [('drink', 'juice'), ('drink', 'shake')])]
[('food', [('food', 'chicken'), ('food', 'paneer')]), ('number', [('number', '1'), ('number', '2')])]
[('game', [('game', 'cricket'), ('game', 'football')])]


# Extra

In [3]:
import os
import time
from contextlib import contextmanager
import psutil


@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print('[{}] done in {:.0f} s'.format(name, time.time() - t0))


def cpuStats():
    pid = os.getpid()
    py = psutil.Process(pid)
    memoryUse = py.memory_info()[0] / 2. ** 30
    print('MEMORY GB:', memoryUse)

# Linear

In [4]:
with timer("sort_group"):
    with open(data_path, newline='') as file:
        records = csv.DictReader(file)
        new_records = []
        
        cpuStats()
        
        # groupby drugname
        print("Sorting..")
        drug_sorted_records = sorted(records, key = lambda r: r["drug_name"])
        cpuStats()
        
        print("Grouping..")
        drugs_key_group = groupby(drug_sorted_records, key = lambda r: r["drug_name"])
        cpuStats()
        del records, drug_sorted_records; gc.collect()    
        cpuStats()
        
        # iterate over groups
        print("Building required records..")
        for drug_name, drugs_group in drugs_key_group:
            result = get_count_and_sum(drug_name, drugs_group)
            new_records.append(result)
        
        del drugs_key_group; gc.collect()
        cpuStats()

MEMORY GB: 0.08126449584960938
Sorting..
MEMORY GB: 14.263427734375
Grouping..
MEMORY GB: 14.263427734375
MEMORY GB: 14.263427734375
Building required records..
MEMORY GB: 5.899135589599609
[sort_group] done in 309 s


In [5]:
new_records = sorted(new_records, key = lambda r: r["total_cost"], reverse=True)

In [9]:
# write records
OUTPUT_HEADER = ["drug_name", "num_prescriber", "total_cost"]

with open(output_filepath, 'w') as out_file:
    writer = csv.DictWriter(out_file, fieldnames=OUTPUT_HEADER)
    writer.writeheader()
    for record in new_records:
        writer.writerow(record)

In [7]:
manual_result = pd.DataFrame(new_records)
print(manual_result.shape)
manual_result.head()

(2749, 3)


Unnamed: 0,drug_name,num_prescriber,total_cost
0,HARVONI,5221,5992223000.0
1,CRESTOR,147287,2704097000.0
2,LANTUS SOLOSTAR,106353,2237689000.0
3,ADVAIR DISKUS,126943,2038326000.0
4,SPIRIVA,114329,1966117000.0


# Parallel

In [4]:
with timer("sort_group"):
    with open(data_path, newline='') as file:
        records = csv.DictReader(file)
        new_records = []

        # groupby drugname
        cpuStats()
        print("Sorting..")
        drug_sorted_records = sorted(records, key = lambda r: r["drug_name"], reverse=True)
        cpuStats()
        
        print("Grouping..")
        grouped = groupby(drug_sorted_records, key = lambda r: r["drug_name"])
        
        print("Chunking..")
        drugs_key_group = grouper(cpu_count(), grouped)
        cpuStats()
        
        del records, drug_sorted_records; gc.collect()    
        cpuStats()
        
        # iterate over groups
        
        pool = Pool(processes=cpu_count())
        for sublist_key_group in drugs_key_group:
            list_of_key_group = ((drug, tuple(group)) for drug, group in sublist_key_group)
            
            del sublist_key_group; gc.collect() 

            ## parallel process groups
            result = pool.map(get_count_and_sum, list_of_key_group)
            new_records.extend(result)
            
            del result; gc.collect() 
        
        del drugs_key_group; gc.collect()
        cpuStats()
        
        pool.close()

MEMORY GB: 0.08770370483398438
Sorting..
MEMORY GB: 14.269054412841797
Grouping..
Chunking..
MEMORY GB: 14.269054412841797
MEMORY GB: 14.269054412841797


OSError: [Errno 12] Cannot allocate memory

In [69]:
manual_result2 = pd.DataFrame(new_records)
manual_result2.head(10)

Unnamed: 0,drug_name,num_prescriber,total_cost
0,ZYVOX,15,1034700.75
1,ZYTIGA,473,95977667.74
2,ZYPREXA ZYDIS,27,368692.7
3,ZYPREXA RELPREVV,30,929145.71
4,ZYPREXA,126,1803823.0
5,ZYMAXID,25,158568.82
6,ZYLOPRIM,5,11312.26
7,ZYLET,26,126184.81
8,ZYKADIA,8,1010524.17
9,ZYFLO CR,21,1085320.46


In [47]:
manual_result.loc[manual_result['drug_name']=='INSULIN SYRINGE']

Unnamed: 0,drug_name,num_prescriber,total_cost
1058,INSULIN SYRINGE,1753,1069616.92


In [None]:
%%time
with open(data_path, newline='') as file:
    records = csv.DictReader(file)
    new_records = []

    # groupby drugname
    drug_sorted_records = sorted(records, key = lambda r: r["drug_name"].strip().lower(), reverse=True)
    drugs_key_group = groupby(drug_sorted_records, key = lambda r: r["drug_name"])

    drugs_key_group = split_every(cpu_count(), drugs_key_group)

    # iterate over groups
    pool = Pool(processes=cpu_count())
    
    for sublist_key_group in drugs_key_group:
        sublist_key_group = [(drug, list(group)) for drug, group in sublist_key_group]
        ## parallel process groups
        result = pool.map(get_count_and_sum, sublist_key_group)
        new_records.extend(result)
    
    pool.close()

Unnamed: 0,drug_name,num_prescriber,total_cost
0,ZYTIGA,18,3525985.23
1,ZYPREXA ZYDIS,2,21982.04
2,ZYPREXA RELPREVV,2,59365.17
3,ZYPREXA,5,75998.54
4,ZYLOPRIM,1,1503.72


In [25]:
manual_result = pd.DataFrame(new_records)
manual_result.sample(10)

Unnamed: 0,drug_name,num_prescriber,total_cost
141,TRAMADOL HCL ER,0,0.0
874,FLUVOXAMINE MALEATE,0,0.0
271,SALSALATE,2,6110.77
672,LIALDA,0,0.0
5,ZYLET,0,0.0
354,PRIVIGEN,0,0.0
1260,AZILECT,0,0.0
327,PROVENTIL HFA,26,49643.0
965,EMTRIVA,0,0.0
719,KLOR-CON,1,10378.62


In [19]:
manual_result.shape

(1384, 3)

In [20]:
manual_result.sample(10)

Unnamed: 0,drug_name,num_prescriber,total_cost
281,RIVASTIGMINE,53,169416.58
808,HUMALOG,142,1537018.18
1362,ACZONE,1,4551.34
639,LUNESTA,6,25004.68
501,NORETHINDRONE,1,73.31
1066,DEMECLOCYCLINE HCL,3,12429.16
389,POLYETHYLENE GLYCOL 3350,310,267261.38
1285,ASSURE ID INSULIN SAFETY,1,288.66
1232,BETOPTIC S,3,39712.16
188,TECFIDERA,10,2174326.25


# Using Pandas

In [23]:
data_path = "../insight_testsuite/tests/test_2/input/itcont.txt"
sample = pd.read_csv(data_path)
sample = sample.sort_values(by='drug_name')
sample.head()

Unnamed: 0,id,prescriber_last_name,prescriber_first_name,drug_name,drug_cost
3608822,1184686909,CHALISA,NUZHAT,1ST TIER UNIFINE PENTIPS,355.97
3752247,1386631851,CHAUDHURI,AJAY,1ST TIER UNIFINE PENTIPS,699.47
23326493,1679626097,WEBER,MICHAEL,1ST TIER UNIFINE PENTIPS,208.11
18269093,1801996756,REYER,JEFFREY,1ST TIER UNIFINE PENTIPS,372.48
16750424,1891801692,PATEL,PRATIP,1ST TIER UNIFINE PENTIPS,283.09


In [24]:
out = pd.DataFrame()
ndf = sample.groupby(by=["drug_name"]).id.nunique()
out['num_prescriber'] = ndf
out['drug_name'] = ndf.index
out.head()

Unnamed: 0_level_0,num_prescriber,drug_name
drug_name,Unnamed: 1_level_1,Unnamed: 2_level_1
1ST TIER UNIFINE PENTIPS,88,1ST TIER UNIFINE PENTIPS
1ST TIER UNIFINE PENTIPS PLUS,9,1ST TIER UNIFINE PENTIPS PLUS
ABACAVIR,2002,ABACAVIR
ABACAVIR-LAMIVUDINE-ZIDOVUDINE,462,ABACAVIR-LAMIVUDINE-ZIDOVUDINE
ABELCET,12,ABELCET


In [25]:
out = pd.concat([sample.groupby(by=["drug_name"]).agg({"drug_cost":"sum"}), out], sort=True, axis=1) \
.reset_index(drop=True).rename(columns={"drug_cost":"total_cost"}).sort_values("drug_name", ascending=True).sort_values("total_cost", ascending=False, kind="mergesort")

In [34]:
out = out.reset_index(drop=True)
out.head()

Unnamed: 0,total_cost,num_prescriber,drug_name
0,5992223000.0,5221,HARVONI
1,2704097000.0,147287,CRESTOR
2,2237689000.0,106353,LANTUS SOLOSTAR
3,2038326000.0,126943,ADVAIR DISKUS
4,1966117000.0,114329,SPIRIVA


In [6]:
out.to_csv("../insight_testsuite/tests/test_2/output/top_cost_drug.txt", index=False)

In [12]:
manual_result.head()

Unnamed: 0,drug_name,num_prescriber,total_cost
0,HARVONI,5221,5992223000.0
1,CRESTOR,147287,2704097000.0
2,LANTUS SOLOSTAR,106353,2237689000.0
3,ADVAIR DISKUS,126943,2038326000.0
4,SPIRIVA,114329,1966117000.0


In [27]:
np.all(manual_result.num_prescriber.values == out.num_prescriber.values)

True

In [28]:
manual_result.loc[manual_result.num_prescriber.values.astype(np.int) != out.num_prescriber.values.astype(np.int)].head(20)

Unnamed: 0,drug_name,num_prescriber,total_cost


In [29]:
out.loc[manual_result.num_prescriber.values != out.num_prescriber.values].head(20)

Unnamed: 0,total_cost,num_prescriber,drug_name


In [30]:
np.all(manual_result.num_prescriber.values == out.num_prescriber.values)
np.all(manual_result.total_cost.values == out.total_cost.values)

False

In [35]:
manual_result.loc[manual_result.total_cost.values.astype(np.int) != out.total_cost.values.astype(np.int)].head(20)

Unnamed: 0,drug_name,num_prescriber,total_cost
669,OXYMORPHONE HCL,1799,10983168.0
889,NYAMYC,5188,4706179.0
1310,DANAZOL,241,878510.0
1562,PHOSPHA 250 NEUTRAL,569,344590.0
1677,FLOMAX,57,211179.0
1764,PROCTOFOAM-HC,73,154122.0
2224,DEXTROSE IN LACTATED RINGERS,20,19492.0


In [38]:
out.loc[manual_result.total_cost.values.astype('int16') != out.total_cost.values.astype('int16')].head(20)

Unnamed: 0,total_cost,num_prescriber,drug_name
669,10983168.0,1799,OXYMORPHONE HCL
889,4706179.0,5188,NYAMYC
1310,878510.0,241,DANAZOL
1562,344590.0,569,PHOSPHA 250 NEUTRAL
1677,211179.0,57,FLOMAX
1764,154122.0,73,PROCTOFOAM-HC
2224,19492.0,20,DEXTROSE IN LACTATED RINGERS


In [42]:
int(manual_result.iloc[669]['total_cost']) == int(out.iloc[669]['total_cost'])

False

In [16]:
cdf = cdf.sort_values("drug_cost", ascending=False)
cdf.head()

Unnamed: 0_level_0,drug_cost
drug_name,Unnamed: 1_level_1
HARVONI,5992223000.0
CRESTOR,2704097000.0
LANTUS SOLOSTAR,2237689000.0
ADVAIR DISKUS,2038326000.0
SPIRIVA,1966117000.0


In [17]:
cdf.shape

(2749, 1)

In [27]:
np.all(cdf.drug_cost.values.astype(np.int) == manual_result.total_cost.values.astype(np.int))

False

In [35]:
cdf.loc[cdf.drug_cost.values.astype(np.int) != manual_result.total_cost.values.astype(np.int)].head(6)

Unnamed: 0_level_0,drug_cost
drug_name,Unnamed: 1_level_1
PROCTOFOAM-HC,154122.0
PHOSPHA 250 NEUTRAL,344590.0
NYAMYC,4706179.0
DEXTROSE IN LACTATED RINGERS,19492.0
DANAZOL,878510.0
BUTISOL SODIUM,69254.0


In [36]:
manual_result.loc[cdf.drug_cost.values.astype(np.int) != manual_result.total_cost.values.astype(np.int)].head(6)

Unnamed: 0,drug_name,num_prescriber,total_cost
758,PROCTOFOAM-HC,73,154122.0
871,PHOSPHA 250 NEUTRAL,569,344590.0
1024,NYAMYC,5188,4706179.0
2072,DEXTROSE IN LACTATED RINGERS,20,19492.0
2143,DANAZOL,241,878510.0
2404,BUTISOL SODIUM,24,69254.0


In [30]:
manual_result.loc[manual_result.drug_name == 'PROCTOFOAM-HC'].total_cost

758    154122.0
Name: total_cost, dtype: float64

In [32]:
cdf.loc[cdf.index == 'PROCTOFOAM-HC'].drug_cost

drug_name
PROCTOFOAM-HC    154122.0
Name: drug_cost, dtype: float64

In [None]:
np.all()

In [66]:
np.all(ndf == manual_result2.num_prescriber.values)

True

In [None]:
ndf.head()

In [None]:
df.isna().sum()

In [None]:
df.isnull().sum()

In [None]:
things = [("animal", "bear"), ("animal", "duck"), ("plant", "cactus"), ("vehicle", "speed boat"), ("vehicle", "school bus")]

for key, group in groupby(things, lambda x: x[0]):
    for thing in group:
        print ("A %s is a %s." % (thing[1], key))
    print (" ")