In [1]:
#Import pandas
import pandas as pd

#Read the three clinical trial CSV files, in order, into their dataframes
patients, treatments, adverse_reactions = map(
    pd.read_csv,
    ('patients.csv', 'treatments.csv', 'adverse_reactions.csv'),
)

## Helpful techniques for visual + programmatic assessment

In [2]:
#Preview the first five rows (the head() default) of the adverse reactions table
adverse_reactions.head(n=5)

Unnamed: 0,given_name,surname,adverse_reaction
0,berta,napolitani,injection site discomfort
1,lena,baer,hypoglycemia
2,joseph,day,hypoglycemia
3,flavia,fiorentino,cough
4,manouck,wubbels,throat irritation


In [3]:
#Preview the first five rows (the head() default) of the treatments table
treatments.head(n=5)

Unnamed: 0,given_name,surname,auralin,novodra,hba1c_start,hba1c_end,hba1c_change
0,veronika,jindrová,41u - 48u,-,7.63,7.2,
1,elliot,richardson,-,40u - 45u,7.56,7.09,0.97
2,yukitaka,takenaka,-,39u - 36u,7.68,7.25,
3,skye,gormanston,33u - 36u,-,7.97,7.62,0.35
4,alissa,montez,-,33u - 29u,7.78,7.46,0.32


In [4]:
#Print a summary of treatments: row count, column names, non-null counts and dtypes.
#Compare each column's non-null count against the total entries to spot missing values.
treatments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 280 entries, 0 to 279
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   given_name    280 non-null    object 
 1   surname       280 non-null    object 
 2   auralin       280 non-null    object 
 3   novodra       280 non-null    object 
 4   hba1c_start   280 non-null    float64
 5   hba1c_end     280 non-null    float64
 6   hba1c_change  171 non-null    float64
dtypes: float64(3), object(4)
memory usage: 15.4+ KB


In [5]:
#Print a summary of adverse_reactions: row count, column names, non-null counts and dtypes.
#Compare each column's non-null count against the total entries to spot missing values.
adverse_reactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34 entries, 0 to 33
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   given_name        34 non-null     object
 1   surname           34 non-null     object
 2   adverse_reaction  34 non-null     object
dtypes: object(3)
memory usage: 944.0+ bytes


In [6]:
#Get the dimensions of the adverse_reactions dataframe as a (rows, columns) tuple
adverse_reactions.shape

(34, 3)

In [7]:
#Get all of the column labels of adverse_reactions programmatically (returned as an Index)
adverse_reactions.columns

Index(['given_name', 'surname', 'adverse_reaction'], dtype='object')

In [8]:
#Get the row index (the row labels) of the adverse_reactions dataframe
adverse_reactions.index

RangeIndex(start=0, stop=34, step=1)

## A single observational unit is stored in multiple tables.

In [9]:
#Look again at the first five rows (the head() default) of the adverse reactions table
adverse_reactions.head(n=5)

Unnamed: 0,given_name,surname,adverse_reaction
0,berta,napolitani,injection site discomfort
1,lena,baer,hypoglycemia
2,joseph,day,hypoglycemia
3,flavia,fiorentino,cough
4,manouck,wubbels,throat irritation


In [10]:
#Look again at the first five rows (the head() default) of the treatments table
treatments.head(n=5)

Unnamed: 0,given_name,surname,auralin,novodra,hba1c_start,hba1c_end,hba1c_change
0,veronika,jindrová,41u - 48u,-,7.63,7.2,
1,elliot,richardson,-,40u - 45u,7.56,7.09,0.97
2,yukitaka,takenaka,-,39u - 36u,7.68,7.25,
3,skye,gormanston,33u - 36u,-,7.97,7.62,0.35
4,alissa,montez,-,33u - 29u,7.78,7.46,0.32


## Variables are stored in both rows and columns.

In [11]:
#Isolate the auralin and novodra columns: the treatment a patient received is
#encoded in the column headers while the cells hold the dose values, so
#variables are stored in both rows and columns.
treatments.loc[:, ['auralin', 'novodra']]

Unnamed: 0,auralin,novodra
0,41u - 48u,-
1,-,40u - 45u
2,-,39u - 36u
3,33u - 36u,-
4,-,33u - 29u
...,...,...
275,45u - 51u,-
276,-,49u - 49u
277,23u - 36u,-
278,31u - 38u,-


In [12]:
#Programmatic assessment: select the rows where BOTH auralin and novodra hold a
#value (i.e. neither column is the '-' placeholder). A patient will not take
#both drugs at the same time, so no rows should be returned; the empty result
#below confirms that.

treatments.loc[treatments['auralin'].ne('-') & treatments['novodra'].ne('-')]

Unnamed: 0,given_name,surname,auralin,novodra,hba1c_start,hba1c_end,hba1c_change


With this confirmed, we can resolve the issue during the cleaning stage by simplifying the data structure: replace the `auralin` and `novodra` columns with three columns - the treatment (auralin or novodra), the start dose, and the end dose.

## Multiple variables are stored in one column.

In [13]:
#Preview the first five rows (the head() default) of the patients dataframe
patients.head(n=5)

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,contact,birthdate,weight,height,bmi
0,1,female,Zoe,Wellish,576 Brown Bear Drive,Rancho California,California,92390.0,United States,951-719-9170ZoeWellish@superrito.com,7/10/1976,121.7,66,19.6
1,2,female,Pamela,Hill,2370 University Hill Road,Armstrong,Illinois,61812.0,United States,PamelaSHill@cuvox.de+1 (217) 569-3204,4/3/1967,118.8,66,19.2
2,3,male,Jae,Debord,1493 Poling Farm Road,York,Nebraska,68467.0,United States,402-363-6804JaeMDebord@gustr.com,2/19/1980,177.8,71,24.8
3,4,male,Liêm,Phan,2335 Webster Street,Woodbridge,NJ,7095.0,United States,PhanBaLiem@jourrapide.com+1 (732) 636-8246,7/26/1951,220.9,70,31.7
4,5,male,Tim,Neudorf,1428 Turkey Pen Lane,Dothan,AL,36303.0,United States,334-515-7487TimNeudorf@cuvox.de,2/18/1928,192.3,27,26.1


In [14]:
#Tally each distinct value in the contact column to inspect its validity.
#Do the smushed-together phone/email values all follow the same format?
#No: e.g. 304-438-2648SandraCTaylor@dayrep.com puts the phone number first,
#while other entries put the email first, and the phone formats themselves vary.

patients.loc[:, 'contact'].value_counts()

johndoe@email.com1234567890                        6
PatrickGersten@rhyta.com402-848-4923               2
304-438-2648SandraCTaylor@dayrep.com               2
JakobCJakobsen@einrot.com+1 (845) 858-7707         2
PavelFilipek@rhyta.com1 952 431 5166               1
                                                  ..
CoralieAllaire@armyspy.com+1 (828) 586-5050        1
ChibuzoOkoli@einrot.com+1 (918) 971-5864           1
EllenRLuman@einrot.com920-849-0384                 1
LeVietThong@gustr.com+1 (612) 208-2965             1
ChidaluOnyekaozulu@jourrapide.com1 360 443 2060    1
Name: contact, Length: 483, dtype: int64