# Libraries and Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from scipy import stats

In [2]:
# Read the raw medical CSV; index_col=0 promotes the file's first column to the row index
df = pd.read_csv(
    'medical_raw_data.csv',
    index_col=0,
)

# Preview the first five rows
df.head(n=5)

Unnamed: 0,CaseOrder,Customer_id,Interaction,UID,City,State,County,Zip,Lat,Lng,...,TotalCharge,Additional_charges,Item1,Item2,Item3,Item4,Item5,Item6,Item7,Item8
1,1,C412403,8cd49b13-f45a-4b47-a2bd-173ffa932c2f,3a83ddb66e2ae73798bdf1d705dc0932,Eva,AL,Morgan,35621,34.3496,-86.72508,...,3191.048774,17939.40342,3,3,2,2,4,3,3,4
2,2,Z919181,d2450b70-0337-4406-bdbb-bc1037f1734c,176354c5eef714957d486009feabf195,Marianna,FL,Jackson,32446,30.84513,-85.22907,...,4214.905346,17612.99812,3,4,3,4,4,4,3,3
3,3,F995323,a2057123-abf5-4a2c-abad-8ffe33512562,e19a0fa00aeda885b8a436757e889bc9,Sioux Falls,SD,Minnehaha,57110,43.54321,-96.63772,...,2177.586768,17505.19246,2,4,4,4,3,4,3,3
4,4,A879973,1dec528d-eb34-4079-adce-0d7a40e82205,cd17d7b6d152cb6f23957346d11c3f07,New Richland,MN,Waseca,56072,43.89744,-93.51479,...,2465.118965,12993.43735,3,5,5,3,4,5,5,5
5,5,C544523,5885f56b-d6da-43a3-8760-83583af94266,d2f0425877b10ed6bb381f3e2579424a,West Point,VA,King William,23181,37.59894,-76.88958,...,1885.655137,3716.525786,2,1,3,3,5,3,4,3


# Initial Exploration

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,CaseOrder,Zip,Lat,Lng,Population,Children,Age,Income,VitD_levels,Doc_visits,...,TotalCharge,Additional_charges,Item1,Item2,Item3,Item4,Item5,Item6,Item7,Item8
count,10000.0,10000.0,10000.0,10000.0,10000.0,7412.0,7586.0,7536.0,10000.0,10000.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,50159.3239,38.751099,-91.24308,9965.2538,2.098219,53.295676,40484.438268,19.412675,5.0122,...,5891.538261,12934.528586,3.5188,3.5067,3.5111,3.5151,3.4969,3.5225,3.494,3.5097
std,2886.89568,27469.588208,5.403085,15.205998,14824.758614,2.155427,20.659182,28664.86105,6.723277,1.045734,...,3377.558136,6542.601544,1.031966,1.034825,1.032755,1.036282,1.030192,1.032376,1.021405,1.042312
min,1.0,610.0,17.96719,-174.20969,0.0,0.0,18.0,154.08,9.519012,1.0,...,1256.751699,3125.702716,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,2500.75,27592.0,35.25512,-97.352982,694.75,0.0,35.0,19450.7925,16.513171,4.0,...,3253.239465,7986.487642,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
50%,5000.5,50207.0,39.419355,-88.39723,2769.0,1.0,53.0,33942.28,18.08056,5.0,...,5852.250564,11573.979365,4.0,3.0,4.0,4.0,3.0,4.0,3.0,3.0
75%,7500.25,72411.75,42.044175,-80.43805,13945.0,3.0,71.0,54075.235,19.78974,6.0,...,7614.989701,15626.491033,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
max,10000.0,99929.0,70.56099,-65.29017,122814.0,10.0,89.0,207249.13,53.019124,9.0,...,21524.22421,30566.07313,8.0,7.0,8.0,7.0,7.0,7.0,7.0,7.0


In [4]:
# Per-column non-null counts and dtypes; counts below the row total reveal missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 1 to 10000
Data columns (total 52 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CaseOrder           10000 non-null  int64  
 1   Customer_id         10000 non-null  object 
 2   Interaction         10000 non-null  object 
 3   UID                 10000 non-null  object 
 4   City                10000 non-null  object 
 5   State               10000 non-null  object 
 6   County              10000 non-null  object 
 7   Zip                 10000 non-null  int64  
 8   Lat                 10000 non-null  float64
 9   Lng                 10000 non-null  float64
 10  Population          10000 non-null  int64  
 11  Area                10000 non-null  object 
 12  Timezone            10000 non-null  object 
 13  Job                 10000 non-null  object 
 14  Children            7412 non-null   float64
 15  Age                 7586 non-null   float64
 16  Education

In [5]:
# (rows, columns) of the loaded frame
df.shape

(10000, 52)

# C4: Data Cleaning Code (Detection)

The code in this section only identifies data-quality issues; no cleaning or wrangling steps are performed at this stage.

In [6]:
# Count the rows that contain no missing values at all, i.e. the rows that would
# remain if every row with ANY NA value were dropped.
# dropna() returns a new frame and the result is not assigned back, so df itself
# is left unchanged (this section is detection only).
len(df.dropna())

Dropping every row that contains _any_ NA value would leave only 2,313 complete rows out of the original 10,000, discarding roughly 77% of the data. Listwise deletion is therefore not a viable option for cleaning this dataset.

### Check that all four patient identifier variables are unique

There are 10,000 rows in the dataset. Four columns serve as unique (encrypted) identifiers for the patients that have been admitted: `'CaseOrder'`, `'Customer_id'`, `'Interaction'`, `'UID'`.

Therefore, each column listed should have 10,000 unique values.

Use `df['col'].nunique()` to retrieve a count of the number of unique values in a column (McCoy 2024).

In [7]:
# CaseOrder is one of the four identifier columns: expect 10,000 unique values, one per row
df['CaseOrder'].nunique()

10000

In [8]:
# Customer_id is one of the four identifier columns: expect 10,000 unique values, one per row
df['Customer_id'].nunique()

10000

In [9]:
# Interaction is one of the four identifier columns: expect 10,000 unique values, one per row
df['Interaction'].nunique()

10000

In [10]:
# UID is one of the four identifier columns: expect 10,000 unique values, one per row
df['UID'].nunique()

10000

In [11]:
# Gather the four identifier columns into one frame so they can be inspected side by side
unique_identifiers = df.loc[
    :,
    ['CaseOrder', 'Customer_id', 'Interaction', 'UID'],
]
unique_identifiers

Unnamed: 0,CaseOrder,Customer_id,Interaction,UID
1,1,C412403,8cd49b13-f45a-4b47-a2bd-173ffa932c2f,3a83ddb66e2ae73798bdf1d705dc0932
2,2,Z919181,d2450b70-0337-4406-bdbb-bc1037f1734c,176354c5eef714957d486009feabf195
3,3,F995323,a2057123-abf5-4a2c-abad-8ffe33512562,e19a0fa00aeda885b8a436757e889bc9
4,4,A879973,1dec528d-eb34-4079-adce-0d7a40e82205,cd17d7b6d152cb6f23957346d11c3f07
5,5,C544523,5885f56b-d6da-43a3-8760-83583af94266,d2f0425877b10ed6bb381f3e2579424a
...,...,...,...,...
9996,9996,B863060,a25b594d-0328-486f-a9b9-0567eb0f9723,39184dc28cc038871912ccc4500049e5
9997,9997,P712040,70711574-f7b1-4a17-b15f-48c54564b70f,3cd124ccd43147404292e883bf9ec55c
9998,9998,R778890,1d79569d-8e0f-4180-a207-d67ee4527d26,41b770aeee97a5b9e7f69c906a8119d7
9999,9999,E344109,f5a68e69-2a60-409b-a92f-ac0847b27db0,2bb491ef5b1beb1fed758cc6885c167a
