# Create Sample Data for Cleaning with Excel

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [2]:
# Load the raw supplier export and show its (rows, columns) shape as the cell output.
raw_csv_path = "../data/raw/Medical-Equipment-Suppliers.csv"
med_data = pd.read_csv(raw_csv_path)
med_data.shape

(63112, 17)

In [3]:
# Show the dtype pandas inferred for each column of the raw frame.
med_data.dtypes

provider_id                 int64
acceptsassignement           bool
participationbegindate     object
businessname               object
practicename               object
practiceaddress1           object
practiceaddress2           object
practicecity               object
practicestate              object
practicezip9code            int64
telephonenumber             int64
specialitieslist           object
providertypelist           object
supplieslist               object
latitude                  float64
longitude                 float64
is_contracted_for_cba        bool
dtype: object

In [4]:
# Count suppliers per state/territory code to see which states have enough rows to sample from.
state_codes = med_data['practicestate']
state_codes.value_counts()

practicestate
CA    5233
NY    4924
TX    4728
FL    4452
PA    2827
IL    2274
OH    2251
NC    2112
GA    1983
MI    1964
NJ    1812
TN    1586
VA    1582
IN    1422
MO    1338
AZ    1238
MN    1212
MA    1196
WA    1195
WI    1184
AL    1146
KY    1146
MD    1058
SC     998
CO     994
LA     897
IA     878
OK     827
KS     739
CT     722
AR     698
OR     680
MS     642
NE     486
NV     486
WV     473
UT     423
ME     341
NM     336
ID     334
NH     287
SD     245
MT     237
PR     224
RI     208
DE     206
HI     190
ND     180
WY     150
VT     140
DC     111
AK     101
VI       9
GU       5
MP       2
Name: count, dtype: int64

In [5]:
# Keep only California suppliers, then draw a fixed-seed sample of 1000 rows
# and renumber them 0..999 so later label-based row edits are predictable.
is_california = med_data['practicestate'].eq('CA')
ca_data = med_data.loc[is_california].copy()
sampled_med_data = (
    ca_data
    .sample(n=1000, random_state=42)
    .reset_index(drop=True)
)
sampled_med_data.shape

(1000, 17)

In [6]:
# Columns retained for the Excel exercise, in the order they should appear in the output.
columns_to_keep = [
    # identity
    "provider_id",
    "businessname",
    # location
    "practiceaddress1",
    "practicecity",
    "practicestate",
    "practicezip9code",
    # offering
    "supplieslist",
    "specialitieslist",
    # contact
    "telephonenumber",
]
# Select the subset by label and detach it from the wider sampled frame.
sampled_med_data = sampled_med_data.loc[:, columns_to_keep].copy()
sampled_med_data.head()

Unnamed: 0,provider_id,businessname,practiceaddress1,practicecity,practicestate,practicezip9code,supplieslist,specialitieslist,telephonenumber
0,20420603,VIVIDCITY PHARMACY LLC,12504 MAGNOLIA BLVD,VALLEY VILLAGE,CA,916072306,Oral Antiemetic Drugs|Epoetin|Oral Anticancer ...,Pharmacy,3239870315
1,20537201,THRIFTY PAYLESS INC,910 DIABLO AVE,NOVATO,CA,949477311,"Commodes, Urinals, Bedpans|Ostomy Supplies|Neb...",Pharmacy,4158981905
2,20369591,WALGREEN CO,1625 W SUNSET BLVD,LOS ANGELES,CA,900264226,Blood Glucose Monitors/Supplies (Non-Mail Orde...,Pharmacy,2134829286
3,20436101,GARFIELD BEACH CVS LLC,23806 MAIN ST,CARSON,CA,907455746,"Commodes, Urinals, Bedpans|Parenteral Equipmen...",Pharmacy,3109526640
4,20425667,GARFIELD BEACH CVS LLC,1400 FITGERALD DR,PINOLE,CA,945642250,Parenteral Equipment and/or Supplies|Ostomy Su...,Pharmacy,5102229281


In [7]:
# Pick 10 existing rows (fixed seed) and append them to the sample so the
# resulting frame contains exact duplicate records; the index is renumbered.
duplicates = sampled_med_data.sample(n=10, random_state=7)
dirty_data = pd.concat(
    (sampled_med_data, duplicates),
    axis=0,
    ignore_index=True,
)

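**The cells below deliberately introduce a few data-quality problems (blank names, missing phone numbers, misspellings, inconsistent casing) to be cleaned up later during preprocessing.**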

In [8]:
# Replace the business name with an empty string on rows 15 through 25.
# `.loc` slices by label, so both endpoints are included.
blank_name_rows = slice(15, 25)
dirty_data.loc[blank_name_rows, 'businessname'] = ""

In [9]:
# Simulate missing phone numbers on rows 100 through 110 (label slice, both ends included).
# `telephonenumber` is loaded as int64, which cannot hold NaN. Assigning NaN directly
# depends on pandas silently upcasting the column, which newer pandas versions warn
# about and are slated to reject. Casting to float64 first makes the dtype change
# explicit and yields the same data the implicit upcast would have produced.
dirty_data['telephonenumber'] = dirty_data['telephonenumber'].astype('float64')
dirty_data.loc[100:110, 'telephonenumber'] = np.nan

In [10]:
# Plant two different misspellings of "Pharmacy" in the speciality column,
# one on row 30 and one on row 60.
dirty_data.loc[[30, 60], 'specialitieslist'] = ["Pharamcy", "Pharacy"]

In [11]:
# Lower-case the state code on every 20th row (0, 20, 40, ...) to create
# inconsistent casing in an otherwise upper-case column.
every_20th_row = dirty_data.index[::20]
dirty_data.loc[every_20th_row, 'practicestate'] = (
    dirty_data.loc[every_20th_row, 'practicestate'].str.lower()
)

In [12]:
# Save the deliberately dirtied sample as an Excel workbook, without the index column.
output_path = "../data/processed/medical_supplier_dirty_sample.xlsx"
dirty_data.to_excel(excel_writer=output_path, index=False)

In [13]:
# Re-read the dirty sample workbook from disk; from here on `dirty_data` holds
# what is stored in the file rather than the in-memory frame built above.
# NOTE(review): values may come back with different dtypes / missing-value markers
# than were written (e.g. the blanked business names) - confirm if that matters.
dirty_data = pd.read_excel(io="../data/processed/medical_supplier_dirty_sample.xlsx")

In [14]:
# Upper-case city name -> coarse California region.
# Declared region-by-region and flattened into a single city-keyed dict.
region_lookup = {
    city: region
    for region, cities in (
        ("Southern CA", ("LOS ANGELES", "SAN DIEGO", "VALLEY VILLAGE")),
        ("Northern CA", ("SAN FRANCISCO", "SACRAMENTO", "OAKLAND")),
        ("Central CA", ("FRESNO", "BAKERSFIELD")),
    )
    for city in cities
}

In [15]:
# Derive a `region` column: upper-case the city, look it up in `region_lookup`,
# and label every city missing from the lookup as "Unknown Region".
# Done as one chain, so no temporary helper column is left on the frame.
dirty_data["region"] = (
    dirty_data["practicecity"]
    .str.upper()
    .map(region_lookup)
    .fillna("Unknown Region")
)

In [16]:
# Attach a synthetic expiry date to every row: today's date plus a random
# offset of 30 to 1094 days (upper bound exclusive, as in the original draw).
# A dedicated seeded generator keeps the offsets identical across
# Restart & Run All, matching the fixed `random_state` used for sampling above.
# The dates still move with the day the notebook is run.
expiry_rng = np.random.default_rng(42)
expiry_offsets_days = expiry_rng.integers(30, 1095, size=len(dirty_data))
expiry_base_day = pd.Timestamp.today().normalize()  # midnight today; time of day is irrelevant
# `.date` converts the timestamps to plain `datetime.date` objects (no time component).
dirty_data["expiry_date"] = (
    expiry_base_day + pd.to_timedelta(expiry_offsets_days, unit="D")
).date

In [17]:
# Attach a synthetic price drawn uniformly from [10, 500), rounded to 2 decimals.
# A dedicated seeded generator makes the prices reproducible on every full re-run.
# The seed is chosen independently of any other random draw in the notebook so
# the price column does not mirror another generated column.
price_rng = np.random.default_rng(7)
dirty_data["price"] = np.round(price_rng.uniform(10, 500, len(dirty_data)), 2)

# Save the enriched frame (region, expiry_date, price added) as a second workbook.
output_path = "../data/processed/medical_supplier_enhanced.xlsx"
dirty_data.to_excel(output_path, index=False)

In [18]:
# Display the final column labels of the enriched frame as a sanity check.
dirty_data.columns

Index(['provider_id', 'businessname', 'practiceaddress1', 'practicecity',
       'practicestate', 'practicezip9code', 'supplieslist', 'specialitieslist',
       'telephonenumber', 'region', 'expiry_date', 'price'],
      dtype='object')