# Importing and Cleaning of Data 
From CSV file back to a "cleaned" CSV

In [1]:
#Importing Libraries
import pandas as pd
from pathlib import Path 
import requests
import json
import time
import numpy as np
import re

In [2]:
#Read CSV file into a Dataframe
vaccine_providers = Path("../Resources/Vaccines.gov__COVID-19_vaccinating_provider_locations_20241118.csv")
providers_df = pd.read_csv(vaccine_providers)

providers_df.head()

  providers_df = pd.read_csv(vaccine_providers)


Unnamed: 0,loc_phone,loc_name,loc_admin_street1,loc_admin_city,loc_admin_state,loc_admin_zip,web_address,insurance_accepted,walkins_accepted,med_name,in_stock,quantity_last_updated,latitude,longitude
0,(504) 737-6242,"CVS Pharmacy, Inc. #5340",9643-B JEFFERSON HWY,RIVER RIDGE,LA,70123,https://www.cvs.com/store-locator/details-dire...,True,True,"Moderna, SPIKEVAX, 50 mcg/0.5 mL, syringe",False,2024-08-01,29.957522,-90.217062
1,(215) 836-1323,"CVS Pharmacy, Inc. #2113",1600 WADSWORTH AVENUE,PHILADELPHIA,PA,19150,https://www.cvs.com/store-locator/details-dire...,True,True,"Pfizer-BioNTech, COVID-19 Vaccine, 10mcg/0.3 m...",False,2024-08-01,40.080415,-75.171958
2,(619) 315-0016,"CVS Pharmacy, Inc. #16504",2911 JAMACHA RD,EL CAJON,CA,92019,https://www.cvs.com/store-locator/details-dire...,True,True,"Pfizer-BioNTech, COMIRNATY, 30mcg/0.3 mL, syringe",True,2024-08-01,32.738217,-116.938327
3,253-851-6939,Rite Aid #RA105255,"4818 POINT FOSDICK DR.,NW",GIG HARBOR,WA,98335-1711,https://www.riteaid.com,True,False,"Pfizer-BioNTech, COMIRNATY, 30mcg/0.3 mL, syringe",False,2024-08-01,47.302477,-122.581456
4,7153929520.0,Walmart Inc #10-1447,3705 Tower Ave,Superior,WI,54880-5338,https://www.walmart.com/store/1447,True,True,"Pfizer-BioNTech, COMIRNATY, 30mcg/0.3 mL, syringe",False,2024-08-01,46.694518,-92.106958


In [4]:
#Location Name          object
#Phone Number           object
#Street Address         object
#City                   object
#State                  object
#Zip Code               object
#Website                object
#Insurance Accepted       bool
#Walk-Ins Accepted        bool
#Types of Vaccine       object
#In Stock                 bool
#Last Updated           object
#Latitude              float64
#Longitude             float64
#dtype: object

In [5]:
#Rename and reorganize columns of the DataFrame
renamed_df = providers_df.rename(columns={"loc_phone":"Phone Number", 
                                         "loc_name":"Location Name", 
                                         "loc_admin_street1":"Street Address",
                                         "loc_admin_city":"City", 
                                         "loc_admin_state":"State", 
                                         "loc_admin_zip":"Zip Code", 
                                         "web_address":"Website", 
                                         "insurance_accepted":"Insurance Accepted",
                                          "walkins_accepted": "Walk-Ins Accepted",
                                          "med_name":"Types of Vaccine", 
                                          "in_stock":"In Stock", 
                                          "quantity_last_updated":"Last Updated", 
                                          "latitude": "Latitude", 
                                          "longitude":"Longitude"})
vaccine_providers_df = renamed_df[["Location Name", 
                                   "Phone Number", 
                                   "Street Address", 
                                   "City", 
                                   "State", 
                                   "Zip Code",
                                  "Website",
                                  "Insurance Accepted",
                                  "Walk-Ins Accepted", 
                                  "Types of Vaccine",
                                  "In Stock",
                                  "Last Updated", 
                                  "Latitude", 
                                  "Longitude"]]
vaccine_providers_df.head()

Unnamed: 0,Location Name,Phone Number,Street Address,City,State,Zip Code,Website,Insurance Accepted,Walk-Ins Accepted,Types of Vaccine,In Stock,Last Updated,Latitude,Longitude
0,"CVS Pharmacy, Inc. #5340",(504) 737-6242,9643-B JEFFERSON HWY,RIVER RIDGE,LA,70123,https://www.cvs.com/store-locator/details-dire...,True,True,"Moderna, SPIKEVAX, 50 mcg/0.5 mL, syringe",False,2024-08-01,29.957522,-90.217062
1,"CVS Pharmacy, Inc. #2113",(215) 836-1323,1600 WADSWORTH AVENUE,PHILADELPHIA,PA,19150,https://www.cvs.com/store-locator/details-dire...,True,True,"Pfizer-BioNTech, COVID-19 Vaccine, 10mcg/0.3 m...",False,2024-08-01,40.080415,-75.171958
2,"CVS Pharmacy, Inc. #16504",(619) 315-0016,2911 JAMACHA RD,EL CAJON,CA,92019,https://www.cvs.com/store-locator/details-dire...,True,True,"Pfizer-BioNTech, COMIRNATY, 30mcg/0.3 mL, syringe",True,2024-08-01,32.738217,-116.938327
3,Rite Aid #RA105255,253-851-6939,"4818 POINT FOSDICK DR.,NW",GIG HARBOR,WA,98335-1711,https://www.riteaid.com,True,False,"Pfizer-BioNTech, COMIRNATY, 30mcg/0.3 mL, syringe",False,2024-08-01,47.302477,-122.581456
4,Walmart Inc #10-1447,7153929520.0,3705 Tower Ave,Superior,WI,54880-5338,https://www.walmart.com/store/1447,True,True,"Pfizer-BioNTech, COMIRNATY, 30mcg/0.3 mL, syringe",False,2024-08-01,46.694518,-92.106958


In [6]:
#Check data types
vaccine_providers_df.dtypes

Location Name          object
Phone Number           object
Street Address         object
City                   object
State                  object
Zip Code               object
Website                object
Insurance Accepted     object
Walk-Ins Accepted      object
Types of Vaccine       object
In Stock                 bool
Last Updated           object
Latitude              float64
Longitude             float64
dtype: object

In [7]:
#Convert all phone numbers to same format
#Convert phone numbers with a decimal to dashes
def convert_phone_number(phone):
    pattern = r"(\d{3})(\d{3})(\d{4})\.0"
    replacement = r"\1-\2-\3"
    return re.sub(pattern, replacement, phone)

# Apply the function to the 'Phone Number' column
vaccine_providers_df['Phone Number'] = vaccine_providers_df['Phone Number'].apply(convert_phone_number)

#Convert phone numbers from having (Areacode in parentheses) to 314-123-4567 format.
def convert_phone_number(phone):
    pattern = r"\((\d{3})\) (\d{3})-(\d{4})"
    replacement = r"\1-\2-\3"
    return re.sub(pattern, replacement, phone)

# Apply the function to the 'Phone Number' column
vaccine_providers_df['Phone Number'] = vaccine_providers_df['Phone Number'].apply(convert_phone_number)

#Convert phone numbers from 3141234567 to 314-123-4567 format.
def convert_phone_number(phone):
    pattern = r"(\d{3})(\d{3})(\d{4})"
    replacement = r"\1-\2-\3"
    return re.sub(pattern, replacement, phone)

# Apply the function to the 'Phone Number' column
vaccine_providers_df['Phone Number'] = vaccine_providers_df['Phone Number'].apply(convert_phone_number)

vaccine_providers_df.head()

Unnamed: 0,Location Name,Phone Number,Street Address,City,State,Zip Code,Website,Insurance Accepted,Walk-Ins Accepted,Types of Vaccine,In Stock,Last Updated,Latitude,Longitude
0,"CVS Pharmacy, Inc. #5340",504-737-6242,9643-B JEFFERSON HWY,RIVER RIDGE,LA,70123,https://www.cvs.com/store-locator/details-dire...,True,True,"Moderna, SPIKEVAX, 50 mcg/0.5 mL, syringe",False,2024-08-01,29.957522,-90.217062
1,"CVS Pharmacy, Inc. #2113",215-836-1323,1600 WADSWORTH AVENUE,PHILADELPHIA,PA,19150,https://www.cvs.com/store-locator/details-dire...,True,True,"Pfizer-BioNTech, COVID-19 Vaccine, 10mcg/0.3 m...",False,2024-08-01,40.080415,-75.171958
2,"CVS Pharmacy, Inc. #16504",619-315-0016,2911 JAMACHA RD,EL CAJON,CA,92019,https://www.cvs.com/store-locator/details-dire...,True,True,"Pfizer-BioNTech, COMIRNATY, 30mcg/0.3 mL, syringe",True,2024-08-01,32.738217,-116.938327
3,Rite Aid #RA105255,253-851-6939,"4818 POINT FOSDICK DR.,NW",GIG HARBOR,WA,98335-1711,https://www.riteaid.com,True,False,"Pfizer-BioNTech, COMIRNATY, 30mcg/0.3 mL, syringe",False,2024-08-01,47.302477,-122.581456
4,Walmart Inc #10-1447,715-392-9520,3705 Tower Ave,Superior,WI,54880-5338,https://www.walmart.com/store/1447,True,True,"Pfizer-BioNTech, COMIRNATY, 30mcg/0.3 mL, syringe",False,2024-08-01,46.694518,-92.106958


In [8]:
#Clean all zip codes to be 5 digits only
#Fix the zipcode to have five digits
def fix_zipcode(zipcode):
    match = re.match(r'(\d{5})-\d{4}', zipcode)
    if match:
        return match.group(1)
vaccine_providers_df['Zip Code'] = vaccine_providers_df['Zip Code'].apply(fix_zipcode)

# Display the updated DataFrame
vaccine_providers_df

Unnamed: 0,Location Name,Phone Number,Street Address,City,State,Zip Code,Website,Insurance Accepted,Walk-Ins Accepted,Types of Vaccine,In Stock,Last Updated,Latitude,Longitude
0,"CVS Pharmacy, Inc. #5340",504-737-6242,9643-B JEFFERSON HWY,RIVER RIDGE,LA,,https://www.cvs.com/store-locator/details-dire...,True,True,"Moderna, SPIKEVAX, 50 mcg/0.5 mL, syringe",False,2024-08-01,29.957522,-90.217062
1,"CVS Pharmacy, Inc. #2113",215-836-1323,1600 WADSWORTH AVENUE,PHILADELPHIA,PA,,https://www.cvs.com/store-locator/details-dire...,True,True,"Pfizer-BioNTech, COVID-19 Vaccine, 10mcg/0.3 m...",False,2024-08-01,40.080415,-75.171958
2,"CVS Pharmacy, Inc. #16504",619-315-0016,2911 JAMACHA RD,EL CAJON,CA,,https://www.cvs.com/store-locator/details-dire...,True,True,"Pfizer-BioNTech, COMIRNATY, 30mcg/0.3 mL, syringe",True,2024-08-01,32.738217,-116.938327
3,Rite Aid #RA105255,253-851-6939,"4818 POINT FOSDICK DR.,NW",GIG HARBOR,WA,98335,https://www.riteaid.com,True,False,"Pfizer-BioNTech, COMIRNATY, 30mcg/0.3 mL, syringe",False,2024-08-01,47.302477,-122.581456
4,Walmart Inc #10-1447,715-392-9520,3705 Tower Ave,Superior,WI,54880,https://www.walmart.com/store/1447,True,True,"Pfizer-BioNTech, COMIRNATY, 30mcg/0.3 mL, syringe",False,2024-08-01,46.694518,-92.106958
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135780,SAV-ON PHARMACY #3739,215-942-4894,105 E STREET RD,Feasterville Trevose,PA,,https://www.acmemarkets.com/vaccinations/home,True,True,"Pfizer-BioNTech, COVID-19 Vaccine, 3mcg/0.3 mL...",False,2024-08-01,40.147454,-74.999655
135781,Rite Aid #RA101653,603-532-6955,14 PETERBOROUGH STREET,JAFFREY,NH,03452,https://www.riteaid.com,True,False,"Moderna, SPIKEVAX, 50 mcg/0.5 mL, syringe",False,2024-08-01,42.814691,-72.021353
135782,"CVS Pharmacy, Inc. #4378",540-989-6633,4400 BRAMBLETON AVE,ROANOKE,VA,,https://www.cvs.com/store-locator/details-dire...,True,True,"Novavax, COVID-19 Vaccine, 5 mcg/0.5 mL, 5 dose",False,2024-08-01,37.224879,-80.011439
135783,Rite Aid #RA101963,724-438-9799,262 CONNELLSVILLE STREET,UNIONTOWN,PA,15401,https://www.riteaid.com,True,False,"Moderna, SPIKEVAX, 50 mcg/0.5 mL, syringe",False,2024-08-01,39.900490,-79.706902


In [9]:
#Convert the insuracne and walk-ins columns to booleans
def convert_Insurance_to_boolean(value):
    if value in ['True', True]:
        return True
    else:
        return False
vaccine_providers_df['Insurance Accepted'] = vaccine_providers_df['Insurance Accepted'].apply(convert_Insurance_to_boolean)

def convert_Walk_Ins_to_boolean(value):
    if value in ['True', True]:
        return True
    else:
        return False
vaccine_providers_df['Walk-Ins Accepted'] = vaccine_providers_df['Walk-Ins Accepted'].apply(convert_Walk_Ins_to_boolean)


In [10]:
#Check data types for all columns
vaccine_providers_df.dtypes

Location Name          object
Phone Number           object
Street Address         object
City                   object
State                  object
Zip Code               object
Website                object
Insurance Accepted       bool
Walk-Ins Accepted        bool
Types of Vaccine       object
In Stock                 bool
Last Updated           object
Latitude              float64
Longitude             float64
dtype: object

In [11]:
#Save DataFrame to a csv
vaccine_providers_df.to_csv("Vaccine Providers.csv")

In [12]:
#Save DataFrame to a csv
vaccine_providers_df.to_json("providers.json")

In [13]:
# Check for NaN values
nan_count = vaccine_providers_df.isnull().sum()

# Display the count of NaN values for each column
print("Count of NaN values in each column:")
print(nan_count)

# To get the locations of NaN values
nan_locations = vaccine_providers_df.isnull()
print("\nLocations of NaN values:")
print(nan_locations)

Count of NaN values in each column:
Location Name             0
Phone Number              0
Street Address            0
City                      0
State                     0
Zip Code              91548
Website                1068
Insurance Accepted        0
Walk-Ins Accepted         0
Types of Vaccine          0
In Stock                  0
Last Updated              0
Latitude                  8
Longitude                 8
dtype: int64

Locations of NaN values:
        Location Name  Phone Number  Street Address   City  State  Zip Code  \
0               False         False           False  False  False      True   
1               False         False           False  False  False      True   
2               False         False           False  False  False      True   
3               False         False           False  False  False     False   
4               False         False           False  False  False     False   
...               ...           ...             ...    ... 

In [17]:
zipcode_drop_df = vaccine_providers_df.drop(columns=['Zip Code'])
zipcode_drop_df.drop_duplicates(inplace=True)
print(zipcode_drop_df)

                    Location Name  Phone Number                Street Address  \
0        CVS Pharmacy, Inc. #5340  504-737-6242          9643-B JEFFERSON HWY   
1        CVS Pharmacy, Inc. #2113  215-836-1323         1600 WADSWORTH AVENUE   
2       CVS Pharmacy, Inc. #16504  619-315-0016               2911 JAMACHA RD   
3              Rite Aid #RA105255  253-851-6939     4818 POINT FOSDICK DR.,NW   
4            Walmart Inc #10-1447  715-392-9520                3705 Tower Ave   
...                           ...           ...                           ...   
135780      SAV-ON PHARMACY #3739  215-942-4894               105 E STREET RD   
135781         Rite Aid #RA101653  603-532-6955        14 PETERBOROUGH STREET   
135782   CVS Pharmacy, Inc. #4378  540-989-6633           4400 BRAMBLETON AVE   
135783         Rite Aid #RA101963  724-438-9799      262 CONNELLSVILLE STREET   
135784        VONS PHARMACY #1962  951-695-7273  29530 RANCHO CALIFORNIA ROAD   

                        Cit

In [15]:
nozipcode_latlng_dropna_df = zipcode_drop_df.dropna()

# Display the cleaned DataFrame
print("\nDataFrame after removing rows with null values:")
print(nozipcode_latlng_dropna_df)


DataFrame after removing rows with null values:
                    Location Name  Phone Number                Street Address  \
0        CVS Pharmacy, Inc. #5340  504-737-6242          9643-B JEFFERSON HWY   
1        CVS Pharmacy, Inc. #2113  215-836-1323         1600 WADSWORTH AVENUE   
2       CVS Pharmacy, Inc. #16504  619-315-0016               2911 JAMACHA RD   
3              Rite Aid #RA105255  253-851-6939     4818 POINT FOSDICK DR.,NW   
4            Walmart Inc #10-1447  715-392-9520                3705 Tower Ave   
...                           ...           ...                           ...   
135780      SAV-ON PHARMACY #3739  215-942-4894               105 E STREET RD   
135781         Rite Aid #RA101653  603-532-6955        14 PETERBOROUGH STREET   
135782   CVS Pharmacy, Inc. #4378  540-989-6633           4400 BRAMBLETON AVE   
135783         Rite Aid #RA101963  724-438-9799      262 CONNELLSVILLE STREET   
135784        VONS PHARMACY #1962  951-695-7273  29530 RANCH

In [16]:
#Save DataFrame to a csv
nozipcode_latlng_dropna_df.to_csv("nozipcode.csv")

In [None]:
vaccine_providers_dropna_df = vaccine_providers_df.dropna()

# Display the cleaned DataFrame
print("\nDataFrame after removing rows with null values:")
print(vaccine_providers_dropna_df)

## Storing of Data in MongoDB

In the terminal, import data from csv using the following code: 
mongoImport --type csv -d vaccines_db -c vaccine_providers --headerline --drop Vaccine Providers.csv

In [None]:
# Module used to connect Python with MongoDB
from pymongo import MongoClient

In [None]:
# The default port used by MongoDB is 27017
# https://docs.mongodb.com/manual/reference/default-mongodb-port/
mongo = MongoClient(port=27017)

# Define the 'vaccines_db' database in Mongo
db = mongo.vaccines_db

# Declare the collection
providers = db.vaccine_providers

In [None]:
#Check that the data has been stored by making a query from the collection
location = providers.find()

print(location[0])