# Regex In Pandas

In [1]:
import pandas as pd

In [2]:
# Data source:
# https://data.medicare.gov/Nursing-Home-Compare/Skilled-Nursing-Facility-Quality-Reporting-Program/fykj-qjee
# Index each row by (facility, measure). The index columns are chosen by their
# header labels rather than by position (previously 0 and 10), so the intent is
# readable and a re-exported file with reordered columns still loads correctly.
df = pd.read_csv(
    'Skilled_Nursing_Facility_Quality_Reporting_Program_-_Provider_Data.csv',
    index_col=['CMS Certification Number (CCN)', 'Measure Code'],
)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Facility Name,Address Line 1,Address Line 2,City,State,Zip Code,County Name,Phone Number,CMS Region,Score,Footnote,no_QRP flag,Start Date,End Date,Location 1
CMS Certification Number (CCN),Measure Code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
235383,S_013_01_NUMERATOR,ST JOSEPH'S HEALTHCARE CENTER,9400 CONANT STREET,,HAMTRAMCK,MI,48212,Wayne,(313)874-4500,5,0,,,07/01/2017,06/30/2018,"9400 CONANT STREET HAMTRAMCK, MI 48212 (42.397..."
235383,S_006_01_MSPB_SCORE,ST JOSEPH'S HEALTHCARE CENTER,9400 CONANT STREET,,HAMTRAMCK,MI,48212,Wayne,(313)874-4500,5,1.09,,,10/01/2016,09/30/2017,"9400 CONANT STREET HAMTRAMCK, MI 48212 (42.397..."
235379,S_004_01_PPR_PD_COMP_PERF,MEDILODGE OF TAWAS CITY,400 NORTH STREET WEST,,TAWAS CITY,MI,48763,Iosco,(989)362-8645,5,Not Available,16.0,,10/01/2015,09/30/2017,"400 NORTH STREET WEST TAWAS CITY, MI 48763 (44..."
235414,S_005_01_DTC_RS_RATE,THE VILLA AT WEST BRANCH,445 S VALLEY ST,,WEST BRANCH,MI,48661,Ogemaw,(989)345-3600,5,54.69,,,10/01/2016,09/30/2017,"445 S VALLEY ST WEST BRANCH, MI 48661 (44.2726..."
235431,S_004_01_PPR_PD_COMP_PERF,SANCTUARY AT THE PARK,570 S HARVEY STREET,,MUSKEGON,MI,49442,Muskegon,(231)672-2202,5,Not Available,16.0,,10/01/2015,09/30/2017,"570 S HARVEY STREET MUSKEGON, MI 49442 (43.241..."


In [3]:
# Peek at the raw location strings before pulling anything out of them.
df['Location 1'].head()

CMS Certification Number (CCN)  Measure Code             
235383                          S_013_01_NUMERATOR           9400 CONANT STREET HAMTRAMCK, MI 48212 (42.397...
                                S_006_01_MSPB_SCORE          9400 CONANT STREET HAMTRAMCK, MI 48212 (42.397...
235379                          S_004_01_PPR_PD_COMP_PERF    400 NORTH STREET WEST TAWAS CITY, MI 48763 (44...
235414                          S_005_01_DTC_RS_RATE         445 S VALLEY ST WEST BRANCH, MI 48661 (44.2726...
235431                          S_004_01_PPR_PD_COMP_PERF    570 S HARVEY STREET MUSKEGON, MI 49442 (43.241...
Name: Location 1, dtype: object

In [4]:
# The .str accessor applies string operations to every value in the column;
# here it keeps only the last 25 characters of each location string.
df['Location 1'].str.slice(start=-25).head()

CMS Certification Number (CCN)  Measure Code             
235383                          S_013_01_NUMERATOR           2 (42.397239, -83.049168)
                                S_006_01_MSPB_SCORE          2 (42.397239, -83.049168)
235379                          S_004_01_PPR_PD_COMP_PERF    3 (44.272532, -83.520777)
235414                          S_005_01_DTC_RS_RATE         1 (44.272658, -84.230672)
235431                          S_004_01_PPR_PD_COMP_PERF    2 (43.241767, -86.204022)
Name: Location 1, dtype: object

In [5]:
# Many other string methods are available on the same accessor, e.g.
# df.loc[:, 'Location 1'].str.cat / .replace / .upper / .lower / .split / ...

In [6]:
# Pull the "(lat, long)" pair out of each location string. str.extract turns
# every named group into a column of the same name; rows with no match get NaN
# in both columns.
# Each group accepts only a signed decimal number. The looser `[^,]+` / `.+`
# groups would also capture arbitrary parenthesised text (e.g. a note inside an
# address), which would then break the later conversion to float.
pattern = r'\((?P<lat>[-+]?\d+(?:\.\d+)?),\s*(?P<long>[-+]?\d+(?:\.\d+)?)\)'
lat_long_df = df.loc[:, 'Location 1'].str.extract(pattern, expand=True)
lat_long_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,lat,long
CMS Certification Number (CCN),Measure Code,Unnamed: 2_level_1,Unnamed: 3_level_1
235383,S_013_01_NUMERATOR,42.397239,-83.049168
235383,S_006_01_MSPB_SCORE,42.397239,-83.049168
235379,S_004_01_PPR_PD_COMP_PERF,44.272532,-83.520777
235414,S_005_01_DTC_RS_RATE,44.272658,-84.230672
235431,S_004_01_PPR_PD_COMP_PERF,43.241767,-86.204022


In [7]:
# Inspect the extracted frame: the captured values come back as text, so both
# columns are strings (object dtype) rather than numbers.
lat_long_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 403988 entries, (235383, S_013_01_NUMERATOR) to (455484, S_004_01_PPR_PD_COMP_PERF)
Data columns (total 2 columns):
lat     374426 non-null object
long    374426 non-null object
dtypes: object(2)
memory usage: 7.4+ MB


In [8]:
# Let's cast to floats so the values can be used as lat/long in other operations.
# Only the converted copy is inspected here; lat_long_df is not reassigned.
lat_long_df.astype(float).info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 403988 entries, (235383, S_013_01_NUMERATOR) to (455484, S_004_01_PPR_PD_COMP_PERF)
Data columns (total 2 columns):
lat     374426 non-null float64
long    374426 non-null float64
dtypes: float64(2)
memory usage: 7.4+ MB


In [9]:
# We'll get to joining DataFrames soon. For now: attach the numeric lat/long
# columns to the original frame, matching rows on the shared index.
df2 = df.join(
    other=lat_long_df.astype(float),
)
df2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Facility Name,Address Line 1,Address Line 2,City,State,Zip Code,County Name,Phone Number,CMS Region,Score,Footnote,no_QRP flag,Start Date,End Date,Location 1,lat,long
CMS Certification Number (CCN),Measure Code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
235383,S_013_01_NUMERATOR,ST JOSEPH'S HEALTHCARE CENTER,9400 CONANT STREET,,HAMTRAMCK,MI,48212,Wayne,(313)874-4500,5,0,,,07/01/2017,06/30/2018,"9400 CONANT STREET HAMTRAMCK, MI 48212 (42.397...",42.397239,-83.049168
235383,S_006_01_MSPB_SCORE,ST JOSEPH'S HEALTHCARE CENTER,9400 CONANT STREET,,HAMTRAMCK,MI,48212,Wayne,(313)874-4500,5,1.09,,,10/01/2016,09/30/2017,"9400 CONANT STREET HAMTRAMCK, MI 48212 (42.397...",42.397239,-83.049168
235379,S_004_01_PPR_PD_COMP_PERF,MEDILODGE OF TAWAS CITY,400 NORTH STREET WEST,,TAWAS CITY,MI,48763,Iosco,(989)362-8645,5,Not Available,16.0,,10/01/2015,09/30/2017,"400 NORTH STREET WEST TAWAS CITY, MI 48763 (44...",44.272532,-83.520777
235414,S_005_01_DTC_RS_RATE,THE VILLA AT WEST BRANCH,445 S VALLEY ST,,WEST BRANCH,MI,48661,Ogemaw,(989)345-3600,5,54.69,,,10/01/2016,09/30/2017,"445 S VALLEY ST WEST BRANCH, MI 48661 (44.2726...",44.272658,-84.230672
235431,S_004_01_PPR_PD_COMP_PERF,SANCTUARY AT THE PARK,570 S HARVEY STREET,,MUSKEGON,MI,49442,Muskegon,(231)672-2202,5,Not Available,16.0,,10/01/2015,09/30/2017,"570 S HARVEY STREET MUSKEGON, MI 49442 (43.241...",43.241767,-86.204022


In [10]:
%matplotlib inline

import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap  
import numpy as np

KeyError: 'PROJ_LIB'

In [11]:
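# If importing Basemap fails with KeyError: 'PROJ_LIB', set the PROJ_LIB
# environment variable to the proj data directory before the import. See:
# https://github.com/matplotlib/basemap/issues/419#issuecomment-418155703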

In [12]:
# Draw every facility location as a small red dot on a map.
plt.figure(figsize=(15, 15))

# Miller projection ('mill'); the corner arguments bound the map to
# longitudes -125..-67 and latitudes 25..50.
m = Basemap(
    projection='mill',
    lat_0=36,
    lon_0=-95,
    llcrnrlon=-125,
    llcrnrlat=25,
    urcrnrlon=-67,
    urcrnrlat=50,
    resolution='l',
)
m.drawcoastlines(linewidth=0.25)
m.drawcountries(linewidth=0.25)

# Rows whose location string had no coordinates carry NaN and cannot be drawn,
# and a facility appears once per measure, so keep one valid point per place
# instead of projecting hundreds of thousands of repeated or empty rows.
coords = df2.loc[:, ['lat', 'long']].dropna().drop_duplicates()
lat = coords['lat'].values
long = coords['long'].values

# Calling the Basemap instance converts (longitude, latitude) into map x/y.
x, y = m(long, lat)

plt.scatter(
    x,
    y,
    marker='o',
    color='r',
    s=1,
)
plt.title('Skilled nursing facility locations')
plt.show()

NameError: name 'Basemap' is not defined

<Figure size 1080x1080 with 0 Axes>