# Data Transformation

## US Clinical Provider 2009

**Source**: National Plan and Provider Enumeration System (NPPES) | NBER

**Data set**: Provider.csv
        

In [1]:
# Import necessary libraries
import pandas as pd
import re
from csv import QUOTE_ALL

* Filter only the necessary data

In [2]:
# Read csv file
provider = pd.read_csv('data\Extract2009-FullDoubleQuotes\Provider.csv')

# Filter data only for US
data = provider[(provider['Country']=='US')]
print(data.shape)
display(data)

(7541570, 6)


Unnamed: 0,Provider,Individual,Zip,City,State,Country
0,1003000100,True,900053200,LOS ANGELES,CA,US
1,1003000118,False,285602304,NEW BERN,NC,US
2,1003000126,True,208171841,BETHESDA,MD,US
3,1003000134,True,602011718,EVANSTON,IL,US
4,1003000142,True,436233536,TOLEDO,OH,US
...,...,...,...,...,...,...
7794256,1992999858,True,136025438,FORT DRUM,NY,US
7794257,1992999866,True,331624505,NORTH MIAMI BEACH,FL,US
7794258,1992999874,True,232270000,RICHMOND,VA,US
7794259,1992999882,True,187640999,WILKES BARRE,PA,US


**NOTE**: The original shape of data frame is 
    7541570 rows by 6 columns

* Check special character in Country, State and City field of Provider.csv file

In [3]:
# Function to check special character from given text
def check_special_char(text):
    special_char ="".join(ch for ch in text if (ch.isalnum()==False and ch.isspace()==False))
    if len(special_char) > 0:
        return special_char
    else:
        return None

# Test
print(check_special_char('\/WALDEN'))
print(check_special_char('LOS ANGELES	'))

data.loc[0:,'SpecialChar'] = data.loc[:,'City'].astype(str).apply(check_special_char)
print('City having special character:', data.loc[data['SpecialChar'].isna()==False].count())
data.loc[data['SpecialChar'].isna()==False]

\/
None


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[0:,'SpecialChar'] = data.loc[:,'City'].astype(str).apply(check_special_char)


City having special character: Provider       8720
Individual     8720
Zip            8720
City           8720
State          8720
Country        8720
SpecialChar    8720
dtype: int64


Unnamed: 0,Provider,Individual,Zip,City,State,Country,SpecialChar
1585,1003015975,True,631416340,ST. LOUIS,MO,US,.
3342,1003033572,True,631310000,ST. LOUIS,MO,US,.
3808,1003038241,False,551040000,ST. PAUL,MN,US,.
3827,1003038431,True,857074405,DAVIS-MONTHAN AFB,AZ,US,-
4313,1003043290,False,601180000,W. DUNDEE,IL,US,.
...,...,...,...,...,...,...,...
7789947,1992956684,True,333080000,FT. LAUDERDALE,FL,US,.
7790346,1992960686,False,967920000,WAI'ANAE,HI,US,'
7790475,1992961973,False,329040000,W. MELBOURNE,FL,US,.
7792696,1992984231,True,554162699,ST. LOUIS PARK,MN,US,.


There are total of **8720 records** that has special character

* Count occurrence of each special character

In [4]:
special_char = data.groupby('SpecialChar').agg({'City': 'count'}).reset_index().sort_values(by='City', ascending=False)
special_char

Unnamed: 0,SpecialChar,City
21,.,5802
16,-,1516
10,",",680
2,',428
25,..,80
18,--,41
5,(),32
29,/,25
1,&,21
22,.',14


* Get the list of special characters

In [5]:
special_chars = special_char['SpecialChar'].str.cat()
special_chars_list = ''.join(set(special_chars))
print('Number of special characters: {}'.format(len(special_chars_list)))
special_chars_list

Number of special characters: 11


"/#.(;':&)-,"

In [6]:
# Function to check text in original City field and print details
def check_city(data, text, rows_to_display=5):
    test_df = data[data['Original_City'].str.contains(text, regex=False, na=False)]
    print('\nCity having {}:\n{}'.format(text, test_df.head(rows_to_display)))

# Function to clean special character in City and print details
def clean_city(data, old_value, new_value, regexFlag):
    if regexFlag:
        data['City'] = data['City'].replace(to_replace=old_value, regex=regexFlag, value=new_value)
    else:
        data['City'] = data['City'].str.replace(old_value,new_value, regex=regexFlag)

* Copy City to a new column 'Original_City'

In [7]:
data['Original_City'] = data['City']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Original_City'] = data['City']


### Clean and Transform special character

1. Check [.]
    * Check City with [.]

In [8]:
# Adjust display width
pd.set_option('display.width', 1000)

# Investigate special character format
special_char = '.'
print('Special Character:', special_char)

check_city(data, special_char)

check_city(data, 'LOUIS', 5)

check_city(data, 'DUNDEE', 5)

Special Character: .

City having .:
        Provider  Individual        Zip          City State Country SpecialChar Original_City
1585  1003015975        True  631416340     ST. LOUIS    MO      US           .     ST. LOUIS
3342  1003033572        True  631310000     ST. LOUIS    MO      US           .     ST. LOUIS
3808  1003038241       False  551040000      ST. PAUL    MN      US           .      ST. PAUL
4313  1003043290       False  601180000     W. DUNDEE    IL      US           .     W. DUNDEE
5474  1003054925        True  422230000  FT. CAMPBELL    KY      US           .  FT. CAMPBELL

City having LOUIS:
       Provider  Individual        Zip         City State Country SpecialChar Original_City
80   1003000902        True  402121033   LOUISVILLE    KY      US        None    LOUISVILLE
140  1003001504        True  631390000  SAINT LOUIS    MO      US        None   SAINT LOUIS
325  1003003351       False  402455291   LOUISVILLE    KY      US        None    LOUISVILLE
337  100300

1. Clean [.]
    * Removing [.] by replacing empty character -> []
    * 'ST ' -> 'SAINT '
    * 'FT CAMPBELL' -> 'FORT CAMPBELL'
    * 'W DUNDEE' -> 'WEST DUNDEE'
    * 'E DUNDEE' -> 'EAST DUNDEE'
    * 'N DUNDEE' -> 'NORTH DUNDEE'
    * 'S DUNDEE' -> 'SOUTH DUNDEE'

In [9]:
special_char = '.'
clean_city(data, special_char, '', False)

# ST  -> SAINT 
clean_city(data, '\s*ST\s+', 'SAINT ', True)

# FT CAMPBELL -> FORT CAMPBELL
clean_city(data, '\s*FT\s+CAMPBELL', 'FORT CAMPBELL', True)

# W DUNDEE -> WEST DUNDEE
clean_city(data, '\s*W\s+DUNDEE', 'WEST DUNDEE', True)
clean_city(data, '\s*E\s+DUNDEE', 'EAST DUNDEE', True)
clean_city(data, '\s*N\s+DUNDEE', 'NORTH DUNDEE', True)
clean_city(data, '\s*S\s+DUNDEE', 'SOUTH DUNDEE', True)

print('Check .')
check_city(data, special_char)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['City'] = data['City'].str.replace(old_value,new_value, regex=regexFlag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['City'] = data['City'].replace(to_replace=old_value, regex=regexFlag, value=new_value)


Check .

City having .:
        Provider  Individual        Zip           City State Country SpecialChar Original_City
1585  1003015975        True  631416340    SAINT LOUIS    MO      US           .     ST. LOUIS
3342  1003033572        True  631310000    SAINT LOUIS    MO      US           .     ST. LOUIS
3808  1003038241       False  551040000     SAINT PAUL    MN      US           .      ST. PAUL
4313  1003043290       False  601180000    WEST DUNDEE    IL      US           .     W. DUNDEE
5474  1003054925        True  422230000  FORT CAMPBELL    KY      US           .  FT. CAMPBELL


2. Check [-]
    * Check City with [-]

In [10]:
# Investigate special character format
special_char = '-'
print('Special Character:', special_char)

check_city(data, special_char)

check_city(data, 'MONTHAN')

Special Character: -

City having -:
         Provider  Individual        Zip                       City State Country SpecialChar              Original_City
3827   1003038431        True  857074405          DAVIS-MONTHAN AFB    AZ      US           -          DAVIS-MONTHAN AFB
10126  1003101551        True  236652040  JOINT BASE LANGLEY-EUSTIS    VA      US           -  JOINT BASE LANGLEY-EUSTIS
14162  1003142076        True  271012933              WINSTON-SALEM    NC      US           -              WINSTON-SALEM
21332  1003213919        True  187020000               WILKES-BARRE    PA      US           -               WILKES-BARRE
26374  1003264508        True  782365313              JBSA-LACKLAND    TX      US           -              JBSA-LACKLAND

City having MONTHAN:
          Provider  Individual        Zip               City State Country SpecialChar      Original_City
3827    1003038431        True  857074405  DAVIS-MONTHAN AFB    AZ      US           -  DAVIS-MONTHAN AFB
947

2. Clean [-]
    * Remove [-] by replacing space character -> [ ]

In [11]:
special_char = '-'
clean_city(data, special_char, ' ', False)

check_city(data, special_char)

check_city(data, 'MONTHAN')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['City'] = data['City'].str.replace(old_value,new_value, regex=regexFlag)



City having -:
         Provider  Individual        Zip                       City State Country SpecialChar              Original_City
3827   1003038431        True  857074405          DAVIS MONTHAN AFB    AZ      US           -          DAVIS-MONTHAN AFB
10126  1003101551        True  236652040  JOINT BASE LANGLEY EUSTIS    VA      US           -  JOINT BASE LANGLEY-EUSTIS
14162  1003142076        True  271012933              WINSTON SALEM    NC      US           -              WINSTON-SALEM
21332  1003213919        True  187020000               WILKES BARRE    PA      US           -               WILKES-BARRE
26374  1003264508        True  782365313              JBSA LACKLAND    TX      US           -              JBSA-LACKLAND

City having MONTHAN:
          Provider  Individual        Zip               City State Country SpecialChar      Original_City
3827    1003038431        True  857074405  DAVIS MONTHAN AFB    AZ      US           -  DAVIS-MONTHAN AFB
94789   1013168418      

3. Check [()]
    * Check City with [()]

In [12]:
# Investigate special character format
special_char = '('
print('Special Character:', special_char)

check_city(data, '(')

check_city(data, 'AMBLER')

# Investigate special character format
special_char = ')'
print('Special Character:', special_char)

Special Character: (

City having (:
           Provider  Individual        Zip                      City State Country SpecialChar             Original_City
201220   1023455094        True  190020000     AMBLER (LOWER GWYNED)    PA      US          ()     AMBLER (LOWER GWYNED)
547603   1073023347        True  180460000  EASAINT TEXAS (MACUNGIE)    PA      US          ()     EAST TEXAS (MACUNGIE)
597213   1073737458       False  328190000     ORLANDO (TURKEY LAKE)    FL      US          ()     ORLANDO (TURKEY LAKE)
774623   1093951956       False  321590000  LADY LAKE (THE VILLAGES)    FL      US          ()  LADY LAKE (THE VILLAGES)
1276351  1164514873       False  787530000               AUSTIN (NW)    TX      US          ()               AUSTIN (NW)

City having AMBLER:
          Provider  Individual        Zip    City State Country SpecialChar Original_City
11693   1003117342       False  190024425  AMBLER    PA      US        None        AMBLER
105851  1013279389       False  1900

3. Clean [()]
    * Remove [()] including the text within the parenthesis

In [13]:
# Remove () including anything inside the parenthesis
clean_city(data, '\s*\(.+\)', '', True)

# Delete ( special character
special_char = '('
clean_city(data, special_char, '', False)

# Delete ) special character
special_char = ')'
clean_city(data, special_char, '', False)

check_city(data, special_char)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['City'] = data['City'].replace(to_replace=old_value, regex=regexFlag, value=new_value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['City'] = data['City'].str.replace(old_value,new_value, regex=regexFlag)



City having ):
           Provider  Individual        Zip           City State Country SpecialChar             Original_City
201220   1023455094        True  190020000         AMBLER    PA      US          ()     AMBLER (LOWER GWYNED)
547603   1073023347        True  180460000  EASAINT TEXAS    PA      US          ()     EAST TEXAS (MACUNGIE)
597213   1073737458       False  328190000        ORLANDO    FL      US          ()     ORLANDO (TURKEY LAKE)
774623   1093951956       False  321590000      LADY LAKE    FL      US          ()  LADY LAKE (THE VILLAGES)
1276351  1164514873       False  787530000         AUSTIN    TX      US          ()               AUSTIN (NW)


4. Check [;]
    * Check City with [;]

In [14]:
# Investigate special character format
special_char = ';'
print('Special Character:', special_char)

check_city(data, special_char, 10)

Special Character: ;

City having ;:
           Provider  Individual        Zip                 City State Country SpecialChar     Original_City
175626   1023197456        True  142630000              BUFFA;P    NY      US           ;           BUFFA;P
1565218  1205063856       False  300220000           A;PHARETTA    GA      US           ;        A;PHARETTA
2287088  1295270676        True  949010000         ; SAN RAFAEL    CA      US           ;      ; SAN RAFAEL
3353173  1437112166        True  483232184  WESAINT BLOOMFIEL;D    MI      US           ;  WEST BLOOMFIEL;D
3468088  1447583109        True  960800000            RED B;UFF    CA      US           ;         RED B;UFF
5166101  1669427183        True  276148292              RA;EIGH    NC      US           ;           RA;EIGH


4. Clean [;]
    * Remove [;] -> []

In [15]:
# Delete ; special character
special_char = ';'
clean_city(data, special_char, '', False)

check_city(data, special_char)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['City'] = data['City'].str.replace(old_value,new_value, regex=regexFlag)



City having ;:
           Provider  Individual        Zip                City State Country SpecialChar     Original_City
175626   1023197456        True  142630000              BUFFAP    NY      US           ;           BUFFA;P
1565218  1205063856       False  300220000           APHARETTA    GA      US           ;        A;PHARETTA
2287088  1295270676        True  949010000          SAN RAFAEL    CA      US           ;      ; SAN RAFAEL
3353173  1437112166        True  483232184  WESAINT BLOOMFIELD    MI      US           ;  WEST BLOOMFIEL;D
3468088  1447583109        True  960800000            RED BUFF    CA      US           ;         RED B;UFF


5. Check [&]
    * Check City with [&]

In [16]:
# Investigate special character format
special_char = '&'
print('Special Character:', special_char)

check_city(data, special_char)

check_city(data, 'TOWN AND COUNTRY')

Special Character: &

City having &:
           Provider  Individual        Zip            City State Country SpecialChar   Original_City
311834   1043206584       False  630175812  TOWN & COUNTRY    MO      US           &  TOWN & COUNTRY
370740   1043799844        True  631311640  TOWN & COUNTRY    MO      US           &  TOWN & COUNTRY
1331535  1174069900       False  630170000  TOWN & COUNTRY    MO      US           &  TOWN & COUNTRY
1360636  1174577803       False  630170000  TOWN & COUNTRY    MO      US           &  TOWN & COUNTRY
1565775  1205069432       False  631310000  TOWN & COUNTRY    MO      US           &  TOWN & COUNTRY

City having TOWN AND COUNTRY:
          Provider  Individual        Zip              City State Country SpecialChar     Original_City
136155  1013585033        True  630175734  TOWN AND COUNTRY    MO      US        None  TOWN AND COUNTRY
144108  1013669944       False  630178400  TOWN AND COUNTRY    MO      US        None  TOWN AND COUNTRY
249602  103325

5. Clean [&]
    * Replace [&] -> 'AND'

In [17]:
# Replace & with AND
special_char = '&'
clean_city(data, special_char, 'AND', False)

check_city(data, special_char, 10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['City'] = data['City'].str.replace(old_value,new_value, regex=regexFlag)



City having &:
           Provider  Individual        Zip              City State Country SpecialChar   Original_City
311834   1043206584       False  630175812  TOWN AND COUNTRY    MO      US           &  TOWN & COUNTRY
370740   1043799844        True  631311640  TOWN AND COUNTRY    MO      US           &  TOWN & COUNTRY
1331535  1174069900       False  630170000  TOWN AND COUNTRY    MO      US           &  TOWN & COUNTRY
1360636  1174577803       False  630170000  TOWN AND COUNTRY    MO      US           &  TOWN & COUNTRY
1565775  1205069432       False  631310000  TOWN AND COUNTRY    MO      US           &  TOWN & COUNTRY
1850289  1235678186        True  631310000  TOWN AND COUNTRY    MO      US           &  TOWN & COUNTRY
2507315  1326130345       False  630170000  TOWN AND COUNTRY    MO      US           &  TOWN & COUNTRY
2753332  1356465587        True  631310000  TOWN AND COUNTRY    MO      US           &  TOWN & COUNTRY
2761178  1356544209        True  630170000  TOWN AND COUN

6. Check [:]
    * Check City with [:]

In [18]:
# Investigate special character format
special_char = ':'
print('Special Character:', special_char)

check_city(data, special_char)

Special Character: :

City having ::
           Provider  Individual        Zip          City State Country SpecialChar Original_City
6459163  1821689753        True  444710000  1: STRUTHERS    OH      US           :  1: STRUTHERS


6. Clean [:]
    * Since we have only one record, we can decide that [:] can be removed including the text before it

In [19]:
# Remove : including anything before the colon
clean_city(data, '.+:\s*', '', True)

check_city(data, special_char)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['City'] = data['City'].replace(to_replace=old_value, regex=regexFlag, value=new_value)



City having ::
           Provider  Individual        Zip       City State Country SpecialChar Original_City
6459163  1821689753        True  444710000  STRUTHERS    OH      US           :  1: STRUTHERS


7. Check [/]
    * Check City with [/]

In [20]:
# Investigate special character format
special_char = '/'
print('Special Character:', special_char)

check_city(data, special_char, 10)

Special Character: /

City having /:
           Provider  Individual        Zip                    City State Country SpecialChar           Original_City
312389   1043212186        True  973940000               /WALDPORT    OR      US           /               /WALDPORT
531440   1063860211        True  968535399        JBPHH/HICKAM AFB    HI      US           /        JBPHH/HICKAM AFB
934111   1114986932       False  761270000  NAVAL AIR STATION/ JRB    TX      US           /  NAVAL AIR STATION/ JRB
1016357  1134129661        True  761271133  NAVAL AIR STATION/ JRB    TX      US           /  NAVAL AIR STATION/ JRB
1385336  1174825582        True  174030000   LEADER HEIGHTS / YORK    PA      US           /   LEADER HEIGHTS / YORK
1441107  1184600744        True  761271133  NAVAL AIR STATION/ JRB    TX      US           /  NAVAL AIR STATION/ JRB
1484355  1194035345        True  115800000        N/ VALLEY STREAM    NY      US          /.       N/. VALLEY STREAM
1614465  1205574639        

7. Clean [/]

From the data, [/] seems was used as delimeter for multiple cities.

* Clean spaces before and after / to eliminate unwanted space to the individual City when we split the City by /.
* Clean [/] at the beginning of City
* Clean [/] at the end of City
* Get the first item of split cities 
    From our SMS Prof Aaron, usually the major city is stated in the first City.

In [21]:
# Clean spaces around /
special_char = '/'
clean_city(data, '\s+/', '/', True)
clean_city(data, '/\s+', '/', True)
# Delete / at the beginning of city
clean_city(data, '^\s*/', '', True)
# Delete / at the end of city
clean_city(data, '/\s*$', '', True)

# Get first item from multiple cities
data['City'] = data['City'].str.split('/').str[0]

check_city(data, special_char)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['City'] = data['City'].replace(to_replace=old_value, regex=regexFlag, value=new_value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['City'] = data['City'].str.split('/').str[0]



City having /:
           Provider  Individual        Zip               City State Country SpecialChar           Original_City
312389   1043212186        True  973940000           WALDPORT    OR      US           /               /WALDPORT
531440   1063860211        True  968535399              JBPHH    HI      US           /        JBPHH/HICKAM AFB
934111   1114986932       False  761270000  NAVAL AIR STATION    TX      US           /  NAVAL AIR STATION/ JRB
1016357  1134129661        True  761271133  NAVAL AIR STATION    TX      US           /  NAVAL AIR STATION/ JRB
1385336  1174825582        True  174030000     LEADER HEIGHTS    PA      US           /   LEADER HEIGHTS / YORK


8. Check [,]
    * Check City with [,]

In [22]:
# Investigate special character format
special_char = ','
check_city(data, special_char)


City having ,:
         Provider  Individual        Zip                City State Country SpecialChar       Original_City
16736  1003167867        True  017026388         FRAMINGHAM,    MA      US           ,         FRAMINGHAM,
22422  1003224924       False  334460000    DELRAY BEACH, FL    FL      US           ,    DELRAY BEACH, FL
25018  1003250929        True  851320000           FLORENCE,    AZ      US           ,           FLORENCE,
43388  1003436783        True  103050000  STATEN ISLAND, NYC    NY      US           ,  STATEN ISLAND, NYC
57514  1003583667        True  067080000          WATERBURY,    CT      US           ,          WATERBURY,


8. The , as it was used as delimeter for complete address including the street.
    * City sometimes appear before , like 'PLAINVILLE, CT 06062'
    * But in some record, city appear after , like '150,BERGEN STREET,NEWARK'
    * Clean spaces before and after , to eliminate unwanted space to the individual City when we split the City by ,.
    * Clean [,] at the beginning of City
    * Clean [,] at the end of City
    * Get the first item of split Cities

In [23]:
# Clean space around ,
clean_city(data, '\s*,\s*', ',', True)
# Delete , at the beginning of city
clean_city(data, '^\s*,', '', True)
# Delete , at the end of city
clean_city(data, ',\s*$', '', True)
print('Special Character:', special_char)

# Get first item from multiple cities
data['City'] = data['City'].str.split(',').str[0]

check_city(data, special_char, 10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['City'] = data['City'].replace(to_replace=old_value, regex=regexFlag, value=new_value)


Special Character: ,


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['City'] = data['City'].str.split(',').str[0]



City having ,:
          Provider  Individual        Zip           City State Country SpecialChar             Original_City
16736   1003167867        True  017026388     FRAMINGHAM    MA      US           ,               FRAMINGHAM,
22422   1003224924       False  334460000   DELRAY BEACH    FL      US           ,          DELRAY BEACH, FL
25018   1003250929        True  851320000       FLORENCE    AZ      US           ,                 FLORENCE,
43388   1003436783        True  103050000  STATEN ISLAND    NY      US           ,        STATEN ISLAND, NYC
57514   1003583667        True  067080000      WATERBURY    CT      US           ,                WATERBURY,
58479   1003803347        True  303421736        ATLANTA    GA      US           ,                  ATLANTA,
104920  1013270040        True  104570000          BRONX    NY      US           ,                    BRONX,
115947  1013381524        True  060620000     PLAINVILLE    NY      US           ,      PLAINVILLE, CT 06062
127

9. Check [#]
    * Check City with [#]

In [24]:
# Investigate special character format
special_char = '#'
print('Special Character:', special_char)

check_city(data, special_char)

Special Character: #

City having #:
           Provider  Individual        Zip              City State Country SpecialChar     Original_City
1055038  1134518798       False  325420000  EGLIN AFB FLD #3    FL      US           #  EGLIN AFB FLD #3


9. Fix [#]
    * Since we have only one record, we can decide that we can remove [#] including the text after it and the right city name for this is EGLIN AFB only

In [25]:
# Remove # including anything after the #
clean_city(data, '\s*#.*', '', True)
clean_city(data, 'EGLIN AFB FLD', 'EGLIN AFB', False)

check_city(data, special_char)

check_city(data, 'EGLIN')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['City'] = data['City'].replace(to_replace=old_value, regex=regexFlag, value=new_value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['City'] = data['City'].str.replace(old_value,new_value, regex=regexFlag)



City having #:
           Provider  Individual        Zip       City State Country SpecialChar     Original_City
1055038  1134518798       False  325420000  EGLIN AFB    FL      US           #  EGLIN AFB FLD #3

City having EGLIN:
         Provider  Individual        Zip       City State Country SpecialChar Original_City
22735  1003228057        True  325421302  EGLIN AFB    FL      US        None     EGLIN AFB
63122  1003850496        True  325421302  EGLIN AFB    FL      US        None     EGLIN AFB
79676  1013016773       False  325421302  EGLIN AFB    FL      US        None     EGLIN AFB
86760  1013087873        True  325421302  EGLIN AFB    FL      US        None     EGLIN AFB
87078  1013091073        True  325421282  EGLIN AFB    FL      US        None     EGLIN AFB


10. Check [']
    * Check City with [']

In [26]:
# Investigate special character format
special_char = '\''
print('Special Character:', special_char)

check_city(data, special_char, 10)

Special Character: '

City having ':
          Provider  Individual        Zip            City State Country SpecialChar   Original_City
35256   1003355298        True  838142668   COEUR D'ALENE    ID      US           '   COEUR D'ALENE
48252   1003485624        True  633660000        O'FALLON    MO      US           '        O'FALLON
61916   1003838400        True  244776712  STUART'S DRAFT    VA      US           '  STUART'S DRAFT
75535   1003975087       False  838142601   COEUR D'ALENE    ID      US           '   COEUR D'ALENE
83397   1013054170        True  633660000        O'FALLON    MO      US           '        O'FALLON
127510  1013498302        True  633660000        O'FALLON    MO      US           '        O'FALLON
131051  1013533819        True  622690000        O'FALLON    IL      US           '        O'FALLON
144367  1013672534        True  838140000   COEUR D'ALENE    ID      US           '   COEUR D'ALENE
183067  1023272127       False  499468125          L'ANSE    MI

10. Clean ['] 
    * Remove ['] ->[]

In [27]:
# Remove [']
clean_city(data, '\'', '', False)

check_city(data, '\'')

check_city(data, 'COEUR DALENE')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['City'] = data['City'].str.replace(old_value,new_value, regex=regexFlag)



City having ':
         Provider  Individual        Zip           City State Country SpecialChar   Original_City
35256  1003355298        True  838142668   COEUR DALENE    ID      US           '   COEUR D'ALENE
48252  1003485624        True  633660000        OFALLON    MO      US           '        O'FALLON
61916  1003838400        True  244776712  STUARTS DRAFT    VA      US           '  STUART'S DRAFT
75535  1003975087       False  838142601   COEUR DALENE    ID      US           '   COEUR D'ALENE
83397  1013054170        True  633660000        OFALLON    MO      US           '        O'FALLON

City having COEUR DALENE:
           Provider  Individual        Zip          City State Country SpecialChar Original_City
323089   1043320179        True  838150000  COEUR DALENE    ID      US        None  COEUR DALENE
588389   1073648945        True  838140000  COEUR DALENE    ID      US        None  COEUR DALENE
3682504  1477186138        True  838140000  COEUR DALENE    ID      US        

* Remove word CITY from City field

Some city values has 'CITY' suffix that makes it eliminate in matching records

In [28]:
# Remove [ CITY]
check_city(data, ' CITY', 10)

clean_city(data, ' CITY', '', False)

check_city(data, ' CITY')
check_city(data, 'NEW YORK')
check_city(data, 'NEW YORK CITY')


City having  CITY:
       Provider  Individual        Zip           City State Country SpecialChar  Original_City
31   1003000415        True  320251607      LAKE CITY    FL      US        None      LAKE CITY
47   1003000571       False  641282515    KANSAS CITY    MO      US        None    KANSAS CITY
190  1003002007        True  940621303   REDWOOD CITY    CA      US        None   REDWOOD CITY
231  1003002411        True  210433383  ELLICOTT CITY    MD      US        None  ELLICOTT CITY
373  1003003831       False  731182235  OKLAHOMA CITY    OK      US        None  OKLAHOMA CITY
388  1003003989        True  897031541    CARSON CITY    NV      US        None    CARSON CITY
394  1003004045        True  940631510   REDWOOD CITY    CA      US        None   REDWOOD CITY
521  1003005315        True  496846941  TRAVERSE CITY    MI      US        None  TRAVERSE CITY
733  1003007436        True  684101930  NEBRASKA CITY    NE      US        None  NEBRASKA CITY
764  1003007741        True  4

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['City'] = data['City'].str.replace(old_value,new_value, regex=regexFlag)



City having  CITY:
       Provider  Individual        Zip      City State Country SpecialChar  Original_City
31   1003000415        True  320251607      LAKE    FL      US        None      LAKE CITY
47   1003000571       False  641282515    KANSAS    MO      US        None    KANSAS CITY
190  1003002007        True  940621303   REDWOOD    CA      US        None   REDWOOD CITY
231  1003002411        True  210433383  ELLICOTT    MD      US        None  ELLICOTT CITY
373  1003003831       False  731182235  OKLAHOMA    OK      US        None  OKLAHOMA CITY

City having NEW YORK:
       Provider  Individual        Zip      City State Country SpecialChar Original_City
262  1003002726        True  100161032  NEW YORK    NY      US        None      NEW YORK
270  1003002809        True  100118664  NEW YORK    NY      US        None      NEW YORK
460  1003004706       False  100352745  NEW YORK    NY      US        None      NEW YORK
527  1003005372        True  100654870  NEW YORK    NY      U

* Remove spaces around City column value

In [29]:
data['City'] = data['City'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['City'] = data['City'].str.strip()


### Delete SpecialChar column

In [30]:
data.drop(columns='SpecialChar', inplace=True)
data.columns

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns='SpecialChar', inplace=True)


Index(['Provider', 'Individual', 'Zip', 'City', 'State', 'Country', 'Original_City'], dtype='object')

### Save to new file Provider_clean.csv

In [31]:
data.to_csv(
    'Provider_clean.csv',
    index=False,
    quoting=QUOTE_ALL
)

### Summary of cleaning City
* .();:#' > delete
* \- > space
* & > AND
* / use as delimiter of multiple cities and get only the first item
* , use as delimiter of multiple details such as street and city and get only the first item
* Delete CITY word  for consistency