# Python Data Cleaning

### Import Necesities

In [138]:
import pandas as pd
import numpy as np
import re

pd.set_option('display.max_columns', None)

### Load Data

In [139]:
raw_data = pd.read_csv('pythons.csv')

### Questions Data Will Answer

- Are sightings of Burmese pythons in the wild increasing? (Client wants an analysis with visualizations)

- Which three counties are most affected? (Analysis + visualization)

- Are there geographic hotspots? If so, can you estimate the likelihood that volunteers would find pythons if we sent patrols to those areas between today's date and Dec 31st? (Analysis + viz)

- When are people most likely to spot pythons, and why? (Analysis + viz)

- Are sightings cyclical? (Analysis + viz)

- How many python observations do you predict will be recorded for the full 2019 year? (time-series analysis)

- For purposes of developing a social media/citizen science campaign, the client would like to know:

 - Should we be using iNaturalist to get the public more engaged? (Back your answer with data)

 - Which iNaturalist users are most active in sighting pythons?

 - Which iNaturalist users are most active in identifying pythons? Hint: iNaturalist observations must be confirmed by other users...

 - Which iNaturalist users are most connected in the python-spotting community (i.e. who are the influencers)?

 - Is there overlap between the observers in the two datasets (i.e., are any wildlife officials also using iNaturalist)? (Hint: yes. Visualize it.)

### Check Data

In [140]:
raw_data.head()

Unnamed: 0,objectid,Reporter,ComName,SciName,OccStatus,ObsDate,DateAcc,DateEnt,DateUp,Location,Latitude,Longitude,Datum,CoordAcc,Method,DataType,LocalOwner,Habitat,Locality,Site,InfestAcre,GrossAcre,Abundance,Density,NumCollect,Percentcov,TreatArea,TreatComm,Quantity,QuantityU,TrapType,NumTraps,Comments,VisitType,CollectTme,Surveyor,RecSource,RecOwner,RecSrcTyp,OrigName,Nativity,Host,Host_Name,VerifyMthd,IDCred,Verified,Reviewer,ReviewDate,OrgSrcID,PID,Voucher,Museum,MuseumRec,Reference
0,8303498,Travis Mangione FWC,Burmese python,Python molurus ssp. bivittatus,Positive,24 Nov 2019,,26 Nov 2019,,"Miami-Dade, Florida, United States",25.76191,-80.74829,WGS84,,,,,,"Man found and killed the python ""just before t...",,,,,,,,,,,,,,Hunter found and killed python just outside of...,,,,,Unknown,Web Report,,Introduced,,,Reporter Expertise,Credible,Verified,FWCC Exotic Species Database,05 Dec 2019,,,0.0,,,
1,8303147,Edward F. Metzger III,Burmese python,Python molurus ssp. bivittatus,Positive,20 Nov 2019,,22 Nov 2019,22 Nov 2019,"Broward, Florida, United States",26.29298,-80.50567,WGS84,0.0,Google Maps GPS,,,,,,,,,,,,,,,,,,Yearling roadkill,,5.0,,,Unknown,Android,,Introduced,,,Photographs,Verified,Verified,FWCC Exotic Species Database,05 Dec 2019,,,,,,
2,8298008,Michael Reupert NPS Big Cypress National Pres...,Burmese python,Python molurus ssp. bivittatus,Positive,07 Nov 2019,,12 Nov 2019,,"Collier, Florida, United States",25.86434,-81.10691,WGS84,,,,,Edge: Roadside ...,Hy 41 south in BICY,,,,,,,,,,,,,,Dispatch ed by BICY Python agents,,,,,Unknown,Web Report,,Introduced,,,Reporter Expertise,Credible,Verified,FWCC Exotic Species Database,14 Nov 2019,,,0.0,HQ freezer ...,,
3,8295649,"matthew mccollister National Park Service, Big...",Burmese python,Python molurus ssp. bivittatus,Positive,06 Nov 2019,,07 Nov 2019,,"Collier, Florida, United States",25.83131,-80.90326,WGS84,,,,,,Big Cypress National Preserve,,,,,,,,,,,,,,"collected by T Hobbs, euthanized by NPS",,,,,Unknown,Web Report,,Introduced,,,Reporter Expertise,Credible,Verified,FWCC Exotic Species Database,08 Nov 2019,,,0.0,,,
4,8295647,"matthew mccollister National Park Service, Big...",Burmese python,Python molurus ssp. bivittatus,Positive,05 Nov 2019,,07 Nov 2019,,"Collier, Florida, United States",25.87197,-81.18223,WGS84,,,,,,Big Cypress National Preserve,,,,,,,,,,,,,,"female, collected by P Hobbs, euthanized by NPS",,,,,Unknown,Web Report,,Introduced,,,Reporter Expertise,Credible,Verified,FWCC Exotic Species Database,08 Nov 2019,,,0.0,,,


In [141]:
#Count values of columns to see value relvance

raw_data['Reviewer'].value_counts()

FWCC Exotic Species Database    671
Larry Connor, FWCC              176
Edward Mercer                     9
Tony Pernas                       5
Ashley Lawrence                   4
Larry Connor, FWCc                3
Jake Edwards                      2
Liz Barraco, FWCC                 2
Pat Howell                        2
Vanessa McDonough                 1
Karan A. Rawlins                  1
Name: Reviewer, dtype: int64

In [142]:
#Check rows where observations occured in Florida

raw_data[raw_data['Location'].str.contains('Florida')].head()

Unnamed: 0,objectid,Reporter,ComName,SciName,OccStatus,ObsDate,DateAcc,DateEnt,DateUp,Location,Latitude,Longitude,Datum,CoordAcc,Method,DataType,LocalOwner,Habitat,Locality,Site,InfestAcre,GrossAcre,Abundance,Density,NumCollect,Percentcov,TreatArea,TreatComm,Quantity,QuantityU,TrapType,NumTraps,Comments,VisitType,CollectTme,Surveyor,RecSource,RecOwner,RecSrcTyp,OrigName,Nativity,Host,Host_Name,VerifyMthd,IDCred,Verified,Reviewer,ReviewDate,OrgSrcID,PID,Voucher,Museum,MuseumRec,Reference
0,8303498,Travis Mangione FWC,Burmese python,Python molurus ssp. bivittatus,Positive,24 Nov 2019,,26 Nov 2019,,"Miami-Dade, Florida, United States",25.76191,-80.74829,WGS84,,,,,,"Man found and killed the python ""just before t...",,,,,,,,,,,,,,Hunter found and killed python just outside of...,,,,,Unknown,Web Report,,Introduced,,,Reporter Expertise,Credible,Verified,FWCC Exotic Species Database,05 Dec 2019,,,0.0,,,
1,8303147,Edward F. Metzger III,Burmese python,Python molurus ssp. bivittatus,Positive,20 Nov 2019,,22 Nov 2019,22 Nov 2019,"Broward, Florida, United States",26.29298,-80.50567,WGS84,0.0,Google Maps GPS,,,,,,,,,,,,,,,,,,Yearling roadkill,,5.0,,,Unknown,Android,,Introduced,,,Photographs,Verified,Verified,FWCC Exotic Species Database,05 Dec 2019,,,,,,
2,8298008,Michael Reupert NPS Big Cypress National Pres...,Burmese python,Python molurus ssp. bivittatus,Positive,07 Nov 2019,,12 Nov 2019,,"Collier, Florida, United States",25.86434,-81.10691,WGS84,,,,,Edge: Roadside ...,Hy 41 south in BICY,,,,,,,,,,,,,,Dispatch ed by BICY Python agents,,,,,Unknown,Web Report,,Introduced,,,Reporter Expertise,Credible,Verified,FWCC Exotic Species Database,14 Nov 2019,,,0.0,HQ freezer ...,,
3,8295649,"matthew mccollister National Park Service, Big...",Burmese python,Python molurus ssp. bivittatus,Positive,06 Nov 2019,,07 Nov 2019,,"Collier, Florida, United States",25.83131,-80.90326,WGS84,,,,,,Big Cypress National Preserve,,,,,,,,,,,,,,"collected by T Hobbs, euthanized by NPS",,,,,Unknown,Web Report,,Introduced,,,Reporter Expertise,Credible,Verified,FWCC Exotic Species Database,08 Nov 2019,,,0.0,,,
4,8295647,"matthew mccollister National Park Service, Big...",Burmese python,Python molurus ssp. bivittatus,Positive,05 Nov 2019,,07 Nov 2019,,"Collier, Florida, United States",25.87197,-81.18223,WGS84,,,,,,Big Cypress National Preserve,,,,,,,,,,,,,,"female, collected by P Hobbs, euthanized by NPS",,,,,Unknown,Web Report,,Introduced,,,Reporter Expertise,Credible,Verified,FWCC Exotic Species Database,08 Nov 2019,,,0.0,,,


In [143]:
#Review location data

raw_data['Location'].unique()

array(['Miami-Dade, Florida, United States',
       'Broward, Florida, United States',
       'Collier, Florida, United States',
       'Monroe, Florida, United States',
       'Palm Beach, Florida, United States',
       'Polk, Florida, United States', 'Orange, Florida, United States',
       'Manatee, Florida, United States',
       'Sarasota, Florida, United States',
       'Volusia, Florida, United States', 'Lee, Florida, United States',
       'Glades, Florida, United States',
       'Okeechobee, Florida, United States',
       'Hendry, Florida, United States',
       'Alachua, Florida, United States',
       'Knox, Tennessee, United States',
       'Monroe, New York, United States',
       'Hernando, Florida, United States',
       'Marion, Florida, United States', 'Pasco, Florida, United States',
       'Pinellas, Florida, United States',
       'Highlands, Florida, United States',
       'Osceola, Florida, United States',
       'Charlotte, Florida, United States',
       'St. 

In [144]:
#Check columns with null values

raw_data.isnull().sum()

objectid         0
Reporter         0
ComName          0
SciName          0
OccStatus        0
ObsDate          0
DateAcc       4793
DateEnt          0
DateUp        3614
Location         0
Latitude         0
Longitude        0
Datum            0
CoordAcc      4506
Method        2695
DataType      2479
LocalOwner    4763
Habitat       4236
Locality      2296
Site          4797
InfestAcre    4681
GrossAcre     4760
Abundance     4798
Density       4798
NumCollect    1401
Percentcov    4797
TreatArea     4798
TreatComm     4798
Quantity      4687
QuantityU     4687
TrapType      4798
NumTraps      4798
Comments      2211
VisitType     2711
CollectTme    4570
Surveyor      3246
RecSource     4798
RecOwner         0
RecSrcTyp        0
OrigName      2510
Nativity         0
Host          4798
Host_Name     4798
VerifyMthd      70
IDCred           4
Verified         0
Reviewer      3922
ReviewDate    3922
OrgSrcID      4798
PID           4798
Voucher       4313
Museum        2684
MuseumRec   

In [145]:
#Check column values

raw_data['Verified'].unique()

array(['Verified'], dtype=object)

In [146]:
#Create DataFrame only containing required columns

clean_data = raw_data[['Reporter','ObsDate','DateEnt','Location','Longitude','Latitude','RecOwner']]

### Clean Data Columns

In [147]:
raw_data['Reporter'].unique()

array(['Travis Mangione FWC', 'Edward F. Metzger III ',
       'Michael Reupert NPS  Big Cypress National Preserve',
       'matthew mccollister National Park Service, Big Cypress National Park',
       'Jeffrey Fobb Miami-Dade Fire Rescue', 'Joshua Sands ',
       'Matthew Bowser ', 'Christen Mason SFWMD', 'Sarah Norris ',
       'Don Marchetto ',
       'Amy Peters South Florida Water Management District',
       'Ashley Provo ',
       'Catherine Gelston Biscayne National Park - National Park Service',
       'Vanessa McDonough Biscayne National Park', 'Amy Siewe None yet',
       'Leah Miller Florida Bat Conservancy, Fakahatchee Strand Preserve SP & FL Panther Natl. Wildlife Refuge',
       'Mark Danaher US Fish and Wildlife Service', 'Joey Cabrera ',
       'David Shindle ', 'annette johnson nps',
       'Jake Travers Florida Fish and Wildlife Conservation Commission',
       'FWCC Exotic Species Database Florida Fish and Wildlife Conservation Commission',
       'Madison Harman U

In [148]:
#Create and clean first name column

clean_data['First_Name'] = clean_data['Reporter'].apply(lambda x: re.search('\w+ ',x).group(0).title().rstrip())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [149]:
#Check output of above code

clean_data['First_Name'].unique()

array(['Travis', 'Edward', 'Michael', 'Matthew', 'Jeffrey', 'Joshua',
       'Christen', 'Sarah', 'Don', 'Amy', 'Ashley', 'Catherine',
       'Vanessa', 'Leah', 'Mark', 'Joey', 'David', 'Annette', 'Jake',
       'Fwcc', 'Madison', 'Kevin', 'Cory', 'Wayne', 'Christopher',
       'Steve', 'Marsha', 'Jeremy', 'Trudy', 'Fred', 'Matt', 'Brian',
       'Deborah', 'Alexander', 'Anne', 'Bob', 'Jarek', 'Ozzie',
       'Elizabeth', 'Robert', 'Jesssica', 'Josh', 'Simon', 'Mathieu',
       'Ross', 'Meghan', 'Juan', 'Carlos', 'Tony', 'Jane', 'Bryan', 'Ian',
       'Jared', 'Ryan', 'Christina', 'Steven', 'Erin', 'Keith', 'Melody',
       'Mckenzie', 'Kurt', 'Norm', 'Megan', 'Andrew', 'Alex', 'Nicole',
       'Elena', 'Reece', 'Blake', 'Bill', 'Betsy', 'Chris', 'Mary',
       'Jesus', 'Caleb', 'Brooke', 'Julie', 'Lori', 'Molly', 'Hunter',
       'Harrison', 'Donna', 'Carrie', 'Sandra', 'Ellen', 'John', 'Thomas',
       'Ronnie', 'Rupert', 'Venom', 'Brenda', 'Danaisy', 'Tiffany',
       'Jennifer', 'K

In [150]:
#Clean leftover unclean data points

clean_data['First_Name'] = np.where(
    (clean_data['First_Name'].str.contains('Fwcc') 
     | clean_data['First_Name'].str.contains('Us') 
     | clean_data['First_Name'].str.contains('Snow') 
     | clean_data['First_Name'].str.contains('Snow') 
     | clean_data['First_Name'].str.contains('Skip') 
     | clean_data['First_Name'].str.contains('Natural') 
     | clean_data['First_Name'].str.contains('Venom')),
    'Government',
    clean_data['First_Name'],
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [151]:
#Final data check

clean_data['First_Name'].unique()

array(['Travis', 'Edward', 'Michael', 'Matthew', 'Jeffrey', 'Joshua',
       'Christen', 'Sarah', 'Don', 'Amy', 'Ashley', 'Catherine',
       'Vanessa', 'Leah', 'Mark', 'Joey', 'David', 'Annette', 'Jake',
       'Government', 'Madison', 'Kevin', 'Cory', 'Wayne', 'Christopher',
       'Steve', 'Marsha', 'Jeremy', 'Trudy', 'Fred', 'Matt', 'Brian',
       'Deborah', 'Alexander', 'Anne', 'Bob', 'Jarek', 'Ozzie',
       'Elizabeth', 'Robert', 'Jesssica', 'Josh', 'Simon', 'Mathieu',
       'Ross', 'Meghan', 'Juan', 'Carlos', 'Tony', 'Jane', 'Bryan', 'Ian',
       'Jared', 'Ryan', 'Christina', 'Steven', 'Erin', 'Keith', 'Melody',
       'Mckenzie', 'Kurt', 'Norm', 'Megan', 'Andrew', 'Alex', 'Nicole',
       'Elena', 'Reece', 'Blake', 'Bill', 'Betsy', 'Chris', 'Mary',
       'Jesus', 'Caleb', 'Brooke', 'Julie', 'Lori', 'Molly', 'Hunter',
       'Harrison', 'Donna', 'Carrie', 'Sandra', 'Ellen', 'John', 'Thomas',
       'Ronnie', 'Rupert', 'Brenda', 'Danaisy', 'Tiffany', 'Jennifer',
       'Kell

In [152]:
#Create and clean last name column

clean_data['Last_Name'] = clean_data['Reporter'].apply(lambda x: x.split(' ')[1].title().strip())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [153]:
#Check output from code above

clean_data.head()

Unnamed: 0,Reporter,ObsDate,DateEnt,Location,Longitude,Latitude,RecOwner,First_Name,Last_Name
0,Travis Mangione FWC,24 Nov 2019,26 Nov 2019,"Miami-Dade, Florida, United States",-80.74829,25.76191,Unknown,Travis,Mangione
1,Edward F. Metzger III,20 Nov 2019,22 Nov 2019,"Broward, Florida, United States",-80.50567,26.29298,Unknown,Edward,F.
2,Michael Reupert NPS Big Cypress National Pres...,07 Nov 2019,12 Nov 2019,"Collier, Florida, United States",-81.10691,25.86434,Unknown,Michael,Reupert
3,"matthew mccollister National Park Service, Big...",06 Nov 2019,07 Nov 2019,"Collier, Florida, United States",-80.90326,25.83131,Unknown,Matthew,Mccollister
4,"matthew mccollister National Park Service, Big...",05 Nov 2019,07 Nov 2019,"Collier, Florida, United States",-81.18223,25.87197,Unknown,Matthew,Mccollister


In [154]:
clean_data['Last_Name'].unique()

array(['Mangione', 'F.', 'Reupert', 'Mccollister', 'Fobb', 'Sands',
       'Bowser', 'Mason', 'Norris', 'Marchetto', 'Peters', 'Provo',
       'Gelston', 'Mcdonough', 'Siewe', 'Miller', 'Danaher', 'Cabrera',
       'Shindle', 'Johnson', 'Travers', 'Exotic', 'Harman', 'Reich',
       'Carlisle', 'Gillis', 'Forsythe', 'Owensby', 'Gillette', 'Schulze',
       'Wheatley', 'Dixon', 'Ferraro', 'Rice', 'Weiser', 'Waters',
       'Jansen', 'Edwards', 'Flores', 'Wood', 'Gore', 'Anzelmo', 'Romero',
       'Scarlett', 'Hinson', 'Procter', 'Solis', 'Boyd', 'Williams',
       'Basille', 'Firth', 'Kornofski', 'Cravens', 'Massatt', 'Ospina',
       'Rodas', 'Pernas', 'Dozier', 'Earp', 'Bartoszek', 'Franklin',
       'Shaffer', 'Brown', 'Stylianos', 'Bass', 'Gallagher', 'Gifford',
       'Ray-Culp', 'Stewart', 'Cox', 'Johnston', 'Fisher', 'Digeon',
       'Pruchinski', 'Hoffman', 'Laza', 'Cortez', 'Suarez', 'Hammock',
       'Russ', 'Booth', 'Kozakoff', 'Hipskind', 'Haley', 'Hopkins',
       'Helen', 

In [155]:
#Clean leftover unclean data points

clean_data['Last_Name'] = np.where(
    clean_data['First_Name'].str.contains('Government'),
    'Agency',
    clean_data['Last_Name'],
)

clean_data['Last_Name'] = np.where(
    clean_data['Last_Name'].str.contains('F.'),
    'Metzger',
    clean_data['Last_Name'],
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [156]:
#Create new column for the county where the observations occured

clean_data['County'] = clean_data['Location'].apply(lambda x: re.sub(',.*','', x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [157]:
# clean_data[clean_data['First_Name'] == 'Wayne']

In [158]:
#Clean location column to only include observations that occured in Florida

clean_data = clean_data[clean_data['Location'].str.contains('Florida')]

In [159]:
#Check output of above code

clean_data['Location'].unique()

array(['Miami-Dade, Florida, United States',
       'Broward, Florida, United States',
       'Collier, Florida, United States',
       'Monroe, Florida, United States',
       'Palm Beach, Florida, United States',
       'Polk, Florida, United States', 'Orange, Florida, United States',
       'Manatee, Florida, United States',
       'Sarasota, Florida, United States',
       'Volusia, Florida, United States', 'Lee, Florida, United States',
       'Glades, Florida, United States',
       'Okeechobee, Florida, United States',
       'Hendry, Florida, United States',
       'Alachua, Florida, United States',
       'Hernando, Florida, United States',
       'Marion, Florida, United States', 'Pasco, Florida, United States',
       'Pinellas, Florida, United States',
       'Highlands, Florida, United States',
       'Osceola, Florida, United States',
       'Charlotte, Florida, United States',
       'St. Lucie, Florida, United States',
       'Clay, Florida, United States', 'Martin, Flo

In [160]:
#Final Data Check

clean_data.head()

Unnamed: 0,Reporter,ObsDate,DateEnt,Location,Longitude,Latitude,RecOwner,First_Name,Last_Name,County
0,Travis Mangione FWC,24 Nov 2019,26 Nov 2019,"Miami-Dade, Florida, United States",-80.74829,25.76191,Unknown,Travis,Mangione,Miami-Dade
1,Edward F. Metzger III,20 Nov 2019,22 Nov 2019,"Broward, Florida, United States",-80.50567,26.29298,Unknown,Edward,Metzger,Broward
2,Michael Reupert NPS Big Cypress National Pres...,07 Nov 2019,12 Nov 2019,"Collier, Florida, United States",-81.10691,25.86434,Unknown,Michael,Reupert,Collier
3,"matthew mccollister National Park Service, Big...",06 Nov 2019,07 Nov 2019,"Collier, Florida, United States",-80.90326,25.83131,Unknown,Matthew,Mccollister,Collier
4,"matthew mccollister National Park Service, Big...",05 Nov 2019,07 Nov 2019,"Collier, Florida, United States",-81.18223,25.87197,Unknown,Matthew,Mccollister,Collier


### Send DataFrame to CSV

In [134]:
clean_data.to_csv('Python_Gov_Observations_Clean.csv', index = False)