In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels as stats

In [2]:
df = pd.read_csv('../data/ufo_data/ufo-sightings-transformed.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Date_time,date_documented,Year,Month,Hour,Season,Country_Code,Country,Region,Locale,latitude,longitude,UFO_shape,length_of_encounter_seconds,Encounter_Duration,Description
0,0,1949-10-10 20:30:00,4/27/2004,1949,10,20,Autumn,USA,United States,Texas,San Marcos,29.883056,-97.941111,Cylinder,2700.0,45 minutes,This event took place in early fall around 194...
1,1,1949-10-10 21:00:00,12/16/2005,1949,10,21,Autumn,USA,United States,Texas,Bexar County,29.38421,-98.581082,Light,7200.0,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...
2,2,1955-10-10 17:00:00,1/21/2008,1955,10,17,Autumn,GBR,United Kingdom,England,Chester,53.2,-2.916667,Circle,20.0,20 seconds,Green/Orange circular disc over Chester&#44 En...
3,3,1956-10-10 21:00:00,1/17/2004,1956,10,21,Autumn,USA,United States,Texas,Edna,28.978333,-96.645833,Circle,20.0,1/2 hour,My older brother and twin sister were leaving ...
4,4,1960-10-10 20:00:00,1/22/2004,1960,10,20,Autumn,USA,United States,Hawaii,Kaneohe,21.418056,-157.803611,Light,900.0,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80328 entries, 0 to 80327
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Unnamed: 0                   80328 non-null  int64  
 1   Date_time                    80328 non-null  object 
 2   date_documented              80328 non-null  object 
 3   Year                         80328 non-null  int64  
 4   Month                        80328 non-null  int64  
 5   Hour                         80328 non-null  int64  
 6   Season                       80328 non-null  object 
 7   Country_Code                 80069 non-null  object 
 8   Country                      80069 non-null  object 
 9   Region                       79762 non-null  object 
 10  Locale                       79871 non-null  object 
 11  latitude                     80328 non-null  float64
 12  longitude                    80328 non-null  float64
 13  UFO_shape       

In [4]:
df['UFO_shape'].value_counts()

UFO_shape
Light        16565
Triangle      7865
Circle        7607
Fireball      6208
Other         5649
Unknown       5584
Sphere        5387
Disk          5213
Oval          3733
Formation     2457
Cigar         2057
Changing      1962
Flash         1328
Rectangle     1296
Cylinder      1283
Diamond       1178
Chevron        952
Egg            759
Teardrop       750
Cone           316
Cross          233
Delta            7
Round            2
Crescent         2
Pyramid          1
Flare            1
Hexagon          1
Dome             1
Changed          1
Name: count, dtype: int64

In [48]:
df_cleaned = df.drop(columns='Unnamed: 0')
df_cleaned['UFO_shape'] = df_cleaned['UFO_shape'].fillna('Unknown')
df_cleaned.columns

Index(['Date_time', 'date_documented', 'Year', 'Month', 'Hour', 'Season',
       'Country_Code', 'Country', 'Region', 'Locale', 'latitude', 'longitude',
       'UFO_shape', 'length_of_encounter_seconds', 'Encounter_Duration',
       'Description', 'UFO_shape_binned'],
      dtype='object')

In [49]:

def shape_binner(shape_list):
    binned_shapes = []
    for shape in shape_list:
        if shape in ['Light', 'Fireball', 'Flash', 'Flare']:
            binned_shapes.append('Light-Only')
        elif shape in ['Triangle', 'Diamond', 'Teardrop', 'Cone', 'Chevron', 'Delta','Pyramid']:
            binned_shapes.append('Tapered')
        elif shape in ['Circle', 'Sphere','Disk','Oval','Egg','Dome','Round']:
            binned_shapes.append('Rounded')
        elif shape in ['Formation', 'Changing', 'Hexagon','Crescent','Changed']:
            binned_shapes.append('Other')
        elif shape in ['Cigar','Rectangle','Cylinder','Cross']:
            binned_shapes.append('Elongated-Symmetric')
        else:
            binned_shapes.append(shape)
    return binned_shapes

In [50]:
df_cleaned['UFO_shape_binned'] = shape_binner(df['UFO_shape'])

In [51]:
df_cleaned['UFO_shape_binned'].value_counts()

UFO_shape_binned
Light-Only             24102
Rounded                22702
Tapered                11069
Other                  10072
Unknown                 7514
Elongated-Symmetric     4869
Name: count, dtype: int64

In [52]:
df_USA = df_cleaned[df['Country_Code']=='USA']
df_USA.head()

Unnamed: 0,Date_time,date_documented,Year,Month,Hour,Season,Country_Code,Country,Region,Locale,latitude,longitude,UFO_shape,length_of_encounter_seconds,Encounter_Duration,Description,UFO_shape_binned
0,1949-10-10 20:30:00,4/27/2004,1949,10,20,Autumn,USA,United States,Texas,San Marcos,29.883056,-97.941111,Cylinder,2700.0,45 minutes,This event took place in early fall around 194...,Elongated-Symmetric
1,1949-10-10 21:00:00,12/16/2005,1949,10,21,Autumn,USA,United States,Texas,Bexar County,29.38421,-98.581082,Light,7200.0,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,Light-Only
3,1956-10-10 21:00:00,1/17/2004,1956,10,21,Autumn,USA,United States,Texas,Edna,28.978333,-96.645833,Circle,20.0,1/2 hour,My older brother and twin sister were leaving ...,Rounded
4,1960-10-10 20:00:00,1/22/2004,1960,10,20,Autumn,USA,United States,Hawaii,Kaneohe,21.418056,-157.803611,Light,900.0,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,Light-Only
5,1961-10-10 19:00:00,4/27/2007,1961,10,19,Autumn,USA,United States,Tennessee,Bristol,36.595,-82.188889,Sphere,300.0,5 minutes,My father is now 89 my brother 52 the girl wit...,Rounded


In [58]:
df_USA_recent = df_USA[df['Year']>2010].reset_index()
df_USA_recent.head()

  df_USA_recent = df_USA[df['Year']>2010].reset_index()


Unnamed: 0,index,Date_time,date_documented,Year,Month,Hour,Season,Country_Code,Country,Region,Locale,latitude,longitude,UFO_shape,length_of_encounter_seconds,Encounter_Duration,Description,UFO_shape_binned
0,230,2011-10-10 00:00:00,10/10/2011,2011,10,0,Autumn,USA,United States,New York,Troy,42.728333,-73.692222,Triangle,7200.0,2 hours,Red&#44 green &amp; orange blinking triangle f...,Tapered
1,231,2011-10-10 01:00:00,8/30/2013,2011,10,1,Autumn,USA,United States,New Mexico,Farmington,36.728056,-108.218056,Circle,300.0,5 minutes,Single reddish circle in the sky that wasn&#3...,Rounded
2,232,2011-10-10 02:00:00,10/10/2011,2011,10,2,Autumn,USA,United States,Arizona,Prescott Valley,34.61,-112.315,Other,300.0,hours,Craft boomerang shape.2:00am duration hours. ...,Other
3,233,2011-10-10 10:30:00,10/19/2011,2011,10,10,Autumn,USA,United States,New York,Ashville,42.096389,-79.375833,Circle,60.0,1 minute,Amber object in night sky during full moon&#44...,Rounded
4,235,2011-10-10 14:30:00,10/25/2011,2011,10,14,Autumn,USA,United States,Rhode Island,Wickford,41.55,-71.466667,Oval,40.0,40 sec,Bright oval object in sky,Rounded


In [59]:
df_USA_recent.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20078 entries, 0 to 20077
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   index                        20078 non-null  int64  
 1   Date_time                    20078 non-null  object 
 2   date_documented              20078 non-null  object 
 3   Year                         20078 non-null  int64  
 4   Month                        20078 non-null  int64  
 5   Hour                         20078 non-null  int64  
 6   Season                       20078 non-null  object 
 7   Country_Code                 20078 non-null  object 
 8   Country                      20078 non-null  object 
 9   Region                       20078 non-null  object 
 10  Locale                       20078 non-null  object 
 11  latitude                     20078 non-null  float64
 12  longitude                    20078 non-null  float64
 13  UFO_shape       

In [75]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="ufo_loc")

import time


In [73]:
df_USA_recent['lat_long'] = list(zip(df_USA_recent['latitude'],df_USA_recent['longitude']))
df_USA_recent['lat_long']

0         (42.7283333, -73.6922222)
1        (36.7280556, -108.2180556)
2                 (34.61, -112.315)
3         (42.0963889, -79.3758333)
4              (41.55, -71.4666667)
                    ...            
20073     (36.1658333, -86.7844444)
20074       (43.6136111, -116.2025)
20075    (38.2972222, -122.2844444)
20076     (38.9011111, -77.2655556)
20077     (35.6527778, -97.4777778)
Name: lat_long, Length: 20078, dtype: object

In [108]:

def get_counties(place_df):
    
    county_addresses = []

    for place in place_df['lat_long']:
        time.sleep(.5)
        county_addresses.append(geolocator.reverse(place, zoom=8)[0])

    return county_addresses


In [109]:
test_getter_df = df_USA_recent.iloc[0:10]

test_getter_df['county_address'] = get_counties(test_getter_df) #it works but doing the whole dataset would take too long. And what if a request threw an error?

test_getter_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_getter_df['county_address'] = get_counties(test_getter_df) #it works but doing the whole dataset would take too long. And what if a request threw an error?


Unnamed: 0,index,Date_time,date_documented,Year,Month,Hour,Season,Country_Code,Country,Region,...,longitude,UFO_shape,length_of_encounter_seconds,Encounter_Duration,Description,UFO_shape_binned,lat_long,rg_address,rg_county,county_address
0,230,2011-10-10 00:00:00,10/10/2011,2011,10,0,Autumn,USA,United States,New York,...,-73.692222,Triangle,7200.0,2 hours,Red&#44 green &amp; orange blinking triangle f...,Tapered,"(42.7283333, -73.6922222)","{'lat': '42.72841', 'lon': '-73.69179', 'name'...","[Rensselaer County, New York]","Rensselaer County, New York, United States"
1,231,2011-10-10 01:00:00,8/30/2013,2011,10,1,Autumn,USA,United States,New Mexico,...,-108.218056,Circle,300.0,5 minutes,Single reddish circle in the sky that wasn&#3...,Rounded,"(36.7280556, -108.2180556)","{'lat': '36.72806', 'lon': '-108.21869', 'name...","[San Juan County, New Mexico]","San Juan County, New Mexico, United States"
2,232,2011-10-10 02:00:00,10/10/2011,2011,10,2,Autumn,USA,United States,Arizona,...,-112.315,Other,300.0,hours,Craft boomerang shape.2:00am duration hours. ...,Other,"(34.61, -112.315)","{'lat': '34.61002', 'lon': '-112.31572', 'name...","[Yavapai County, Arizona]","Yavapai County, Arizona, United States"
3,233,2011-10-10 10:30:00,10/19/2011,2011,10,10,Autumn,USA,United States,New York,...,-79.375833,Circle,60.0,1 minute,Amber object in night sky during full moon&#44...,Rounded,"(42.0963889, -79.3758333)","{'lat': '42.10422', 'lon': '-79.3331', 'name':...","[Chautauqua County, New York]","Chautauqua County, New York, United States"
4,235,2011-10-10 14:30:00,10/25/2011,2011,10,14,Autumn,USA,United States,Rhode Island,...,-71.466667,Oval,40.0,40 sec,Bright oval object in sky,Rounded,"(41.55, -71.4666667)","{'lat': '41.5501', 'lon': '-71.46617', 'name':...","[Washington County, Rhode Island]","South County, Rhode Island, United States"
5,236,2011-10-10 15:00:00,10/10/2011,2011,10,15,Autumn,USA,United States,Connecticut,...,-72.078889,Disk,5.0,less than 5 seconds,Small shiny object seen in sky while driving o...,Rounded,"(41.35, -72.0788889)","{'lat': '41.3501', 'lon': '-72.07841', 'name':...","[New London County, Connecticut]","Southeastern Connecticut Planning Region, Conn..."
6,237,2011-10-10 18:15:00,10/19/2011,2011,10,18,Autumn,USA,United States,Massachusetts,...,-71.014118,Flash,2700.0,45 minutes,Flashing light in the sky as airplanes flew by.,Light-Only,"(42.468164, -71.014118)","{'lat': '42.46482', 'lon': '-71.01005', 'name'...","[Essex County, Massachusetts]","Essex County, Massachusetts, United States"
7,238,2011-10-10 19:00:00,10/19/2011,2011,10,19,Autumn,USA,United States,Virginia,...,-77.373611,Light,300.0,5 minutes,Three orange lights flying in unison,Light-Only,"(37.6086111, -77.3736111)","{'lat': '37.60876', 'lon': '-77.37331', 'name'...","[Hanover County, Virginia]","Hanover County, Virginia, United States"
8,239,2011-10-10 19:30:00,10/19/2011,2011,10,19,Autumn,USA,United States,Tennessee,...,-86.488367,Unknown,2700.0,30-45 minutes,Multi color oblect over Smyrna/Murfreesboro 10...,Unknown,"(35.947474, -86.488367)","{'lat': '35.98284', 'lon': '-86.5186', 'name':...","[Rutherford County, Tennessee]","Rutherford County, Middle Tennessee, Tennessee..."
9,241,2011-10-10 20:00:00,10/19/2011,2011,10,20,Autumn,USA,United States,Connecticut,...,-72.651111,Fireball,1200.0,15-20 minutes,Fireball Spinning Orange UFO,Light-Only,"(41.5622222, -72.6511111)","{'lat': '41.56232', 'lon': '-72.65065', 'name'...","[Middlesex County, Connecticut]",Lower Connecticut River Valley Planning Region...


In [87]:
import reverse_geocoder as rg

rg_test=rg.search(test_getter_df['lat_long'][8])

rg_test

[{'lat': '35.98284',
  'lon': '-86.5186',
  'name': 'Smyrna',
  'admin1': 'Tennessee',
  'admin2': 'Rutherford County',
  'cc': 'US'}]

In [92]:
test_getter_df['county_address_2'] = rg.search(list(test_getter_df['lat_long'])) #that was easy. wish I'd found this first

test_getter_df['county'] = [dic['admin2'] for dic in test_getter_df['county_address_2']]

test_getter_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_getter_df['county_address_2'] = rg.search(list(test_getter_df['lat_long']))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_getter_df['county'] = [dic['admin2'] for dic in test_getter_df['county_address_2']]


Unnamed: 0,index,Date_time,date_documented,Year,Month,Hour,Season,Country_Code,Country,Region,...,longitude,UFO_shape,length_of_encounter_seconds,Encounter_Duration,Description,UFO_shape_binned,lat_long,county_address,county_address_2,county
0,230,2011-10-10 00:00:00,10/10/2011,2011,10,0,Autumn,USA,United States,New York,...,-73.692222,Triangle,7200.0,2 hours,Red&#44 green &amp; orange blinking triangle f...,Tapered,"(42.7283333, -73.6922222)","Rensselaer County, New York, United States","{'lat': '42.72841', 'lon': '-73.69179', 'name'...",Rensselaer County
1,231,2011-10-10 01:00:00,8/30/2013,2011,10,1,Autumn,USA,United States,New Mexico,...,-108.218056,Circle,300.0,5 minutes,Single reddish circle in the sky that wasn&#3...,Rounded,"(36.7280556, -108.2180556)","San Juan County, New Mexico, United States","{'lat': '36.72806', 'lon': '-108.21869', 'name...",San Juan County
2,232,2011-10-10 02:00:00,10/10/2011,2011,10,2,Autumn,USA,United States,Arizona,...,-112.315,Other,300.0,hours,Craft boomerang shape.2:00am duration hours. ...,Other,"(34.61, -112.315)","Yavapai County, Arizona, United States","{'lat': '34.61002', 'lon': '-112.31572', 'name...",Yavapai County
3,233,2011-10-10 10:30:00,10/19/2011,2011,10,10,Autumn,USA,United States,New York,...,-79.375833,Circle,60.0,1 minute,Amber object in night sky during full moon&#44...,Rounded,"(42.0963889, -79.3758333)","Chautauqua County, New York, United States","{'lat': '42.10422', 'lon': '-79.3331', 'name':...",Chautauqua County
4,235,2011-10-10 14:30:00,10/25/2011,2011,10,14,Autumn,USA,United States,Rhode Island,...,-71.466667,Oval,40.0,40 sec,Bright oval object in sky,Rounded,"(41.55, -71.4666667)","South County, Rhode Island, United States","{'lat': '41.5501', 'lon': '-71.46617', 'name':...",Washington County
5,236,2011-10-10 15:00:00,10/10/2011,2011,10,15,Autumn,USA,United States,Connecticut,...,-72.078889,Disk,5.0,less than 5 seconds,Small shiny object seen in sky while driving o...,Rounded,"(41.35, -72.0788889)","Southeastern Connecticut Planning Region, Conn...","{'lat': '41.3501', 'lon': '-72.07841', 'name':...",New London County
6,237,2011-10-10 18:15:00,10/19/2011,2011,10,18,Autumn,USA,United States,Massachusetts,...,-71.014118,Flash,2700.0,45 minutes,Flashing light in the sky as airplanes flew by.,Light-Only,"(42.468164, -71.014118)","Essex County, Massachusetts, United States","{'lat': '42.46482', 'lon': '-71.01005', 'name'...",Essex County
7,238,2011-10-10 19:00:00,10/19/2011,2011,10,19,Autumn,USA,United States,Virginia,...,-77.373611,Light,300.0,5 minutes,Three orange lights flying in unison,Light-Only,"(37.6086111, -77.3736111)","Hanover County, Virginia, United States","{'lat': '37.60876', 'lon': '-77.37331', 'name'...",Hanover County
8,239,2011-10-10 19:30:00,10/19/2011,2011,10,19,Autumn,USA,United States,Tennessee,...,-86.488367,Unknown,2700.0,30-45 minutes,Multi color oblect over Smyrna/Murfreesboro 10...,Unknown,"(35.947474, -86.488367)","Rutherford County, Middle Tennessee, Tennessee...","{'lat': '35.98284', 'lon': '-86.5186', 'name':...",Rutherford County
9,241,2011-10-10 20:00:00,10/19/2011,2011,10,20,Autumn,USA,United States,Connecticut,...,-72.651111,Fireball,1200.0,15-20 minutes,Fireball Spinning Orange UFO,Light-Only,"(41.5622222, -72.6511111)",Lower Connecticut River Valley Planning Region...,"{'lat': '41.56232', 'lon': '-72.65065', 'name'...",Middlesex County


In [93]:
df_USA_recent['rg_address'] = rg.search(list(df_USA_recent['lat_long']))


In [94]:
df_USA_recent.head()

Unnamed: 0,index,Date_time,date_documented,Year,Month,Hour,Season,Country_Code,Country,Region,Locale,latitude,longitude,UFO_shape,length_of_encounter_seconds,Encounter_Duration,Description,UFO_shape_binned,lat_long,rg_address
0,230,2011-10-10 00:00:00,10/10/2011,2011,10,0,Autumn,USA,United States,New York,Troy,42.728333,-73.692222,Triangle,7200.0,2 hours,Red&#44 green &amp; orange blinking triangle f...,Tapered,"(42.7283333, -73.6922222)","{'lat': '42.72841', 'lon': '-73.69179', 'name'..."
1,231,2011-10-10 01:00:00,8/30/2013,2011,10,1,Autumn,USA,United States,New Mexico,Farmington,36.728056,-108.218056,Circle,300.0,5 minutes,Single reddish circle in the sky that wasn&#3...,Rounded,"(36.7280556, -108.2180556)","{'lat': '36.72806', 'lon': '-108.21869', 'name..."
2,232,2011-10-10 02:00:00,10/10/2011,2011,10,2,Autumn,USA,United States,Arizona,Prescott Valley,34.61,-112.315,Other,300.0,hours,Craft boomerang shape.2:00am duration hours. ...,Other,"(34.61, -112.315)","{'lat': '34.61002', 'lon': '-112.31572', 'name..."
3,233,2011-10-10 10:30:00,10/19/2011,2011,10,10,Autumn,USA,United States,New York,Ashville,42.096389,-79.375833,Circle,60.0,1 minute,Amber object in night sky during full moon&#44...,Rounded,"(42.0963889, -79.3758333)","{'lat': '42.10422', 'lon': '-79.3331', 'name':..."
4,235,2011-10-10 14:30:00,10/25/2011,2011,10,14,Autumn,USA,United States,Rhode Island,Wickford,41.55,-71.466667,Oval,40.0,40 sec,Bright oval object in sky,Rounded,"(41.55, -71.4666667)","{'lat': '41.5501', 'lon': '-71.46617', 'name':..."


In [159]:
df_USA_recent['rg_county']=[str(dict['admin2']+", "+dict['admin1']) for dict in df_USA_recent['rg_address']]

df_USA_recent.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20078 entries, 0 to 20077
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   index                        20078 non-null  int64  
 1   Date_time                    20078 non-null  object 
 2   date_documented              20078 non-null  object 
 3   Year                         20078 non-null  int64  
 4   Month                        20078 non-null  int64  
 5   Hour                         20078 non-null  int64  
 6   Season                       20078 non-null  object 
 7   Country_Code                 20078 non-null  object 
 8   Country                      20078 non-null  object 
 9   Region                       20078 non-null  object 
 10  Locale                       20078 non-null  object 
 11  latitude                     20078 non-null  float64
 12  longitude                    20078 non-null  float64
 13  UFO_shape       

In [123]:
df_USA_recent.head()

Unnamed: 0,index,Date_time,date_documented,Year,Month,Hour,Season,Country_Code,Country,Region,...,latitude,longitude,UFO_shape,length_of_encounter_seconds,Encounter_Duration,Description,UFO_shape_binned,lat_long,rg_address,rg_county
0,230,2011-10-10 00:00:00,10/10/2011,2011,10,0,Autumn,USA,United States,New York,...,42.728333,-73.692222,Triangle,7200.0,2 hours,Red&#44 green &amp; orange blinking triangle f...,Tapered,"(42.7283333, -73.6922222)","{'lat': '42.72841', 'lon': '-73.69179', 'name'...","Rensselaer County, New York"
1,231,2011-10-10 01:00:00,8/30/2013,2011,10,1,Autumn,USA,United States,New Mexico,...,36.728056,-108.218056,Circle,300.0,5 minutes,Single reddish circle in the sky that wasn&#3...,Rounded,"(36.7280556, -108.2180556)","{'lat': '36.72806', 'lon': '-108.21869', 'name...","San Juan County, New Mexico"
2,232,2011-10-10 02:00:00,10/10/2011,2011,10,2,Autumn,USA,United States,Arizona,...,34.61,-112.315,Other,300.0,hours,Craft boomerang shape.2:00am duration hours. ...,Other,"(34.61, -112.315)","{'lat': '34.61002', 'lon': '-112.31572', 'name...","Yavapai County, Arizona"
3,233,2011-10-10 10:30:00,10/19/2011,2011,10,10,Autumn,USA,United States,New York,...,42.096389,-79.375833,Circle,60.0,1 minute,Amber object in night sky during full moon&#44...,Rounded,"(42.0963889, -79.3758333)","{'lat': '42.10422', 'lon': '-79.3331', 'name':...","Chautauqua County, New York"
4,235,2011-10-10 14:30:00,10/25/2011,2011,10,14,Autumn,USA,United States,Rhode Island,...,41.55,-71.466667,Oval,40.0,40 sec,Bright oval object in sky,Rounded,"(41.55, -71.4666667)","{'lat': '41.5501', 'lon': '-71.46617', 'name':...","Washington County, Rhode Island"


In [98]:
df_USA_recent['rg_county'].value_counts()

rg_county
[Los Angeles County, California]     390
[Maricopa County, Arizona]           340
[King County, Washington]            277
[San Diego County, California]       212
[Cook County, Illinois]              179
                                    ... 
[Bradford County, Pennsylvania]        1
[Plaquemines Parish, Louisiana]        1
[Scotland County, North Carolina]      1
[Lee County, Georgia]                  1
[Branch County, Michigan]              1
Name: count, Length: 2078, dtype: int64

In [178]:
df_county_pop_weights = pd.read_csv('../data/ufo_data/County_population_weights_2011_to_2014.csv')

In [125]:
df_county_pop_weights.head()

Unnamed: 0,County,2011,2012,2013,2014
0,"Autauga County, Alabama",0.000177,0.000175,0.000173,0.000172
1,"Baldwin County, Alabama",0.000599,0.000606,0.000617,0.000626
2,"Barbour County, Alabama",8.8e-05,8.7e-05,8.5e-05,8.4e-05
3,"Bibb County, Alabama",7.3e-05,7.2e-05,7.1e-05,7.1e-05
4,"Blount County, Alabama",0.000185,0.000183,0.000182,0.000181


In [179]:
df_county_pop_weights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3142 entries, 0 to 3141
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   County  3142 non-null   object 
 1   2011    3142 non-null   float64
 2   2012    3142 non-null   float64
 3   2013    3142 non-null   float64
 4   2014    3142 non-null   float64
dtypes: float64(4), object(1)
memory usage: 122.9+ KB


In [145]:
df_county_incidence = df_USA_recent['rg_county'].value_counts().reset_index()

In [146]:
df_county_incidence

Unnamed: 0,rg_county,count
0,"Los Angeles County, California",390
1,"Maricopa County, Arizona",340
2,"King County, Washington",277
3,"San Diego County, California",212
4,"Cook County, Illinois",179
...,...,...
2073,"Bradford County, Pennsylvania",1
2074,"Plaquemines Parish, Louisiana",1
2075,"Scotland County, North Carolina",1
2076,"Lee County, Georgia",1


In [147]:
df_county_heat_info = df_county_incidence.merge(df_county_pop_weights,how='left',left_on='rg_county',right_on='County')

df_county_heat_info.head()

Unnamed: 0,rg_county,count,County,2011,2012,2013,2014
0,"Los Angeles County, California",390,"Los Angeles County, California",0.0317,0.031658,0.031622,0.031543
1,"Maricopa County, Arizona",340,"Maricopa County, Arizona",0.012438,0.012578,0.012715,0.012861
2,"King County, Washington",277,"King County, Washington",0.006337,0.006409,0.006479,0.006551
3,"San Diego County, California",212,"San Diego County, California",0.010069,0.010115,0.010158,0.010207
4,"Cook County, Illinois",179,"Cook County, Illinois",0.016754,0.016695,0.016623,0.016509


In [148]:
df_county_heat_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2078 entries, 0 to 2077
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   rg_county  2078 non-null   object 
 1   count      2078 non-null   int64  
 2   County     2010 non-null   object 
 3   2011       2010 non-null   float64
 4   2012       2010 non-null   float64
 5   2013       2010 non-null   float64
 6   2014       2010 non-null   float64
dtypes: float64(4), int64(1), object(2)
memory usage: 113.8+ KB


In [149]:
df_county_heat_info.loc[df_county_heat_info['County'].isna()] #68 counties with name conflict, let's get the list to build a dictionary

Unnamed: 0,rg_county,count,County,2011,2012,2013,2014
25,", New York",89,,,,,
71,"Saint Charles County, Missouri",51,,,,,
91,"City of Saint Louis, Missouri",43,,,,,
121,"Saint Louis County, Missouri",36,,,,,
123,"City of Virginia Beach, Virginia",35,,,,,
...,...,...,...,...,...,...,...
1878,"City of Emporia, Virginia",1,,,,,
1935,", Saint Thomas Island",1,,,,,
1967,"Saint Clair County, Missouri",1,,,,,
2034,"Saint Mary Parish, Louisiana",1,,,,,


In [153]:
county_name_conflict = list(df_county_heat_info['rg_county'].loc[df_county_heat_info['County'].isna()])
county_name_conflict

[', New York',
 'Saint Charles County, Missouri',
 'City of Saint Louis, Missouri',
 'Saint Louis County, Missouri',
 'City of Virginia Beach, Virginia',
 'Saint Lucie County, Florida',
 'Saint Louis County, Minnesota',
 'City of Richmond, Virginia',
 'City of Baltimore, Maryland',
 'Saint Johns County, Florida',
 ', Washington, D.C.',
 'Dona Ana County, New Mexico',
 'City of Fredericksburg, Virginia',
 'Saint Joseph County, Indiana',
 'City of Chesapeake, Virginia',
 'City of Alexandria, Virginia',
 'Saint Tammany Parish, Louisiana',
 'City of Hampton, Virginia',
 'City of Norfolk, Virginia',
 'Saint Clair County, Michigan',
 'Saint Clair County, Illinois',
 'Saint Croix County, Wisconsin',
 'City of Roanoke, Virginia',
 'City of Winchester, Virginia',
 'City of Lynchburg, Virginia',
 'De Soto County, Mississippi',
 'City of Charlottesville, Virginia',
 'City of Newport News, Virginia',
 'Saint Francois County, Missouri',
 'City of Fairfax, Virginia',
 'City of Manassas, Virginia',
 

In [152]:
df_USA_recent.loc[df_USA_recent['rg_county']==", New York"] #These all appear coded to Manhattan - New York County, New York.

Unnamed: 0,index,Date_time,date_documented,Year,Month,Hour,Season,Country_Code,Country,Region,...,latitude,longitude,UFO_shape,length_of_encounter_seconds,Encounter_Duration,Description,UFO_shape_binned,lat_long,rg_address,rg_county
22,254,2012-10-10 20:15:00,10/30/2012,2012,10,20,Autumn,USA,United States,New York,...,40.714167,-74.006389,Disk,1800.0,30 minutes,A cloaked disk hovered three hundred feet abov...,Rounded,"(40.7141667, -74.0063889)","{'lat': '40.71427', 'lon': '-74.00597', 'name'...",", New York"
288,1474,2013-10-13 22:55:00,10/14/2013,2013,10,22,Autumn,USA,United States,New York,...,40.714167,-74.006389,Sphere,420.0,7 minutes,Glowing lights above Brooklyn.,Rounded,"(40.7141667, -74.0063889)","{'lat': '40.71427', 'lon': '-74.00597', 'name'...",", New York"
662,3352,2012-10-20 17:30:00,10/30/2012,2012,10,17,Autumn,USA,United States,New York,...,40.714167,-74.006389,Sphere,10.0,10 seconds,Reflective Silver/Orange sphere over Brooklyn,Rounded,"(40.7141667, -74.0063889)","{'lat': '40.71427', 'lon': '-74.00597', 'name'...",", New York"
732,3552,2012-10-21 04:00:00,10/30/2012,2012,10,4,Autumn,USA,United States,New York,...,40.712784,-74.005941,Chevron,15.0,15 seconds,Strange&#44 gliding light formation/object on ...,Tapered,"(40.712784, -74.005941)","{'lat': '40.71427', 'lon': '-74.00597', 'name'...",", New York"
1180,4965,2012-10-27 21:35:00,10/30/2012,2012,10,21,Autumn,USA,United States,New York,...,40.714167,-74.006389,Fireball,240.0,4 minutes,Two bright&#44 orange glowing orbs flying in f...,Light-Only,"(40.7141667, -74.0063889)","{'lat': '40.71427', 'lon': '-74.00597', 'name'...",", New York"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17891,71265,2013-08-03 21:30:00,8/30/2013,2013,8,21,Summer,USA,United States,New York,...,40.714167,-74.006389,Fireball,120.0,2 minutes,4 reddish/ orange objects seem in the sky.,Light-Only,"(40.7141667, -74.0063889)","{'lat': '40.71427', 'lon': '-74.00597', 'name'...",", New York"
18065,71936,2012-08-06 00:55:00,8/19/2012,2012,8,0,Summer,USA,United States,New York,...,40.714167,-74.006389,Light,180.0,3 minutes,Loud humming/vibrating noise and bright blue l...,Light-Only,"(40.7141667, -74.0063889)","{'lat': '40.71427', 'lon': '-74.00597', 'name'...",", New York"
18957,75959,2012-09-19 14:00:00,6/2/2013,2012,9,14,Autumn,USA,United States,New York,...,40.714167,-74.006389,Disk,600.0,5-10+ minutes,Noticed an object move across the sky&#44 even...,Rounded,"(40.7141667, -74.0063889)","{'lat': '40.71427', 'lon': '-74.00597', 'name'...",", New York"
19894,79642,2012-09-06 21:04:00,9/24/2012,2012,9,21,Autumn,USA,United States,New York,...,40.714167,-74.006389,Fireball,600.0,10 minutes,Four orange fireball looking objects floating ...,Light-Only,"(40.7141667, -74.0063889)","{'lat': '40.71427', 'lon': '-74.00597', 'name'...",", New York"


In [269]:
#Grabbed the whole list to just quickly cross reference by hand to a fixed list

conflict_corresponders = ['New York County, New York',
 'St. Charles County, Missouri',
 'St. Louis City, Missouri',
 'St. Louis County, Missouri',
 'Virginia Beach City, Virginia',
 'St. Lucie County, Florida',
 'St. Louis County, Minnesota',
 'Richmond City, Virginia',
 'Baltimore City, Maryland',
 'St. Johns County, Florida',
 'District of Columbia, District of Columbia',
 'Doña Ana County, New Mexico',
 'Fredericksburg City, Virginia',
 'St. Joseph County, Indiana',
 'Chesapeake City, Virginia',
 'Alexandria City, Virginia',
 'St. Tammany Parish, Louisiana',
 'Hampton City, Virginia',
 'Norfolk City, Virginia',
 'St. Clair County, Michigan',
 'St. Clair County, Illinois',
 'St. Croix County, Wisconsin',
 'Roanoke City, Virginia',
 'Winchester City, Virginia',
 'Lynchburg City, Virginia',
 'DeSoto County, Mississippi',
 'Charlottesville City, Virginia',
 'Newport News City, Virginia',
 'St. Francois County, Missouri',
 'Fairfax City, Virginia',
 'Manassas City, Virginia',
 'Danville City, Virginia',
 "St. Mary's County, Maryland",
 'Falls Church City, Virginia',
 'Hopewell City, Virginia',
 'Staunton City, Virginia',
 'Portsmouth City, Virginia',
 'Harrisonburg City, Virginia',
 'Waynesboro City, Virginia',
 'Bristol City, Virginia',
 'St. Bernard Parish, Louisiana',
 'Petersburg City, Virginia',
 ', Chukotskiy Avtonomnyy Okrug',
 'St. Charles Parish, Louisiana',
 'Williamsburg City, Virginia',
 'St. Joseph County, Michigan',
 ', British Columbia',
 'Suffolk City, Virginia',
 ', Ontario',
 'Radford City, Virginia',
 ', San Juan',
 'Colonial Heights City, Virginia',
 'Poquoson City, Virginia',
 'St. Clair County, Alabama',
 'St. Landry Parish, Louisiana',
 'Franklin City, Virginia',
 'Buena Vista City, Virginia',
 'Bedford County, Virginia',
 ', Mangilao',
 'Covington City, Virginia',
 ', Baja California',
 'Martinsville City, Virginia',
 ', Quebradillas',
 'Emporia City, Virginia',
 ', St. Thomas Island',
 'St. Clair County, Missouri',
 'St. Mary Parish, Louisiana',
 'Kusilvak Census Area, Alaska',
 'Bronx County, New York',
 'Oglala Lakota County, South Dakota',
 'Prince of Wales-Hyder Census Area, Alaska',
 'Salem City, Virginia',
 'Wrangell City and Borough, Alaska',
 'Lexington City, Virginia',
 'St. Francis County, Arkansas',
 'St. Martin Parish, Louisiana',
 'St. James Parish, Louisiana',
 'LaSalle Parish, Louisiana',
 'Ste. Genevieve County, Missouri',
 'Norton City, Virginia',
 'St. Helena Parish, Louisiana',
 'Petersburg Borough, Alaska',
 'Galax City, Virginia']


In [271]:
county_conflict_dict = {county_name_conflict[i]:conflict_corresponders[i] for i in range(len(county_name_conflict))}

In [257]:
def county_conflict_fix(county):
    if county in county_name_conflict:
        return county_conflict_dict[county]
    else: 
        return county

df_USA_recent['rg_county_repaired']=df_USA_recent['rg_county'].apply(county_conflict_fix)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_USA_recent['rg_county_repaired']=df_USA_recent['rg_county'].apply(county_conflict_fix)


In [166]:
df_USA_recent.head()

Unnamed: 0,index,Date_time,date_documented,Year,Month,Hour,Season,Country_Code,Country,Region,...,UFO_shape,length_of_encounter_seconds,Encounter_Duration,Description,UFO_shape_binned,lat_long,rg_address,rg_county,County,rg_county_repaired
0,230,2011-10-10 00:00:00,10/10/2011,2011,10,0,Autumn,USA,United States,New York,...,Triangle,7200.0,2 hours,Red&#44 green &amp; orange blinking triangle f...,Tapered,"(42.7283333, -73.6922222)","{'lat': '42.72841', 'lon': '-73.69179', 'name'...","Rensselaer County, New York","Rensselaer County, New York","Rensselaer County, New York"
1,231,2011-10-10 01:00:00,8/30/2013,2011,10,1,Autumn,USA,United States,New Mexico,...,Circle,300.0,5 minutes,Single reddish circle in the sky that wasn&#3...,Rounded,"(36.7280556, -108.2180556)","{'lat': '36.72806', 'lon': '-108.21869', 'name...","San Juan County, New Mexico","San Juan County, New Mexico","San Juan County, New Mexico"
2,232,2011-10-10 02:00:00,10/10/2011,2011,10,2,Autumn,USA,United States,Arizona,...,Other,300.0,hours,Craft boomerang shape.2:00am duration hours. ...,Other,"(34.61, -112.315)","{'lat': '34.61002', 'lon': '-112.31572', 'name...","Yavapai County, Arizona","Yavapai County, Arizona","Yavapai County, Arizona"
3,233,2011-10-10 10:30:00,10/19/2011,2011,10,10,Autumn,USA,United States,New York,...,Circle,60.0,1 minute,Amber object in night sky during full moon&#44...,Rounded,"(42.0963889, -79.3758333)","{'lat': '42.10422', 'lon': '-79.3331', 'name':...","Chautauqua County, New York","Chautauqua County, New York","Chautauqua County, New York"
4,235,2011-10-10 14:30:00,10/25/2011,2011,10,14,Autumn,USA,United States,Rhode Island,...,Oval,40.0,40 sec,Bright oval object in sky,Rounded,"(41.55, -71.4666667)","{'lat': '41.5501', 'lon': '-71.46617', 'name':...","Washington County, Rhode Island","Washington County, Rhode Island","Washington County, Rhode Island"


In [199]:
df_county_incidence = df_USA_recent['rg_county_repaired'].value_counts().reset_index()
df_county_incidence

Unnamed: 0,rg_county_repaired,count
0,"Los Angeles County, California",390
1,"Maricopa County, Arizona",340
2,"King County, Washington",277
3,"San Diego County, California",212
4,"Cook County, Illinois",179
...,...,...
2072,"Lyon County, Minnesota",1
2073,"Bradford County, Pennsylvania",1
2074,"Plaquemines Parish, Louisiana",1
2075,"Scotland County, North Carolina",1


In [200]:
df_county_heat_info = df_county_incidence.merge(df_county_pop_weights,how='left',left_on='rg_county_repaired',right_on='County')

df_county_heat_info.head()

Unnamed: 0,rg_county_repaired,count,County,2011,2012,2013,2014
0,"Los Angeles County, California",390,"Los Angeles County, California",0.0317,0.031658,0.031622,0.031543
1,"Maricopa County, Arizona",340,"Maricopa County, Arizona",0.012438,0.012578,0.012715,0.012861
2,"King County, Washington",277,"King County, Washington",0.006337,0.006409,0.006479,0.006551
3,"San Diego County, California",212,"San Diego County, California",0.010069,0.010115,0.010158,0.010207
4,"Cook County, Illinois",179,"Cook County, Illinois",0.016754,0.016695,0.016623,0.016509


In [201]:
df_county_heat_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2077 entries, 0 to 2076
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   rg_county_repaired  2077 non-null   object 
 1   count               2077 non-null   int64  
 2   County              2068 non-null   object 
 3   2011                2068 non-null   float64
 4   2012                2068 non-null   float64
 5   2013                2068 non-null   float64
 6   2014                2068 non-null   float64
dtypes: float64(4), int64(1), object(2)
memory usage: 113.7+ KB


In [202]:
df_county_heat_info.loc[df_county_heat_info['County'].isna()] #ok, now that our conflicts resolved we can drop or inner merge to lose these non-US locations

Unnamed: 0,rg_county_repaired,count,County,2011,2012,2013,2014
1145,", Chukotskiy Avtonomnyy Okrug",2,,,,,
1334,", British Columbia",2,,,,,
1390,", Ontario",2,,,,,
1461,", San Juan",1,,,,,
1466,", Mangilao",1,,,,,
1589,"Bedford City, Virginia",1,,,,,
1744,", Quebradillas",1,,,,,
1851,", Baja California",1,,,,,
1935,", St. Thomas Island",1,,,,,


In [204]:
#having a sneaking suspicion we want to go back to the original data and us rg to catch missing locations, let's try it
df_cleaned['lat_long'] = list(zip(df_cleaned['latitude'],df_cleaned['longitude']))
df_cleaned['rg_address'] = rg.search(list(df_cleaned['lat_long']))

df_cleaned['rg_address'][900]

{'lat': '35.14953',
 'lon': '-90.04898',
 'name': 'Memphis',
 'admin1': 'Tennessee',
 'admin2': 'Shelby County',
 'cc': 'US'}

In [205]:
df_cleaned.head()

Unnamed: 0,Date_time,date_documented,Year,Month,Hour,Season,Country_Code,Country,Region,Locale,latitude,longitude,UFO_shape,length_of_encounter_seconds,Encounter_Duration,Description,UFO_shape_binned,lat_long,rg_address
0,1949-10-10 20:30:00,4/27/2004,1949,10,20,Autumn,USA,United States,Texas,San Marcos,29.883056,-97.941111,Cylinder,2700.0,45 minutes,This event took place in early fall around 194...,Elongated-Symmetric,"(29.8830556, -97.9411111)","{'lat': '29.88327', 'lon': '-97.94139', 'name'..."
1,1949-10-10 21:00:00,12/16/2005,1949,10,21,Autumn,USA,United States,Texas,Bexar County,29.38421,-98.581082,Light,7200.0,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,Light-Only,"(29.38421, -98.581082)","{'lat': '29.38663', 'lon': '-98.61797', 'name'..."
2,1955-10-10 17:00:00,1/21/2008,1955,10,17,Autumn,GBR,United Kingdom,England,Chester,53.2,-2.916667,Circle,20.0,20 seconds,Green/Orange circular disc over Chester&#44 En...,Rounded,"(53.2, -2.916667)","{'lat': '53.20832', 'lon': '-2.9253', 'name': ..."
3,1956-10-10 21:00:00,1/17/2004,1956,10,21,Autumn,USA,United States,Texas,Edna,28.978333,-96.645833,Circle,20.0,1/2 hour,My older brother and twin sister were leaving ...,Rounded,"(28.9783333, -96.6458333)","{'lat': '28.97859', 'lon': '-96.64609', 'name'..."
4,1960-10-10 20:00:00,1/22/2004,1960,10,20,Autumn,USA,United States,Hawaii,Kaneohe,21.418056,-157.803611,Light,900.0,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,Light-Only,"(21.4180556, -157.8036111)","{'lat': '21.40929', 'lon': '-157.80092', 'name..."


In [215]:
df_cleaned['USA_bool']= [dict['cc']=='US' for dict in df_cleaned['rg_address']]

In [258]:
df_USA = df_cleaned[df_cleaned['USA_bool']==True]

In [217]:
df_USA.info()

<class 'pandas.core.frame.DataFrame'>
Index: 70901 entries, 0 to 80327
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Date_time                    70901 non-null  object 
 1   date_documented              70901 non-null  object 
 2   Year                         70901 non-null  int64  
 3   Month                        70901 non-null  int64  
 4   Hour                         70901 non-null  int64  
 5   Season                       70901 non-null  object 
 6   Country_Code                 70816 non-null  object 
 7   Country                      70816 non-null  object 
 8   Region                       70816 non-null  object 
 9   Locale                       70814 non-null  object 
 10  latitude                     70901 non-null  float64
 11  longitude                    70901 non-null  float64
 12  UFO_shape                    70901 non-null  object 
 13  length_of_encounter_s

In [218]:
df_USA.tail()

Unnamed: 0,Date_time,date_documented,Year,Month,Hour,Season,Country_Code,Country,Region,Locale,latitude,longitude,UFO_shape,length_of_encounter_seconds,Encounter_Duration,Description,UFO_shape_binned,lat_long,rg_address,USA_bool
80323,2013-09-09 21:15:00,9/30/2013,2013,9,21,Autumn,USA,United States,Tennessee,Nashville,36.165833,-86.784444,Light,600.0,10 minutes,Round from the distance/slowly changing colors...,Light-Only,"(36.1658333, -86.7844444)","{'lat': '36.16589', 'lon': '-86.78444', 'name'...",True
80324,2013-09-09 22:00:00,9/30/2013,2013,9,22,Autumn,USA,United States,Idaho,Boise,43.613611,-116.2025,Circle,1200.0,20 minutes,Boise&#44 ID&#44 spherical&#44 20 min&#44 10 r...,Rounded,"(43.6136111, -116.2025)","{'lat': '43.6135', 'lon': '-116.20345', 'name'...",True
80325,2013-09-09 22:00:00,9/30/2013,2013,9,22,Autumn,USA,United States,California,Napa Abajo,38.297222,-122.284444,Other,1200.0,hour,Napa UFO&#44,Other,"(38.2972222, -122.2844444)","{'lat': '38.29714', 'lon': '-122.28553', 'name...",True
80326,2013-09-09 22:20:00,9/30/2013,2013,9,22,Autumn,USA,United States,Virginia,Vienna,38.901111,-77.265556,Circle,5.0,5 seconds,Saw a five gold lit cicular craft moving fastl...,Rounded,"(38.9011111, -77.2655556)","{'lat': '38.90122', 'lon': '-77.26526', 'name'...",True
80327,2013-09-09 23:00:00,9/30/2013,2013,9,23,Autumn,USA,United States,Oklahoma,Edmond,35.652778,-97.477778,Cigar,1020.0,17 minutes,2 witnesses 2 miles apart&#44 Red &amp; White...,Elongated-Symmetric,"(35.6527778, -97.4777778)","{'lat': '35.65283', 'lon': '-97.4781', 'name':...",True


In [272]:
def county_parse(dict):
    county = str(dict['admin2']+', '+dict['admin1'])
    if county in county_name_conflict:
        return county_conflict_dict[county]
    else: 
        return county

In [273]:
df_USA['rg_county'] = [county_parse(dict) for dict in df_USA['rg_address']]
df_USA.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_USA['rg_county'] = [county_parse(dict) for dict in df_USA['rg_address']]


Unnamed: 0,Date_time,date_documented,Year,Month,Hour,Season,Country_Code,Country,Region,Locale,...,longitude,UFO_shape,length_of_encounter_seconds,Encounter_Duration,Description,UFO_shape_binned,lat_long,rg_address,USA_bool,rg_county
0,1949-10-10 20:30:00,4/27/2004,1949,10,20,Autumn,USA,United States,Texas,San Marcos,...,-97.941111,Cylinder,2700.0,45 minutes,This event took place in early fall around 194...,Elongated-Symmetric,"(29.8830556, -97.9411111)","{'lat': '29.88327', 'lon': '-97.94139', 'name'...",True,"Hays County, Texas"
1,1949-10-10 21:00:00,12/16/2005,1949,10,21,Autumn,USA,United States,Texas,Bexar County,...,-98.581082,Light,7200.0,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,Light-Only,"(29.38421, -98.581082)","{'lat': '29.38663', 'lon': '-98.61797', 'name'...",True,"Bexar County, Texas"
3,1956-10-10 21:00:00,1/17/2004,1956,10,21,Autumn,USA,United States,Texas,Edna,...,-96.645833,Circle,20.0,1/2 hour,My older brother and twin sister were leaving ...,Rounded,"(28.9783333, -96.6458333)","{'lat': '28.97859', 'lon': '-96.64609', 'name'...",True,"Jackson County, Texas"
4,1960-10-10 20:00:00,1/22/2004,1960,10,20,Autumn,USA,United States,Hawaii,Kaneohe,...,-157.803611,Light,900.0,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,Light-Only,"(21.4180556, -157.8036111)","{'lat': '21.40929', 'lon': '-157.80092', 'name...",True,"Honolulu County, Hawaii"
5,1961-10-10 19:00:00,4/27/2007,1961,10,19,Autumn,USA,United States,Tennessee,Bristol,...,-82.188889,Sphere,300.0,5 minutes,My father is now 89 my brother 52 the girl wit...,Rounded,"(36.595, -82.1888889)","{'lat': '36.59511', 'lon': '-82.18874', 'name'...",True,"Sullivan County, Tennessee"


In [274]:
df_USA_recent = df_USA[df_USA['Year']>2010]

In [275]:
df_USA_recent.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20095 entries, 230 to 80327
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Date_time                    20095 non-null  object 
 1   date_documented              20095 non-null  object 
 2   Year                         20095 non-null  int64  
 3   Month                        20095 non-null  int64  
 4   Hour                         20095 non-null  int64  
 5   Season                       20095 non-null  object 
 6   Country_Code                 20076 non-null  object 
 7   Country                      20076 non-null  object 
 8   Region                       20076 non-null  object 
 9   Locale                       20075 non-null  object 
 10  latitude                     20095 non-null  float64
 11  longitude                    20095 non-null  float64
 12  UFO_shape                    20095 non-null  object 
 13  length_of_encounter

In [294]:
df_county_heat =df_USA['rg_county'].value_counts().reset_index()

In [295]:
df_county_heat = df_county_heat.merge(df_county_pop_weights,how='left',left_on='rg_county',right_on='County')

df_county_heat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2799 entries, 0 to 2798
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   rg_county  2799 non-null   object 
 1   count      2799 non-null   int64  
 2   County     2799 non-null   object 
 3   2011       2799 non-null   float64
 4   2012       2799 non-null   float64
 5   2013       2799 non-null   float64
 6   2014       2799 non-null   float64
dtypes: float64(4), int64(1), object(2)
memory usage: 153.2+ KB


In [279]:
more_conflicts = list(df_county_heat['rg_county'].loc[df_county_heat['County'].isna()])

more_conflicts

[]

In [270]:
county_name_conflict=[', New York',
 'Saint Charles County, Missouri',
 'City of Saint Louis, Missouri',
 'Saint Louis County, Missouri',
 'City of Virginia Beach, Virginia',
 'Saint Lucie County, Florida',
 'Saint Louis County, Minnesota',
 'City of Richmond, Virginia',
 'City of Baltimore, Maryland',
 'Saint Johns County, Florida',
 ', Washington, D.C.',
 'Dona Ana County, New Mexico',
 'City of Fredericksburg, Virginia',
 'Saint Joseph County, Indiana',
 'City of Chesapeake, Virginia',
 'City of Alexandria, Virginia',
 'Saint Tammany Parish, Louisiana',
 'City of Hampton, Virginia',
 'City of Norfolk, Virginia',
 'Saint Clair County, Michigan',
 'Saint Clair County, Illinois',
 'Saint Croix County, Wisconsin',
 'City of Roanoke, Virginia',
 'City of Winchester, Virginia',
 'City of Lynchburg, Virginia',
 'De Soto County, Mississippi',
 'City of Charlottesville, Virginia',
 'City of Newport News, Virginia',
 'Saint Francois County, Missouri',
 'City of Fairfax, Virginia',
 'City of Manassas, Virginia',
 'City of Danville, Virginia',
 "Saint Mary's County, Maryland",
 'City of Falls Church, Virginia',
 'City of Hopewell, Virginia',
 'City of Staunton, Virginia',
 'City of Portsmouth, Virginia',
 'City of Harrisonburg, Virginia',
 'City of Waynesboro, Virginia',
 'City of Bristol, Virginia',
 'Saint Bernard Parish, Louisiana',
 'City of Petersburg, Virginia',
 ', Chukotskiy Avtonomnyy Okrug',
 'Saint Charles Parish, Louisiana',
 'City of Williamsburg, Virginia',
 'Saint Joseph County, Michigan',
 ', British Columbia',
 'City of Suffolk, Virginia',
 ', Ontario',
 'City of Radford, Virginia',
 ', San Juan',
 'City of Colonial Heights, Virginia',
 'City of Poquoson, Virginia',
 'Saint Clair County, Alabama',
 'Saint Landry Parish, Louisiana',
 'City of Franklin, Virginia',
 'City of Buena Vista, Virginia',
 'City of Bedford, Virginia',
 ', Mangilao',
 'City of Covington, Virginia',
 ', Baja California',
 'City of Martinsville, Virginia',
 ', Quebradillas',
 'City of Emporia, Virginia',
 ', Saint Thomas Island',
 'Saint Clair County, Missouri',
 'Saint Mary Parish, Louisiana',
 'Wade Hampton Census Area, Alaska',
 'Bronx, New York',
 'Shannon County, South Dakota',
 'Annette Island Reserve, Alaska',
 'City of Salem, Virginia',
 'City and Borough of Wrangell, Alaska',
 'City of Lexington, Virginia',
 'Saint Francis County, Arkansas',
 'Saint Martin Parish, Louisiana',
 'Saint James Parish, Louisiana',
 'La Salle Parish, Louisiana',
 'Sainte Genevieve County, Missouri',
 'City of Norton, Virginia',
 'Saint Helena Parish, Louisiana',
 'Wrangell-Petersburg Census Area, Alaska',
 'City of Galax, Virginia']
len(county_name_conflict)


83

In [296]:
df_county_heat['year_counts'] = [df_USA_recent['Year'].loc[df_USA_recent['rg_county']==county].value_counts() for county in df_county_heat['rg_county']]

In [297]:

df_county_heat.head()

Unnamed: 0,rg_county,count,County,2011,2012,2013,2014,year_counts
0,"Los Angeles County, California",1983,"Los Angeles County, California",0.0317,0.031658,0.031622,0.031543,Year 2012 127 2013 126 2011 96 2014 ...
1,"Maricopa County, Arizona",1388,"Maricopa County, Arizona",0.012438,0.012578,0.012715,0.012861,Year 2013 116 2012 111 2011 62 2014 ...
2,"King County, Washington",1368,"King County, Washington",0.006337,0.006409,0.006479,0.006551,Year 2012 103 2011 77 2013 69 2014 ...
3,"Cook County, Illinois",890,"Cook County, Illinois",0.016754,0.016695,0.016623,0.016509,Year 2012 82 2011 46 2013 44 2014 ...
4,"San Diego County, California",888,"San Diego County, California",0.010069,0.010115,0.010158,0.010207,Year 2013 65 2011 62 2012 55 2014 ...


44