Things to do:
* Date/Time
    * verify dates are real (what to do with dates that are strange, e.g., pre-1900?)
* split date and time (?)
* convert times to UTC
* duration
    * split number and unit(e.g., seconds, minutes
    * convert duration to milliseconds
* location
    * pull all non-location information out of location fields (i.e., city, state)
    * add country column
    * validate all 3 columns
* summary information
    * creates columns for nouns, adjectives, verbs (?)
    * extra colors (new column)
* factorize
    * colors
    * shapes
   
References:

https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a

https://www.analyticsvidhya.com/blog/2020/12/understanding-text-classification-in-nlp-with-movie-review-example-example/

https://blog.dataiku.com/text-classification-the-first-step-toward-nlp-mastery

https://thinkinfi.com/complete-guide-for-natural-language-processing-in-python/

https://towardsdatascience.com/nlp-in-python-vectorizing-a2b4fc1a339e

In [1]:
!pip install country_list
!pip install contractions

You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m


In [257]:
import pandas as pd
import numpy as np
import nltk
import re
import string

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.util import ngrams

from textblob import TextBlob

import contractions
import country_list
from geopy.geocoders import ArcGIS
from tzwhere import tzwhere
import requests
from datetime import datetime
import pytz

In [3]:
nltk.download('words')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [194]:
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(w) for w in text.split(' ')])

def expand_contractions(text):
    expanded_words = []    
    for word in text.split():
        expanded_words.append(contractions.fix(word))   

    return ' '.join(expanded_words)

def spell_check(text):
    orig = TextBlob(text)
    text = orig.correct()
    return str(text)

def remove_others(text):
    text = re.sub(r'\n', "", text)
    text = re.sub(r'-', " ", text)
    text = text.strip()
    text = re.sub(r' +', " ", text)
    text = re.sub(r'[\(\)\[\]\^\$\+\*\.\?\/!@#%&{}\'\",;:]', "", text)
    
    return text

def clean_text(text):
    stop = set(nltk.corpus.stopwords.words('english'))
    
    cleaned = text.lower()
    # Remove stop words so they are not spellchecked
    tokens = word_tokenize(cleaned)
    cleaned = ' '.join([w for w in tokens if not w in stop])
    
    cleaned = spell_check(cleaned)
    cleaned = remove_others(cleaned)
    cleaned = expand_contractions(cleaned)
    # Try to remove them again in case spellcheck "added" stop words
    tokens = word_tokenize(cleaned)
    cleaned = ' '.join([w for w in tokens if not w in stop])
    cleaned = lemmatize_text(cleaned)
    return cleaned

In [5]:
def load_clean_sightings_dataframe():
    file_name = "sightings.pkl"
    sightings = pd.read_pickle(file_name)

    sightings = sightings[sightings['Summary'].str.contains('MADAR')==False]
    
    # The Detail_Summary column needs to be cleaned for the cleaning function to work.
    sightings["Detail_Summary"] = sightings["Detail_Summary"].fillna("")
    sightings.loc[sightings["Detail_Summary"] == "Summary detail page not found.", "Detail_Summary"] = ""
    
    print(f"Sightings: {len(sightings)}")

    sightings_cleaned = sightings.copy()
    sightings_cleaned['Detail_Summary_nltk'] = sightings_cleaned['Detail_Summary'].apply(clean_text)
    
    print(f"Cleaned Sightings: {len(sightings_cleaned)}")
    sightings_cleaned.to_pickle(cleaned_file_name)

In [6]:
cleaned_file_name = "sightings_cleaned.pkl"

In [7]:
start_fresh = False

if start_fresh:
    load_clean_sightings_dataframe()

In the next block we save the data to a file. The initial cleaning is on +95k records. This takes some time. In the remaining cleaning, if one messes up, as I have done quite a few times, they can re-run pd.read_pickle() which will restore the dataframe to the "just processed" phase.

In [8]:
sightings_cleaned = pd.read_pickle(cleaned_file_name)

In [82]:
# Replace empty Detail_Summary with Summary which may be truncated
indList = sightings_cleaned[(sightings_cleaned.Detail_Summary.isna())|(sightings_cleaned.Detail_Summary=="")].index
for index in indList:
    sightings_cleaned.loc[index].Detail_Summary = sightings_cleaned.loc[index].Summary

In [9]:
len(sightings_cleaned)

95690

In [10]:
sightings_cleaned.head(15)

Unnamed: 0,Date_Time,City,State,Shape,Duration,Summary,Posted,Detail_Link,Detail_Summary,Detail_Summary_nltk
0,4/23/21 06:30,Blackshear,GA,Circle,9 minutes,Very strange ((NUFORC Note: Rocket launch f...,4/23/21,http://www.nuforc.org/webreports/162/S162815.html,\nVery strangeI have recorded a video of this ...,strangei recorded video sighting
1,4/23/21 06:00,Mechanicsville,VA,Circle,Seconds,Ball in the sky ((NUFORC Note: Rocket launc...,4/23/21,http://www.nuforc.org/webreports/162/S162814.html,\nBall in the skyObject appears as a white bal...,ball skyobject appears white ball vapor strewi...
2,4/23/21 06:00,Vero Beach,FL,Light,5 minutes,I was driving and saw something strange in the...,4/23/21,http://www.nuforc.org/webreports/162/S162822.html,\nI was driving and saw something strange in t...,driving saw something strange sky pulled car i...
3,4/23/21 05:59,St. Augustine,FL,Light,3 minutes,2 extremely bright lights appeared over east c...,4/23/21,http://www.nuforc.org/webreports/162/S162824.html,\n2 extremely bright lights appeared over east...,2 extremely bright light appeared east coast n...
4,4/23/21 05:58,Durham,NC,Cone,>5 minutes,A cone of light coming from the sky unlike any...,4/23/21,http://www.nuforc.org/webreports/162/S162819.html,\nA cone of light coming from the sky unlike a...,cone light coming sky unlike anything ever see...
5,4/23/21 05:55,I-16 south,GA,Sphere,10 minutes,Noticed a intense light that was covering a la...,4/23/21,http://www.nuforc.org/webreports/162/S162823.html,\nDriving on I-16 south and noticed a intense ...,driving i16 south noticed intense light coveri...
6,4/23/21 05:54,Parrish,FL,Light,5 minutes,Two bright lights one flashing with a descendi...,4/23/21,http://www.nuforc.org/webreports/162/S162820.html,\nTwo bright lights one flashing with a descen...,two bright light one flashing descending expan...
7,4/23/21 05:45,Champions Gate,FL,Light,~10-15 minutes,Im former military and have never seen aircraf...,4/23/21,http://www.nuforc.org/webreports/162/S162826.html,\nIm former military and have never seen aircr...,I former military never seen aircraft that.inc...
8,4/23/21 05:45,Belleview,FL,Diamond,15-20 minutes,((NUFORC Note: Rocket launch from Cape Canav...,4/23/21,http://www.nuforc.org/webreports/162/S162821.html,\n ((NUFORC Note: Rocket launch from Cape Can...,nuforc note : rocket launch cape canaveral . pd
9,4/23/21 02:40,Firestone,CO,Chevron,3-4 seconds,"I witnessed a chevron-shaped object, silent an...",4/23/21,http://www.nuforc.org/webreports/162/S162827.html,"\nI witnessed a chevron-shaped object, silent ...","witnessed chevronshaped object , silent seven ..."


In [11]:
len(sightings_cleaned)

95690

Do we want to remove the ones below with NUFORC notes? Perhaps someone could go through them to see what the notes are. Things like Starlink sightings should be removed. For now, the following block removes them otherwise nuforc and note dominate the wordcloud.

In [12]:
sightings_cleaned = sightings_cleaned[sightings_cleaned['Detail_Summary_nltk'].str.contains('nuforc note')==False]
sightings_cleaned = sightings_cleaned[sightings_cleaned['Detail_Summary_nltk'].str.contains('NUFORC note')==False]
sightings_cleaned = sightings_cleaned[sightings_cleaned['Detail_Summary_nltk'].str.contains('nuforc')==False]
sightings_cleaned = sightings_cleaned[sightings_cleaned['Detail_Summary_nltk'].str.contains('NUFORC')==False]
len(sightings_cleaned)

67841

In [13]:
sightings_cleaned.head(10)

Unnamed: 0,Date_Time,City,State,Shape,Duration,Summary,Posted,Detail_Link,Detail_Summary,Detail_Summary_nltk
0,4/23/21 06:30,Blackshear,GA,Circle,9 minutes,Very strange ((NUFORC Note: Rocket launch f...,4/23/21,http://www.nuforc.org/webreports/162/S162815.html,\nVery strangeI have recorded a video of this ...,strangei recorded video sighting
1,4/23/21 06:00,Mechanicsville,VA,Circle,Seconds,Ball in the sky ((NUFORC Note: Rocket launc...,4/23/21,http://www.nuforc.org/webreports/162/S162814.html,\nBall in the skyObject appears as a white bal...,ball skyobject appears white ball vapor strewi...
2,4/23/21 06:00,Vero Beach,FL,Light,5 minutes,I was driving and saw something strange in the...,4/23/21,http://www.nuforc.org/webreports/162/S162822.html,\nI was driving and saw something strange in t...,driving saw something strange sky pulled car i...
3,4/23/21 05:59,St. Augustine,FL,Light,3 minutes,2 extremely bright lights appeared over east c...,4/23/21,http://www.nuforc.org/webreports/162/S162824.html,\n2 extremely bright lights appeared over east...,2 extremely bright light appeared east coast n...
4,4/23/21 05:58,Durham,NC,Cone,>5 minutes,A cone of light coming from the sky unlike any...,4/23/21,http://www.nuforc.org/webreports/162/S162819.html,\nA cone of light coming from the sky unlike a...,cone light coming sky unlike anything ever see...
5,4/23/21 05:55,I-16 south,GA,Sphere,10 minutes,Noticed a intense light that was covering a la...,4/23/21,http://www.nuforc.org/webreports/162/S162823.html,\nDriving on I-16 south and noticed a intense ...,driving i16 south noticed intense light coveri...
6,4/23/21 05:54,Parrish,FL,Light,5 minutes,Two bright lights one flashing with a descendi...,4/23/21,http://www.nuforc.org/webreports/162/S162820.html,\nTwo bright lights one flashing with a descen...,two bright light one flashing descending expan...
7,4/23/21 05:45,Champions Gate,FL,Light,~10-15 minutes,Im former military and have never seen aircraf...,4/23/21,http://www.nuforc.org/webreports/162/S162826.html,\nIm former military and have never seen aircr...,I former military never seen aircraft that.inc...
9,4/23/21 02:40,Firestone,CO,Chevron,3-4 seconds,"I witnessed a chevron-shaped object, silent an...",4/23/21,http://www.nuforc.org/webreports/162/S162827.html,"\nI witnessed a chevron-shaped object, silent ...","witnessed chevronshaped object , silent seven ..."
10,4/22/21 22:23,New York City (Brooklyn),NY,Fireball,2 minutes,Saw a steady pulsating fireball above that mov...,4/23/21,http://www.nuforc.org/webreports/162/S162818.html,\nSaw a steady pulsating fireball above that m...,saw steady pulsating fireball moved slowly awa...


The next section is cleaning up the city, state, and country column. To begin, we take cities that have () in them. We split that. Some of the parentheses have country or state in them, however, many do not have anything useful. Once cleaned, that needs to be merged back in and the general task of cleaning up the remaining location data may go forward.

In [14]:
# Backup the City column in case the notes may be useful
sightings_cleaned["Notes"] = sightings_cleaned[sightings_cleaned['City'].str.contains('\(')].City

In [15]:
sightings_cleaned[sightings_cleaned['City'].str.contains('\(')].City

10               New York City (Brooklyn)
11                      Firozabad (India)
24               New York City (Brooklyn)
33                       Nanaimo (Canada)
95                Merseyside (UK/England)
                       ...               
97163                  Saskatoon (Canada)
97193                    Kelowna (Canada)
97232                  Kitchener (Canada)
97240       Little Rock (5 miles west of)
97249    Long Island (Westchester County)
Name: City, Length: 7009, dtype: object

In [16]:
# df is a temporary dataframe so that I can clean cities with a () in them. The following is all of that work.
# Eventually, this could be merged into the main dataframe, or this code could be applied to that dataframe
# when we are confident it works.
df = sightings_cleaned[sightings_cleaned['City'].str.contains('\(')].City.str.split("\(([^)]+)", expand= True)
try:
    df.columns = ["City", "Country", "EndParenth", "Empty1", "Empty2", "Empty3", "Empty4"]
except:
    df.columns = ["City", "Country", "EndParenth", "Empty1", "Empty2"]
df.drop(["EndParenth"], axis=1, inplace = True)
df["City"] = df["City"].str.strip()
df

for index, row in df.iterrows():
    df.loc[index, "State"] = sightings_cleaned.loc[index].State

In [17]:
df["ignore"] = False

In [18]:
countries = dict(country_list.countries_for_language('en'))
cities_file = "us_cities_states_counties.csv"
cities_df = pd.read_csv(cities_file, delimiter="|")
city_list = cities_df.City.unique().tolist()

cities_df["State short"].unique()

array(['NY', 'PR', 'VI', 'MA', 'RI', 'NH', 'ME', 'VT', 'CT', 'NJ', 'AE',
       'PA', 'DE', 'DC', 'VA', 'MD', 'WV', 'NC', 'SC', 'GA', 'FL', nan,
       'AL', 'TN', 'MS', 'KY', 'OH', 'IN', 'MI', 'IA', 'WI', 'MN', 'SD',
       'ND', 'MT', 'IL', 'MO', 'KS', 'NE', 'LA', 'AR', 'OK', 'TX', 'CO',
       'WY', 'ID', 'UT', 'AZ', 'NM', 'NV', 'CA', 'AP', 'HI', 'AS', 'GU',
       'PW', 'FM', 'MP', 'MH', 'OR', 'WA', 'AK'], dtype=object)

In [19]:
# This record is "NA" for the country. Have to fix that or the next few things will throw an error.
# Ask me how I know that.
df.loc[89338, "Country"] = "USA"

In [20]:
# Do we want these as England, Wales, etc.?
df.loc[df['Country'].str.contains('UK'), "Country"] = "United Kingdom"
df.loc[df['Country'].str.contains('Northern Ireland'), "Country"] = "United Kingdom"
df.loc[df['City'].str.contains('UK/England'), "Country"] = "United Kingdom"
df.loc[df['City'].str.contains('UK/England'), "City"] = ""
df.loc[df['Country'].str.contains('UK'), 'Country'].unique()

array([], dtype=object)

In [21]:
df.loc[(df['Country'].isin(countries.values())==False)&(df.Empty1.isnull()==False), "Country"] = df["Empty1"]
df.loc[df.Empty1 == "Canada", "Country"] = "Canada"

In [22]:
df.loc[df['Country'].str.contains('Brooklyn'), "Country"] = "United States"
df.loc[df['Country'].str.contains('Bronx'), "Country"] = "United States"
df.loc[df['Country'].str.contains('Brookline'), "Country"] = "United States"
df.loc[df['Country'].str.contains('Westchester County'), "Country"] = "United States"
df.loc[df['Country'].str.contains('Baja'), "Country"] = "Mexico"
df.loc[df['Country'].str.contains('Manhattan'), "Country"] = "United States"
df.loc[df['Country'].str.contains('Bronx'), "Country"] = "United States"
df.loc[df['Country'].str.contains('Queens'), "Country"] = "United States"
df.loc[df['Country'].str.contains('Watts'), "Country"] = "United States"
df.loc[df['City'].str.contains('Warsaw/Clinton'), "Country"] = "United States"
df.loc[df['Country'].str.contains('USA'), "Country"] = "United States"
df.loc[df['Country'].str.contains('Calgary'), "Country"] = "Canada"
df.loc[df['Country'].str.contains('Wilhelmsburg'), "Country"] = "Germany"
df.loc[df['Country'].str.contains('German'), "Country"] = "Germany"
df.loc[df['Country'].str.contains('Czech Republic'), "Country"] = "Czechia"
df.loc[df['Country'].str.contains('Punjab'), "Country"] = "India"
df.loc[df['Country'].str.contains('West Germany'), "Country"] = "Germany"
df.loc[df['Country'].str.contains('Brasil'), "Country"] = "Brazil"
df.loc[df['Country'].str.contains('Macedonia'), "Country"] = "North Macedonia"
df.loc[df['Country'].str.contains('México'), "Country"] = "Mexico"
df.loc[df['Country'].str.contains('Western Australia'), "Country"] = "Australia"
df.loc[97083, "City"] = "Boston"
df.loc[df['Country'].str.contains('Bosnia'), 'Country'] = "Bosnia & Herzegovina"
df.loc[(df['Country'].str.contains("Australia")), "Country"] = "Australia"
df.loc[(df['Country'].str.contains("Australi")), "Country"] = "Australia"
df.loc[(df['Country'].str.contains("Rep. of Ireland")), "Country"] = "Ireland"
df.loc[(df['Country'].str.contains("Republic of Ireland")), "Country"] = "Ireland"
df.loc[df['Country'].str.contains('U.A.E.'), "Country"] = "United Arab Emirates"
df.loc[df['Country'].str.contains('Oman/UAE'), "Country"] = "United Arab Emirates"
df.loc[df['Country'].str.contains('UAE'), "Country"] = "United Arab Emirates"
df.loc[df['Country'].str.contains('UAR'), "Country"] = "United Arab Emirates"
df.loc[df['Country'].str.contains('U.A.R.'), "Country"] = "United Arab Emirates"
df.loc[df['Country'].str.contains('Dubai'), "Country"] = "United Arab Emirates"
df.loc[(df['City']=="Leduc"), "Country"] = "Canada"
df.loc[df['Country'].str.contains('Surinam'), "Country"] = "Suriname"
df.loc[df['Country'].str.contains('St. Helena Island'), "Country"] = "St. Helena"
df.loc[df['Country'].str.contains('El Poblado'), "City"] = "Medellin"
df.loc[df['Country'].str.contains('El Poblado'), "Country"] = "Colombia"
df.loc[df['Country'].str.contains('Tobago'), "Country"] = "Trinidad & Tobago"
df.loc[df['Country'].str.contains('Trinidad'), "Country"] = "Trinidad & Tobago"
df.loc[df['Country'].str.contains('Viet nam'), "Country"] = "Vietnam"
df.loc[df['Country'].str.contains('Viet Nam'), "Country"] = "Vietnam"
df.loc[df['Country'].str.contains('Hidalgo'), "State"] = "Hidalgo"
df.loc[df['Country'].str.contains('Hidalgo'), "Country"] = "Mexico"
df.loc[df['Country'].str.contains('Netherlands'), "Country"] = "Netherlands"
df.loc[df['City'].str.contains("U. S."), "Country"] = "United States"
df.loc[(df['City'].str.contains("U. S.")), "State"] = ""
df.loc[(df['City'].str.contains("U. S.")), "City"] = ""
df.loc[(df['Country'].str.contains("Nauru")), "Country"] = "Nauru"
df.loc[(df['Country'].str.contains("Menorca")), "State"] = "Menorca"
df.loc[(df['Country'].str.contains("Menorca")), "Country"] = "Spain"
df.loc[(df['Country'].str.contains("Riu Palace Hotel")), "City"] = "Playa Matapalo"
df.loc[(df['Country'].str.contains("Riu Palace Hotel")), "Country"] = "Costa Rica"
df.loc[(df['City'].str.contains("Australia")), "Country"] = "Australia"
df.loc[(df['City'].str.contains("Australia")), "City"] = ""
df.loc[(df['Country'].str.contains("Cyprus")), "Country"] = "Cyprus"
df.loc[(df['Country'].str.contains("Oman")), "Country"] = "Oman"
df.loc[(df['Country'].str.contains("Gibralter")), "Country"] = "Gibralter"
df.loc[(df['City'].str.contains("Farallon")), "Country"] = "Panama"
df.loc[(df['Country'].str.contains("Guatamala")), "Country"] = "Guatemala"
df.loc[(df['Country'].str.contains("Cayman")), "Country"] = "Cayman Islands"
df.loc[(df['City'].str.contains("Cayman")), "Country"] = "Cayman Islands"
df.loc[(df['City'].str.contains("Cayman")), "City"] = ""
df.loc[(df['Country'].str.contains("Virgin Islands")), "Country"] = "U.S. Virgin Islands"
df.loc[(df['City'].str.contains("Maldives")), "Country"] = "Maldives"
df.loc[(df['City'].str.contains("Maldives")), "City"] = "Meerufenfushi island"
df.loc[(df['Country'].str.contains("Papua/New Guinea")), "Country"] = "Papua New Guinea"
df.loc[(df['Country'].str.contains("Yorkshire")), "State"] = "Yorkshire"
df.loc[(df['Country'].str.contains("Yorkshire")), "City"] = ""
df.loc[(df['Country'].str.contains("Yorkshire")), "Country"] = "United Kingdom"
df.loc[(df['Country'].str.contains("Kazakstan")), "Country"] = "Kazakhstan"
df.loc[(df['Country'].str.contains("Taiwan")), "Country"] = "Taiwan"
df.loc[(df['Country'].str.contains("Hampshire")), "State"] = "Hampshire"
df.loc[(df['Country'].str.contains("Hampshire")), "Country"] = "United Kingdom"
df.loc[(df['City'].str.contains("Atlantic Ocean")), "Country"] = "Atlantic Ocean"
df.loc[(df['City'].str.contains("Atlantic Ocean")), "City"] = ""
df.loc[(df['City'].str.contains("Pacific Ocean")), "Country"] = "Pacific Ocean"
df.loc[(df['City'].str.contains("Pacific Ocean")), "City"] = ""
df.loc[(df['City'].str.contains("Indian Ocean")), "Country"] = "Indian Ocean"
df.loc[(df['City'].str.contains("Indian Ocean")), "City"] = ""
df.loc[(df['City'].str.contains("SE Arizona")), "State"] = "AZ"
df.loc[(df['City'].str.contains("SE Arizona")), "Country"] = "United States"
df.loc[(df['City'].str.contains("SE Arizona")), "City"] = ""
df.loc[(df['City'].str.contains("Dominican Republic")), "Country"] = "Dominican Republic"
df.loc[(df['City'].str.contains("Dominican Republic")), "City"] = ""
df.loc[(df['City'].str.contains("Rockhampton")), "Country"] = "Australia"
df.loc[(df['Country'].str.contains('Camp "New Jersey"')), "Country"] = "Kuwait"
df.loc[(df['Country'].str.contains('Camp "New Jersey"')), "City"] = ""
df.loc[(df['City'].str.contains("Shanghai")), "City"] = "Shanghai"
df.loc[(df['City'].str.contains("Shanghai")), "Country"] = "China"
df.loc[(df['City'].str.contains("Bloemfontein")), "Country"] = "South Africa"
df.loc[(df['City'].str.contains("France")), "Country"] = "France"
df.loc[(df['Country'].str.contains("North part of France")), "Country"] = "France"
df.loc[(df['City'].str.contains("France")), "City"] = ""
df.loc[(df['City'].str.contains("Urmston")), "State"] = "Manchester"
df.loc[(df['City'].str.contains("Urmston")), "Country"] = "United Kingdom"
df.loc[(df['City'].str.contains("Sweden")), "Country"] = "Sweden"
df.loc[(df['City'].str.contains("Sweden")), "City"] = ""
df.loc[(df['City'].str.contains("Songtan/Pyongtaek")), "Country"] = "South Korea"
df.loc[(df['City'].str.contains("Pyongtaek")), "City"] = "Pyongtaek"
df.loc[(df['City'].str.contains("Seoul")), "Country"] = "South Korea"
df.loc[(df['City'].str.contains("Afghanistan")), "Country"] = "Afghanistan"
df.loc[(df['City'].str.contains("Afghanistan")), "City"] = ""
df.loc[(df['Country'].str.contains("Curacao")), "City"] = "Curacao"
df.loc[(df['Country'].str.contains("Curacao")), "Country"] = "Caribbean Netherlands"
df.loc[(df['City'].str.contains("Praia")), "Country"] = "Cape Verde"
df.loc[(df['City'].str.contains("Faliraki")), "Country"] = "Greece"
df.loc[(df['City'].str.contains("Argentina")), "Country"] = "Argentina"
df.loc[(df['City'].str.contains("Argentina")), "City"] = ""
df.loc[(df['Country'].str.contains("Ochorios")), "City"] = "Ochorios"
df.loc[(df['Country'].str.contains("Ochorios")), "Country"] = "Jamaica"
df.loc[(df['Country'].str.contains("Myanmar")), "Country"] = "Myanmar (Burma)"
df.loc[(df['Country'].str.contains("Phoenix to NYC")), "Country"] = "United States"
df.loc[(df['Country'].str.contains("Turks & Caicos")), "Country"] = "Turks & Caicos Islands"
df.loc[(df['Country'].str.contains("Italy/Greece")), "Country"] = "Italy"
df.loc[(df['City'].str.contains("Hermosillo/Obregon City")), "Country"] = "Mexico"
df.loc[(df['City'].str.contains("Hermosillo/Obregon City")), "City"] = "Ciudad Obregón"
df.loc[(df['City'].str.contains("Manchester area")), "Country"] = "United Kingdom"
df.loc[(df['City'].str.contains("Manchester area")), "City"] = "Manchester"
df.loc[(df['Country'].str.contains("U. S. and Canada")), "Country"] = "United States, Canada"
df.loc[(df['City'].str.contains("Vancouver Bc")), "State"] = "British Columbia"
df.loc[(df['City'].str.contains("Vancouver Bc")), "City"] = "Vancouver"
df.loc[(df['City'].str.contains("Barksdale AFB")), "City"] = "Bossier City"


In [23]:
df[(df['Country'].isin(countries.values())==False)&(df['City'].isin(cities_df.City))].Empty1.unique()

array(['in-flight', None, '"Abiquiu"', 'Northern Ireland', 'UK/England',
       'UK/Scotland', 'Victoria', 'pilot report', 'Riverside',
       'Republic of South Africa', 'UK/Wales', 'near', 'in flight',
       'Boston'], dtype=object)

In [24]:
df[(df['Country'].isin(countries.values())==False)&(df['City'].isin(cities_df.City))].Empty2.unique()

array([')', None, ''], dtype=object)

In [25]:
df[(df['Country'].isin(countries.values())==False)&(df['City'].isin(cities_df.City))].Empty3.unique()

array([None, 'Australia'], dtype=object)

In [26]:
df[(df['Country'].isin(countries.values())==False)&(df['City'].isin(cities_df.City))].Empty4.unique()

array([None, ')'], dtype=object)

In [27]:
# Country not in Country list
# Cities not in the US city list
# cities_df[cities_df.City == "Canada"] ----> Canada is a city in Kentucky
# df[(df['Country'].isin(countries.values())==False)]

# Cities in the US city list
#df[(df['Country'].isin(countries.values())==False)&(df['City'].isin(cities_df.City)), "Country"] = "United States"
df.loc[df['Empty1'].isnull(), "Empty1"] = " "
df.loc[df['Empty2'].isnull(), "Empty2"] = " "
df.loc[df['Empty3'].isnull(), "Empty3"] = " "
df.loc[df['Empty4'].isnull(), "Empty4"] = " "
df.loc[df['Empty3'].str.contains('Australia'), "Country"] = "Australia"
df.loc[df['Empty1'].str.contains('Northern Ireland'), "Country"] = "United Kingdom"
df.loc[df['Empty1'].str.contains('UK/England'), "Country"] = "United Kingdom"
df.loc[df['Empty1'].str.contains('UK/Wales'), "Country"] = "United Kingdom"
df.loc[df['Empty1'].str.contains('UK/Scotland'), "Country"] = "United Kingdom"
df.loc[df['Empty1'].str.contains('Republic of South Africa'), "Country"] = "South Africa"


In [28]:
df.drop(["Empty1", "Empty2", "Empty3", "Empty4"], axis=1, inplace=True)

In [29]:
df.loc[(df['Country'].isin(countries.values())==False)&(df['City'].isin(cities_df.City)), "Country"] = "United States"
df.loc[(df['Country'].isin(countries.values())==False)&(df['State'].isin(cities_df["State short"])==True)]

# Country not in the list of countries
# State is the list of US cities State field
# Changing the Country to United States
df.loc[(df['Country'].isin(countries.values())==False)&(df['State'].isin(cities_df["State short"])==True), "Country"] = "United States"

In [30]:
df.loc[(df['Country'].isin(countries.values())==False)&(df['City'].isin(cities_df.City)==False)&df.Country.str.contains(";")]

df.loc[46878, "Country"] = "Lesotho"
df.loc[23766, "Country"] = "Japan"
df.loc[23766, "City"] = "Hiroshima"
df.loc[27598, "Country"] = "India"
df.loc[58864, "Country"] = "Russia"
df.loc[71552, "Country"] = "Tunisia"
df.loc[71552, "City"] = ""
df.loc[84105, "Country"] = "Thailand"
df.loc[96267, "Country"] = "Vietnam"
df.loc[96267, "City"] = "Vietnam"
df.loc[94772, "Country"] = "Australia"
df.loc[38099, "Country"] = "Israel"
df.loc[85560, "Country"] = "U.S. Virgin Islands"
df.loc[7900, "Country"] = "U.S. Virgin Islands"
df.loc[94486, "City"] = ""
df.loc[94486, "Country"] = "France"
df.loc[79771, "City"] = ""
df.loc[79771, "Country"] = ""
df.loc[92408, "Country"] = "Laos"
df.loc[92408, "City"] = "Luang Phabang"
df.loc[7219, "Country"] = "Costa Rica"
df.loc[12183, "Country"] = "Canada"
df.loc[12183, "State"] = "British Columbia"
df.loc[7075, "Country"] = "Canada"
df.loc[7075, "State"] = "Quebec"
df.loc[848, "Country"] = "United Arab Emirates"
df.loc[7924, "Country"] = "Panama"
df.loc[8131, "Country"] = "Canada"
df.loc[80866, "Country"] = ""
df.loc[80866, "City"] = ""
df.loc[80866, "State"] = ""


In [31]:
df.loc[(df['City'].str.contains("Ocean"))&(df['City'].str.contains("Ocean City")==False), "State"] = ""
df.loc[(df['City'].str.contains("Ocean"))&(df['City'].str.contains("Ocean City")==False), "Country"] = ""
df.loc[(df['City'].str.contains("Ocean"))&(df['City'].str.contains("Ocean City")==False)]

Unnamed: 0,City,Country,State,ignore
30818,Oceanside,,,False
40870,Oceanside,,,False


In [32]:
keywords = ["in flight", "inflight", "at sea", "between", "unknown", "oil rig", "ship", "unspecified", "ISS", "in most"]
df.loc[(df['Country'].isin(countries.values())==False)&(df['Country'].str.contains("|".join(keywords))), "ignore"] = True

In [33]:
df.loc[(df['Country'].isin(countries.values())==False), "ignore"]= True

In [34]:
for index, row in df[df['City'].isin(cities_df["City alias"])].iterrows():
    df.loc[index, "city alias"] = df.loc[index, "City"]
    df.loc[index, "City"] = cities_df.loc[cities_df['City alias'] == row.City, "City"].tolist()[0]

In [35]:
df_merge = df[df.ignore == False]

In [36]:
sightings_cleaned = sightings_cleaned.join(df_merge[["City", "State", "Country"]], lsuffix='_left', rsuffix='_right')
sightings_cleaned.loc[sightings_cleaned["City_right"].isnull() == False, "City_left"] = sightings_cleaned.loc[sightings_cleaned["City_right"].isnull() == False, "City_right"]
sightings_cleaned.loc[sightings_cleaned["State_right"].isnull() == False, "State_left"] = sightings_cleaned.loc[sightings_cleaned["State_right"].isnull() == False, "State_right"]
sightings_cleaned["City"] = sightings_cleaned["City_left"]
sightings_cleaned["State"] = sightings_cleaned["State_left"]
sightings_cleaned.drop(["City_right", "City_left", "State_right", "State_left"], axis=1, inplace=True)

In [37]:
len(sightings_cleaned)

67841

In [38]:
# Records that have a country
sightings_cleaned[sightings_cleaned.Country.isnull()==False]

Unnamed: 0,Date_Time,Shape,Duration,Summary,Posted,Detail_Link,Detail_Summary,Detail_Summary_nltk,Notes,Country,City,State
10,4/22/21 22:23,Fireball,2 minutes,Saw a steady pulsating fireball above that mov...,4/23/21,http://www.nuforc.org/webreports/162/S162818.html,\nSaw a steady pulsating fireball above that m...,saw steady pulsating fireball moved slowly awa...,New York City (Brooklyn),United States,New York,NY
11,4/22/21 21:00,Changing,6 minutes,Boomarang like shaped with five lights made a ...,4/23/21,http://www.nuforc.org/webreports/162/S162804.html,\nBoomarang like shaped with five lights made ...,boomarang like shaped five light made speed bi...,Firozabad (India),India,Firozabad,
24,4/20/21 20:30,Flash,3:39 minutes,"At approximately 8:30pm eastern time, I notice...",4/23/21,http://www.nuforc.org/webreports/162/S162797.html,"\nAt approximately 8:30pm eastern time, I noti...","approximately 8:30pm eastern time , noticed 1 ...",New York City (Brooklyn),United States,New York,NY
33,4/18/21 21:08,Light,15 seconds,Distant Satellite -Erratic Flight Path,4/23/21,http://www.nuforc.org/webreports/162/S162779.html,\nDistant Satellite -Erratic Flight PathAbout ...,distant satellite erratic flight pathabout hou...,Nanaimo (Canada),Canada,Nanaimo,BC
95,4/10/21 19:00,Oval,3 seconds,"Mixed silver oval shape, sunlit reflexion cent...",4/23/21,http://www.nuforc.org/webreports/162/S162596.html,"\nMixed silver oval shape, sunlit reflexion ce...","mixed silver oval shape , sunlit reflexion cen...",Merseyside (UK/England),United Kingdom,Merseyside,
...,...,...,...,...,...,...,...,...,...,...,...,...
97163,05# 00:45,Light,2 minutes,We seemed to know to look up and and for me it...,5/22/15,http://www.nuforc.org/webreports/119/S119038.html,\nWe seemed to know to look up and and for me ...,seemed know look knew watched.my daughter leav...,Saskatoon (Canada),Canada,Saskatoon,SK
97193,,,,"They were close to my home, hope they come soo...",7/3/13,http://www.nuforc.org/webreports/098/S98603.html,"\nthey were close to my home, hope they come s...","close home , hope come soon .... I waiting ......",Kelowna (Canada),Canada,Kelowna,BC
97232,,Triangle,,Let w flying triangular craft..comp silence..f...,7/9/20,http://www.nuforc.org/webreports/157/S157335.html,\nLet w flying triangular craft..comp silence....,let w flying triangular craft .. comp silence ...,Kitchener (Canada),Canada,Kitchener,ON
97240,,Unknown,25 minutes,A bright orange light hovering to the west of ...,1/31/20,http://www.nuforc.org/webreports/152/S152751.html,\nA bright orange light hovering to the west o...,bright orange light hovering west property hig...,Little Rock (5 miles west of),United States,Little Rock,AR


In [39]:
# Record without a country but have a state that is in the US state list
sightings_cleaned.loc[(sightings_cleaned.Country.isnull())&(sightings_cleaned.State.isin(cities_df["State short"].unique())), "Country"] = "United States"

In [40]:
# Record without a country but have a state that is NOT in the US state list
sightings_cleaned.loc[(sightings_cleaned.Country.isnull())&(sightings_cleaned.State.isin(cities_df["State short"].unique()==False))]

Unnamed: 0,Date_Time,Shape,Duration,Summary,Posted,Detail_Link,Detail_Summary,Detail_Summary_nltk,Notes,Country,City,State


In [41]:
sightings_cleaned[(sightings_cleaned['City'].isin(cities_df["City alias"]))&(sightings_cleaned["Country"]=="United States")]

Unnamed: 0,Date_Time,Shape,Duration,Summary,Posted,Detail_Link,Detail_Summary,Detail_Summary_nltk,Notes,Country,City,State
0,4/23/21 06:30,Circle,9 minutes,Very strange ((NUFORC Note: Rocket launch f...,4/23/21,http://www.nuforc.org/webreports/162/S162815.html,\nVery strangeI have recorded a video of this ...,strangei recorded video sighting,,United States,Blackshear,GA
1,4/23/21 06:00,Circle,Seconds,Ball in the sky ((NUFORC Note: Rocket launc...,4/23/21,http://www.nuforc.org/webreports/162/S162814.html,\nBall in the skyObject appears as a white bal...,ball skyobject appears white ball vapor strewi...,,United States,Mechanicsville,VA
2,4/23/21 06:00,Light,5 minutes,I was driving and saw something strange in the...,4/23/21,http://www.nuforc.org/webreports/162/S162822.html,\nI was driving and saw something strange in t...,driving saw something strange sky pulled car i...,,United States,Vero Beach,FL
4,4/23/21 05:58,Cone,>5 minutes,A cone of light coming from the sky unlike any...,4/23/21,http://www.nuforc.org/webreports/162/S162819.html,\nA cone of light coming from the sky unlike a...,cone light coming sky unlike anything ever see...,,United States,Durham,NC
6,4/23/21 05:54,Light,5 minutes,Two bright lights one flashing with a descendi...,4/23/21,http://www.nuforc.org/webreports/162/S162820.html,\nTwo bright lights one flashing with a descen...,two bright light one flashing descending expan...,,United States,Parrish,FL
...,...,...,...,...,...,...,...,...,...,...,...,...
97243,,Disk,2 seconds,UFO disappears over cirrus formation.,1/11/19,http://www.nuforc.org/webreports/144/S144329.html,\nUFO disappears over cirrus formation.This mo...,ufo disappears cirrus formation.this morning a...,,United States,Littleton,CO
97244,,Disk,1 minute,Spinning craft ( counter clockwise) I have video,12/23/20,http://www.nuforc.org/webreports/160/S160854.html,\nSpinning craft ( counter clockwise) I have v...,spinning craft counter clockwise videoi short ...,,United States,Carson City,NV
97247,,Sphere,,A hovering sphere shaped object that appeared ...,4/8/19,http://www.nuforc.org/webreports/145/S145475.html,\nA hovering sphere shaped object that appeare...,hovering sphere shaped object appeared spinnin...,,United States,Gettysburg,PA
97248,,Light,5 minutes,Blinking lights weaving between trees in forest.,4/8/19,http://www.nuforc.org/webreports/145/S145479.html,\nBlinking lights weaving between trees in for...,blinking light weaving tree forest.on back dec...,,United States,Ricetown,KY


In [42]:
sightings_cleaned.loc[(sightings_cleaned.City=="St. Augustine")&(sightings_cleaned.State=="FL"), "City"] = "Saint Augustine"

In [43]:
sightings_cleaned.loc[28, ["State", "Country"]] = ["PA", "United States"]

In [44]:
for index, row in sightings_cleaned[(sightings_cleaned['City'].isin(cities_df["City alias"]))&(sightings_cleaned["Country"]=="United States")].iterrows():
    sightings_cleaned.loc[index, "City"] = cities_df.loc[cities_df['City alias'] == row.City, "City"].tolist()[0]

In [45]:
sightings_cleaned[(sightings_cleaned.Country.isnull())&(sightings_cleaned.Country.str.title().isin(countries.values()))]

Unnamed: 0,Date_Time,Shape,Duration,Summary,Posted,Detail_Link,Detail_Summary,Detail_Summary_nltk,Notes,Country,City,State


In [46]:
sightings_cleaned.loc[(sightings_cleaned.Country.isnull())&(sightings_cleaned.City.str.title().isin(countries.values())), "Country"] = sightings_cleaned.loc[(sightings_cleaned.Country.isnull())&(sightings_cleaned.City.str.title().isin(countries.values())), "City"]
sightings_cleaned.loc[(sightings_cleaned.Country.isnull())&(sightings_cleaned.City.str.title().isin(countries.values())), "City"] = ""

In [90]:
sightings_cleaned.loc[(sightings_cleaned.Country.isnull())&(sightings_cleaned.State.isnull()==False)&(sightings_cleaned.State!="")]

Unnamed: 0,Date_Time,Shape,Duration,Summary,Posted,Detail_Link,Detail_Summary,Detail_Summary_nltk,Notes,Country,City,State,Coord,Location


In [48]:
sightings_cleaned.loc[90626, "Country"] = "United Kingdom"

In [49]:
sightings_cleaned.loc[(sightings_cleaned.Country.isnull())&
                      (sightings_cleaned.State.isnull()==False)&
                      (sightings_cleaned.State!=""), "Country"] = "Canada"

In [50]:
sightings_cleaned_other = sightings_cleaned[(sightings_cleaned.Country.isnull())|
                                         (sightings_cleaned.Country=="")|
                                         (sightings_cleaned.City.isnull())|
                                         (sightings_cleaned.City=="")].copy()
len(sightings_cleaned_other)
sightings_cleaned_other.to_pickle("sightings_cleaned_other.pkl")

In [51]:
len(sightings_cleaned)

67841

In [52]:
sightings_cleaned_bkup = sightings_cleaned.copy()
sightings_cleaned = sightings_cleaned[(sightings_cleaned.Country.isnull()==False)&
                                         (sightings_cleaned.Country!="")&
                                         (sightings_cleaned.City.isnull()==False)&
                                         (sightings_cleaned.City!="")].copy()
len(sightings_cleaned)

67305

In [55]:
sightings_cleaned["Location"] = sightings_cleaned.City + ", " + sightings_cleaned.State + " " + sightings_cleaned.Country

In [136]:
sightings_cleaned.to_pickle("sightings_cleaned.pkl")

In [61]:
len(sightings_cleaned["Location"].unique())

18190

Next, we need the Latitude and Longitude. Running through unique locations cuts the calls to the API from +90k to 18,190. This took runtime from 14 hours to less than 4.

In [133]:
nom=ArcGIS()
for l in sightings_cleaned["Location"].unique():
    coord = nom.geocode(l)
    sightings_cleaned.loc[sightings_cleaned.Location==l, 'Lat'] = coord.latitude
    sightings_cleaned.loc[sightings_cleaned.Location==l, 'Lon'] = coord.longitude

GeocoderUnavailable: HTTPSConnectionPool(host='geocode.arcgis.com', port=443): Max retries exceeded with url: /arcgis/rest/services/World/GeocodeServer/findAddressCandidates?singleLine=Mc+cains+valley%2C+CA+United+States&f=json&maxLocations=1 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='geocode.arcgis.com', port=443): Read timed out. (read timeout=1)",))

In [235]:
len(sightings_cleaned[sightings_cleaned.Lat.isnull()])

0

In [236]:
sightings_cleaned[sightings_cleaned.Lat.isnull()]

Unnamed: 0,Date_Time,Shape,Duration,Summary,Posted,Detail_Link,Detail_Summary,Detail_Summary_nltk,Notes,Country,City,State,Location,Lat,Lon


When the above loop errored, the next loop was created to process the remaining 489 records. Because of this set's smaller size, they were processed individually rather than grouped by "Location".

In [234]:
for index, row in sightings_cleaned[sightings_cleaned.Lat.isnull()].iterrows():
    try:
        l = row.Location
        coord = nom.geocode(l)
        sightings_cleaned.loc[sightings_cleaned.Location==l, 'Lat'] = coord.latitude
        sightings_cleaned.loc[sightings_cleaned.Location==l, 'Lon'] = coord.longitude
        print("Updated")
    except:
        print("Skipped")
        continue

Updated
Updated


In [138]:
sightings_cleaned.to_pickle("sightings_cleaned.pkl")

Only keep the United States at this point.

In [238]:
sightings_cleaned = sightings_cleaned[sightings_cleaned.Country=="United States"]

In [320]:
# sightings_cleaned = pd.read_pickle("sightings_cleaned_us.pkl")
sightings_cleaned.to_pickle("sightings_cleaned_us.pkl")

Final task for cleaning is to standardize the Date/Time to UTC.

In [270]:
sightings_cleaned['Date_Time'] = pd.to_datetime(sightings_cleaned['Date_Time'], format = '%m/%d/%y %H:%M', errors='coerce')

In [213]:
tzwhere = tzwhere.tzwhere()

In [279]:
sightings_cleaned.head(10)

Unnamed: 0,Date_Time,Shape,Duration,Summary,Posted,Detail_Link,Detail_Summary,Detail_Summary_nltk,Notes,Country,City,State,Location,Lat,Lon,Detail_Summary_nltk_spcheck
0,2021-04-23 06:30:00,Circle,9 minutes,Very strange ((NUFORC Note: Rocket launch f...,4/23/21,http://www.nuforc.org/webreports/162/S162815.html,\nVery strangeI have recorded a video of this ...,strangei recorded video sighting,,United States,Blackshear,GA,"Blackshear, GA United States",31.30443,-82.24058,strange recorded video fighting
1,2021-04-23 06:00:00,Circle,Seconds,Ball in the sky ((NUFORC Note: Rocket launc...,4/23/21,http://www.nuforc.org/webreports/162/S162814.html,\nBall in the skyObject appears as a white bal...,ball skyobject appears white ball vapor strewi...,,United States,Thompson,VA,"Thompson, VA United States",37.78198,-79.974385,ball skyobject appears white ball vapor screwi...
2,2021-04-23 06:00:00,Light,5 minutes,I was driving and saw something strange in the...,4/23/21,http://www.nuforc.org/webreports/162/S162822.html,\nI was driving and saw something strange in t...,driving saw something strange sky pulled car i...,,United States,Vero Beach,FL,"Vero Beach, FL United States",27.63885,-80.39396,driving saw something strange sky pulled car i...
3,2021-04-23 05:59:00,Light,3 minutes,2 extremely bright lights appeared over east c...,4/23/21,http://www.nuforc.org/webreports/162/S162824.html,\n2 extremely bright lights appeared over east...,2 extremely bright light appeared east coast n...,,United States,Saint Augustine,FL,"Saint Augustine, FL United States",29.89469,-81.31452,2 extremely bright light appeared east coast n...
4,2021-04-23 05:58:00,Cone,>5 minutes,A cone of light coming from the sky unlike any...,4/23/21,http://www.nuforc.org/webreports/162/S162819.html,\nA cone of light coming from the sky unlike a...,cone light coming sky unlike anything ever see...,,United States,Durham,NC,"Durham, NC United States",35.99542,-78.89644,cone light coming sky unlike anything ever see...
5,2021-04-23 05:55:00,Sphere,10 minutes,Noticed a intense light that was covering a la...,4/23/21,http://www.nuforc.org/webreports/162/S162823.html,\nDriving on I-16 south and noticed a intense ...,driving i16 south noticed intense light coveri...,,United States,I-16 south,GA,"I-16 south, GA United States",32.071949,-81.223154,driving 16 south noticed intense light coverin...
6,2021-04-23 05:54:00,Light,5 minutes,Two bright lights one flashing with a descendi...,4/23/21,http://www.nuforc.org/webreports/162/S162820.html,\nTwo bright lights one flashing with a descen...,two bright light one flashing descending expan...,,United States,Parrish,FL,"Parrish, FL United States",27.57762,-82.42546,two bright light one flashing descending expan...
7,2021-04-23 05:45:00,Light,~10-15 minutes,Im former military and have never seen aircraf...,4/23/21,http://www.nuforc.org/webreports/162/S162826.html,\nIm former military and have never seen aircr...,I former military never seen aircraft that.inc...,,United States,Davenport,FL,"Davenport, FL United States",28.15799,-81.6034,former military never seen aircraft thatincide...
9,2021-04-23 02:40:00,Chevron,3-4 seconds,"I witnessed a chevron-shaped object, silent an...",4/23/21,http://www.nuforc.org/webreports/162/S162827.html,"\nI witnessed a chevron-shaped object, silent ...","witnessed chevronshaped object , silent seven ...",,United States,Longmont,CO,"Longmont, CO United States",40.16394,-105.10022,witnessed charon shaped object silent seven li...
10,2021-04-22 22:23:00,Fireball,2 minutes,Saw a steady pulsating fireball above that mov...,4/23/21,http://www.nuforc.org/webreports/162/S162818.html,\nSaw a steady pulsating fireball above that m...,saw steady pulsating fireball moved slowly awa...,New York City (Brooklyn),United States,New York,NY,"New York, NY United States",40.71455,-74.00714,saw steady pulsating fireball moved slowly awa...


In [245]:
sightings_cleaned['timezone_str']  = sightings_cleaned.apply(lambda x: tzwhere.tzNameAt(x.Lat, x.Lon), axis = 1)

In [259]:
Time_UTC = []
for t,dt in zip(sightings_cleaned.timezone_str, sightings_cleaned.Date_Time): 
    try:
        local = pytz.timezone(t)
        naive = datetime.strptime(str(dt), "%Y-%m-%d %H:%M:%S")
        local_dt = local.localize(naive, is_dst=None)
        utc_dt = local_dt.astimezone(pytz.utc)
        Time_UTC.append(utc_dt)
    except:
        Time_UTC.append(None)

In [260]:
sightings_cleaned['Time_UTC'] = Time_UTC
sightings_cleaned['Time_UTC']

0       2021-04-23 10:30:00+00:00
1       2021-04-23 10:00:00+00:00
2       2021-04-23 10:00:00+00:00
3       2021-04-23 09:59:00+00:00
4       2021-04-23 09:58:00+00:00
                   ...           
97243                         NaT
97244                         NaT
97247                         NaT
97248                         NaT
97249                         NaT
Name: Time_UTC, Length: 61647, dtype: datetime64[ns, UTC]

In [280]:
#sightings_cleaned[sightings_cleaned['Time_UTC'].isnull()]
sightings_cleaned.loc[99]

Date_Time                                                                    NaT
Shape                                                                   Fireball
Duration                                                              10 seconds
Summary                        I happened to look up in the sky over Rocky Bu...
Posted                                                                   4/23/21
Detail_Link                    http://www.nuforc.org/webreports/162/S162725.html
Detail_Summary                 \nI happened to look up in the sky over Rocky ...
Detail_Summary_nltk            happened look sky rocky butte claim like 10 se...
Notes                                                                        NaN
Country                                                            United States
City                                                                    Portland
State                                                                         OR
Location                    

In [248]:
#correcting 1900s to 2000s issue with date column
def fix_date(x):
    if x.year > 2021:
        return datetime(x.year - 100, x.month, x.day, x.hour, x.minute, x.second, x.microsecond, tzinfo=timezone.utc)
    else:
        return x

In [249]:
sightings_cleaned['Time_UTC'] = sightings_cleaned['Time_UTC'].apply(fix_date)
sightings_cleaned['Time_UTC'] = pd.to_datetime(sightings_cleaned['Time_UTC'], utc=True)
sightings_cleaned.dtypes

AttributeError: 'NoneType' object has no attribute 'year'

In [None]:
sightings_cleaned.to_pickle("sightings_cleaned_us.pkl")