# Pré-processamento

In [1]:
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# NOTE(review): this also hides genuine chained-assignment mistakes in later cells.
pd.options.mode.chained_assignment = None

In [2]:
# Load the original dataset.
# The file's own header row is discarded (header=0) and replaced by the names in `columns`.
# Malformed rows are skipped silently: on_bad_lines='skip' (pandas >= 1.3) is the replacement
# for the error_bad_lines=False / warn_bad_lines=False pair, which pandas 2.0 removed.

columns = [
    'datetime', 'city', 'state', 'country', 'shape', 'duration_s',
    'hours', 'comments', 'date_p', 'latitude', 'longitude',
]
df = pd.read_csv("completo.csv", header=0, names=columns, on_bad_lines='skip', low_memory=False)
df.head()

Unnamed: 0,datetime,city,state,country,shape,duration_s,hours,comments,date_p,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611


In [3]:
# Convert the latitude and longitude columns to a numeric dtype.
# Values that cannot be parsed become NaN (errors='coerce').
# pd.to_numeric is called once per column instead of once per element through .apply.

df['latitude'] = pd.to_numeric(df['latitude'], errors='coerce')
df['longitude'] = pd.to_numeric(df['longitude'], errors='coerce')

## Tratamento country = NaN

In [4]:
# Split the original dataframe in two, based on the 'country' column:
#   - df_countrys_na keeps the rows where 'country' is NaN
#   - df keeps the rows where 'country' is filled in

country_missing = df['country'].isna()
df_countrys_na = df.loc[country_missing]
df = df.loc[~country_missing]

In [5]:
# Create the geopy objects used to fill the NaN values of the 'country' column
# from latitude and longitude.
# An explicit timeout is passed because the lookups in this notebook repeatedly failed with
# "socket.timeout: timed out" under geopy's default timeout (1 second).
# Both wrappers space consecutive calls at least 1 second apart (min_delay_seconds=1).

geolocator = Nominatim(user_agent="geoapiExercises", timeout=10)
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
reverse = RateLimiter(geolocator.reverse, min_delay_seconds=1)

In [6]:
# Build the "latitude, longitude" strings that are later passed to geopy, one per row of
# df_countrys_na, and save them (with the original row index) to c_lat_lon.csv.
# The strings are built column-wise; .map(str) formats every coordinate exactly as str() does,
# so the text matches what a row-by-row loop would produce.

location = df_countrys_na['latitude'].map(str) + ', ' + df_countrys_na['longitude'].map(str)
df_lat_lon = location.to_frame('location')
df_lat_lon.to_csv('c_lat_lon.csv')

df_lat_lon

Unnamed: 0,location
1,"29.384209999999996, -98.581082"
18,"32.364167, -64.678611"
19,"0.0, 0.0"
30,"53.970571, -111.689885"
36,"-38.662334, 178.017649"
...,...
88623,"0.0, 0.0"
88659,"39.078889000000004, -78.427222"
88663,"40.858433000000005, -74.16375500000001"
88666,"50.465843, 22.891814"


In [25]:
# Re-open the saved location strings as an iterator that yields 250 rows at a time.
# on_bad_lines='skip' replaces error_bad_lines/warn_bad_lines, which pandas 2.0 removed.
# NOTE: the iterator can only be consumed once; re-run this cell before re-running the
# geocoding loop that iterates over it.
chunker = pd.read_csv('c_lat_lon.csv', header=0, on_bad_lines='skip', low_memory=False, chunksize=250)


In [27]:
# Use geopy to obtain the country code for every "latitude, longitude" string.
# Each chunk is appended to c_pieces.csv as soon as it is finished, so completed chunks are
# already on disk before the next one starts.
# NOTE: the file is opened in append mode without a header; running this cell again adds the
# same rows a second time.

for piece in chunker:

    address = [reverse(loc) for loc in piece['location']]

    c = []
    for place in address:
        try:
            c.append(place.raw['address']['country_code'])
        except (AttributeError, KeyError, TypeError):
            # No usable answer for this location: the lookup produced no result object
            # (e.g. None) or the response has no 'address' / 'country_code' entry.
            # Only these lookup failures are caught, so a KeyboardInterrupt still stops the cell.
            c.append('NULL')

    piece['country'] = c
    piece.to_csv('c_pieces.csv', mode='a', header=False)

RateLimiter caught an error, retrying (0/2 tries). Called with (*('34.132263, -118.456969',), **{}).
Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
    conn = connection.create_connection(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 84, in create_connection
    raise err
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 74, in create_connection
    sock.connect(sa)
socket.timeout: timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 670, in urlopen
    httplib_response = self._make_request(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 381, in _make_request
    self._validate_conn(conn)
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\c

RateLimiter caught an error, retrying (1/2 tries). Called with (*('0.0, 0.0',), **{}).
Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
    conn = connection.create_connection(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 84, in create_connection
    raise err
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 74, in create_connection
    sock.connect(sa)
socket.timeout: timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 670, in urlopen
    httplib_response = self._make_request(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 381, in _make_request
    self._validate_conn(conn)
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.

RateLimiter caught an error, retrying (0/2 tries). Called with (*('-36.848459999999996, 174.76333200000002',), **{}).
Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
    conn = connection.create_connection(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 84, in create_connection
    raise err
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 74, in create_connection
    sock.connect(sa)
socket.timeout: timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 670, in urlopen
    httplib_response = self._make_request(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 381, in _make_request
    self._validate_conn(conn)
  File "C:\Users\rober\anaconda3\lib\site-p

RateLimiter caught an error, retrying (0/2 tries). Called with (*('47.085702000000005, -122.706166',), **{}).
Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
    conn = connection.create_connection(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 84, in create_connection
    raise err
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 74, in create_connection
    sock.connect(sa)
socket.timeout: timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 670, in urlopen
    httplib_response = self._make_request(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 381, in _make_request
    self._validate_conn(conn)
  File "C:\Users\rober\anaconda3\lib\site-packages\

RateLimiter caught an error, retrying (0/2 tries). Called with (*('9.748917, -83.753428',), **{}).
Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
    conn = connection.create_connection(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 84, in create_connection
    raise err
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 74, in create_connection
    sock.connect(sa)
socket.timeout: timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 670, in urlopen
    httplib_response = self._make_request(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 381, in _make_request
    self._validate_conn(conn)
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\con

RateLimiter caught an error, retrying (0/2 tries). Called with (*('39.505468, -119.69268600000001',), **{}).
Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
    conn = connection.create_connection(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 84, in create_connection
    raise err
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 74, in create_connection
    sock.connect(sa)
socket.timeout: timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 670, in urlopen
    httplib_response = self._make_request(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 381, in _make_request
    self._validate_conn(conn)
  File "C:\Users\rober\anaconda3\lib\site-packages\u

RateLimiter caught an error, retrying (0/2 tries). Called with (*('40.249252, -77.137011',), **{}).
Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
    conn = connection.create_connection(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 84, in create_connection
    raise err
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 74, in create_connection
    sock.connect(sa)
socket.timeout: timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 670, in urlopen
    httplib_response = self._make_request(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 381, in _make_request
    self._validate_conn(conn)
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\co

RateLimiter caught an error, retrying (0/2 tries). Called with (*('24.083333, 38.0',), **{}).
Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
    conn = connection.create_connection(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 84, in create_connection
    raise err
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 74, in create_connection
    sock.connect(sa)
socket.timeout: timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 670, in urlopen
    httplib_response = self._make_request(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 381, in _make_request
    self._validate_conn(conn)
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connecti

RateLimiter caught an error, retrying (0/2 tries). Called with (*('20.023056, -155.67166699999999',), **{}).
Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
    conn = connection.create_connection(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 84, in create_connection
    raise err
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 74, in create_connection
    sock.connect(sa)
socket.timeout: timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 670, in urlopen
    httplib_response = self._make_request(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 381, in _make_request
    self._validate_conn(conn)
  File "C:\Users\rober\anaconda3\lib\site-packages\u

RateLimiter caught an error, retrying (0/2 tries). Called with (*('38.780114000000005, -77.38665300000001',), **{}).
Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
    conn = connection.create_connection(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 84, in create_connection
    raise err
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 74, in create_connection
    sock.connect(sa)
socket.timeout: timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 670, in urlopen
    httplib_response = self._make_request(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 381, in _make_request
    self._validate_conn(conn)
  File "C:\Users\rober\anaconda3\lib\site-pa

RateLimiter caught an error, retrying (0/2 tries). Called with (*('37.99279, -87.507465',), **{}).
Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
    conn = connection.create_connection(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 84, in create_connection
    raise err
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 74, in create_connection
    sock.connect(sa)
socket.timeout: timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 670, in urlopen
    httplib_response = self._make_request(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 381, in _make_request
    self._validate_conn(conn)
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\con

RateLimiter caught an error, retrying (0/2 tries). Called with (*('28.681982, -81.249713',), **{}).
Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
    conn = connection.create_connection(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 84, in create_connection
    raise err
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 74, in create_connection
    sock.connect(sa)
socket.timeout: timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 670, in urlopen
    httplib_response = self._make_request(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 381, in _make_request
    self._validate_conn(conn)
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\co

RateLimiter caught an error, retrying (0/2 tries). Called with (*('0.0, 0.0',), **{}).
Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
    conn = connection.create_connection(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 84, in create_connection
    raise err
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 74, in create_connection
    sock.connect(sa)
socket.timeout: timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 670, in urlopen
    httplib_response = self._make_request(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 381, in _make_request
    self._validate_conn(conn)
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.

RateLimiter caught an error, retrying (0/2 tries). Called with (*('36.777, -88.06',), **{}).
Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
    conn = connection.create_connection(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 84, in create_connection
    raise err
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 74, in create_connection
    sock.connect(sa)
socket.timeout: timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 670, in urlopen
    httplib_response = self._make_request(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 381, in _make_request
    self._validate_conn(conn)
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectio

RateLimiter caught an error, retrying (0/2 tries). Called with (*('0.0, 0.0',), **{}).
Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
    conn = connection.create_connection(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 84, in create_connection
    raise err
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 74, in create_connection
    sock.connect(sa)
socket.timeout: timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 670, in urlopen
    httplib_response = self._make_request(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 381, in _make_request
    self._validate_conn(conn)
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.

RateLimiter caught an error, retrying (0/2 tries). Called with (*('33.93911, 67.709953',), **{}).
Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
    conn = connection.create_connection(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 84, in create_connection
    raise err
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\connection.py", line 74, in create_connection
    sock.connect(sa)
socket.timeout: timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 670, in urlopen
    httplib_response = self._make_request(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 381, in _make_request
    self._validate_conn(conn)
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\conn

RateLimiter caught an error, retrying (0/2 tries). Called with (*('33.759271000000005, -96.66944000000001',), **{}).
Traceback (most recent call last):
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 381, in _make_request
    self._validate_conn(conn)
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 978, in _validate_conn
    conn.connect()
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\connection.py", line 362, in connect
    self.sock = ssl_wrap_socket(
  File "C:\Users\rober\anaconda3\lib\site-packages\urllib3\util\ssl_.py", line 386, in ssl_wrap_socket
    return context.wrap_socket(sock, server_hostname=server_hostname)
  File "C:\Users\rober\anaconda3\lib\ssl.py", line 500, in wrap_socket
    return self.sslsocket_class._create(
  File "C:\Users\rober\anaconda3\lib\ssl.py", line 1040, in _create
    self.do_handshake()
  File "C:\Users\rober\anaconda3\lib\ssl.py", line 1309, in do_handshake
    sel

In [7]:
# Tidy up the geopy results stored in c_pieces.csv.
# The file has no header row, so columns arrive numbered: column 1 becomes the index,
# columns 2 and 3 are named 'lat_lon' and 'country', and column 0 is discarded.

df_geopy = (
    pd.read_csv('c_pieces.csv', header=None)
    .rename(columns={1: 'index', 2: 'lat_lon', 3: 'country'})
    .set_index('index')
    .drop(columns=0)
)
df_geopy

Unnamed: 0_level_0,lat_lon,country
index,Unnamed: 1_level_1,Unnamed: 2_level_1
7892,"40.165393, -75.282905",us
7903,"38.485293, -78.951451",us
7922,"33.967338, -94.168531",us
7941,"40.417287, -82.907123",us
7943,"0.0, 0.0",
...,...,...
88623,"0.0, 0.0",
88659,"39.078889000000004, -78.427222",us
88663,"40.858433000000005, -74.16375500000001",us
88666,"50.465843, 22.891814",pl


In [8]:
# Fill the 'country' column with the geopy results (aligned on the row index) and keep only
# the rows for which a country was found.
# geopy could not determine the country for every entry (around 12 thousand); it managed
# around 9 thousand, which still adds information to the dataset.
# .assign builds a new frame instead of writing a column into a frame obtained by filtering.

df_countrys_na = (
    df_countrys_na
    .assign(country=df_geopy['country'])
    .dropna(subset=['country'])
)
df_countrys_na

Unnamed: 0,datetime,city,state,country,shape,duration_s,hours,comments,date_p,latitude,longitude
7892,10/8/2011 21:00,whitpain township,pa,us,formation,60,1 minute,Big Dipper formation of orange objects.,10/10/2011,40.165393,-75.282905
7903,10/8/2011 23:00,central,va,us,triangle,0,varies,Single bright light giving way to strange flic...,10/10/2011,38.485293,-78.951451
7922,10/8/2013 02:00,lockesberg,ar,us,circle,45,00:45,Small round what light (dim sometimes). Subtl...,10/14/2013,33.967338,-94.168531
7941,10/8/2013 21:00,ohio (location not specified),oh,us,fireball,180,3 minutes,5 fireballs. ((NUFORC Note: Location of sigh...,10/14/2013,40.417287,-82.907123
7949,10/9/1988 21:00,bridgend,,gb,disk,900,15 mins,The experience was not frightening&#44 it was ...,2/18/2001,51.504286,-3.576945
...,...,...,...,...,...,...,...,...,...,...,...
88580,9/9/2009 20:17,lyman,me,us,light,600,10 mins,Two lights ran across the sky&#44 as bright as...,12/12/2009,43.505096,-70.637968
88659,9/9/2013 12:00,star tannery,va,us,unknown,0,unk,Object seen in photo after it was taken and ph...,10/3/2013,39.078889,-78.427222
88663,9/9/2013 20:15,clifton,nj,us,other,3600,~1hr+,Luminous line seen in New Jersey sky.,9/30/2013,40.858433,-74.163755
88666,9/9/2013 21:00,aleksandrow (poland),,pl,light,15,15 seconds,Two points of light following one another in a...,9/30/2013,50.465843,22.891814


## Unindo os resultados do Geopy ao df original

In [9]:
# Stack the rows that already had a country on top of the rows whose country came from geopy.
frames = (df, df_countrys_na)
df = pd.concat(frames)
df

Unnamed: 0,datetime,city,state,country,shape,duration_s,hours,comments,date_p,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.883056,-97.941111
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.200000,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.978333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.418056,-157.803611
5,10/10/1961 19:00,bristol,tn,us,sphere,300,5 minutes,My father is now 89 my brother 52 the girl wit...,4/27/2007,36.595000,-82.188889
...,...,...,...,...,...,...,...,...,...,...,...
88580,9/9/2009 20:17,lyman,me,us,light,600,10 mins,Two lights ran across the sky&#44 as bright as...,12/12/2009,43.505096,-70.637968
88659,9/9/2013 12:00,star tannery,va,us,unknown,0,unk,Object seen in photo after it was taken and ph...,10/3/2013,39.078889,-78.427222
88663,9/9/2013 20:15,clifton,nj,us,other,3600,~1hr+,Luminous line seen in New Jersey sky.,9/30/2013,40.858433,-74.163755
88666,9/9/2013 21:00,aleksandrow (poland),,pl,light,15,15 seconds,Two points of light following one another in a...,9/30/2013,50.465843,22.891814


In [10]:
# Drop the columns that will not be used.

df = df.drop(columns=['hours', 'date_p'])

In [11]:
# Convert the duration_s column from object to a numeric dtype.
# Values that cannot be parsed become NaN (errors='coerce').
# pd.to_numeric is called once on the whole column instead of once per element through .apply.

df['duration_s'] = pd.to_numeric(df['duration_s'], errors='coerce')

In [12]:
# Drop every row that has NaN in latitude, longitude or duration_s.

df = df.dropna(subset=['latitude', 'longitude', 'duration_s'])

In [13]:
# Extract the date part of 'datetime' (the text before the first space), parse it as
# month/day/year into a datetime64 'date' column, and drop the original 'datetime' column.
# The vectorized .str accessor replaces the per-element .apply calls; it yields NaN (then NaT)
# for a non-string value instead of raising, and 'datetime' is never overwritten with lists.

date_text = df['datetime'].str.split(' ', n=1).str[0]
df = df.assign(date=pd.to_datetime(date_text, format='%m/%d/%Y')).drop(columns='datetime')
df.head()

Unnamed: 0,city,state,country,shape,duration_s,comments,latitude,longitude,date
0,san marcos,tx,us,cylinder,2700.0,This event took place in early fall around 194...,29.883056,-97.941111,1949-10-10
2,chester (uk/england),,gb,circle,20.0,Green/Orange circular disc over Chester&#44 En...,53.2,-2.916667,1955-10-10
3,edna,tx,us,circle,20.0,My older brother and twin sister were leaving ...,28.978333,-96.645833,1956-10-10
4,kaneohe,hi,us,light,900.0,AS a Marine 1st Lt. flying an FJ4B fighter/att...,21.418056,-157.803611,1960-10-10
5,bristol,tn,us,sphere,300.0,My father is now 89 my brother 52 the girl wit...,36.595,-82.188889,1961-10-10


In [14]:
# Display the dtype of every column of df.
df.dtypes

city                  object
state                 object
country               object
shape                 object
duration_s           float64
comments              object
latitude             float64
longitude            float64
date          datetime64[ns]
dtype: object

In [15]:
# Check which shapes exist, and how many rows carry each one.

shapes = df.groupby('shape')['shape'].count()
shapes

shape
changed          1
changing      2079
chevron        974
cigar         2162
circle        8183
cone           356
crescent         2
cross          260
cylinder      1335
delta            8
diamond       1269
disk          5733
dome             1
egg            814
fireball      6395
flare            1
flash         1430
formation     2558
hexagon          1
light        17315
other         6049
oval          3991
pyramid          1
rectangle     1365
round            2
sphere        5548
teardrop       795
triangle      8274
unknown       6123
Name: shape, dtype: int64

In [16]:
# Merge UFO shape descriptions that we judged to belong to a single class.
# No label on the left also appears on the right, so one pass over the mapping gives the
# same result as applying the replacements one after another.

shape_aliases = {
    'changed': 'changing',
    'other': 'unknown',
    'delta': 'triangle',
    'crescent': 'unknown',
    'round': 'circle',
    'egg': 'oval',
    'flare': 'light',
}
df['shape'] = df['shape'].replace(shape_aliases)

In [17]:
# Convert the duration from seconds to minutes and rename the column accordingly.

minutes = df['duration_s'] / 60
df = df.assign(duration_s=minutes).rename(columns={'duration_s': 'duration_m'})

In [18]:
# Display the first rows of df.
df.head()

Unnamed: 0,city,state,country,shape,duration_m,comments,latitude,longitude,date
0,san marcos,tx,us,cylinder,45.0,This event took place in early fall around 194...,29.883056,-97.941111,1949-10-10
2,chester (uk/england),,gb,circle,0.333333,Green/Orange circular disc over Chester&#44 En...,53.2,-2.916667,1955-10-10
3,edna,tx,us,circle,0.333333,My older brother and twin sister were leaving ...,28.978333,-96.645833,1956-10-10
4,kaneohe,hi,us,light,15.0,AS a Marine 1st Lt. flying an FJ4B fighter/att...,21.418056,-157.803611,1960-10-10
5,bristol,tn,us,sphere,5.0,My father is now 89 my brother 52 the girl wit...,36.595,-82.188889,1961-10-10


In [19]:
# Export the pre-processed dataframe to a CSV file for later use.
# The row index is not written; `index` is a boolean parameter, so False is passed explicitly
# instead of relying on None being falsy.

df.to_csv('pre_processado_completo.csv', index=False)