In [2]:
import pandas as pd
import numpy as np

# Column names applied to the CSV. With header=0 the file's own first row is
# treated as a header and replaced by these names.
columns = [
    'datetime', 'city', 'state', 'country', 'shape', 'duration_s',
    'hours', 'comments', 'date_p', 'latitude', 'longitude',
]

# Malformed rows are skipped silently. on_bad_lines="skip" is the replacement
# for the error_bad_lines=False / warn_bad_lines=False pair, which was
# deprecated in pandas 1.3 and removed in pandas 2.0.
df = pd.read_csv(
    "completo.csv",
    header=0,
    names=columns,
    on_bad_lines="skip",
    low_memory=False,
)
df.head()

Unnamed: 0,datetime,city,state,country,shape,duration_s,hours,comments,date_p,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611


In [3]:
# Drop the columns that will not be used.

df.drop(columns=['hours', 'date_p'], inplace=True)

In [4]:
# Convert the columns that hold numbers stored as object dtype into numeric
# columns. Values that cannot be parsed become NaN (errors='coerce').
# pd.to_numeric is applied to each whole column at once instead of being
# called once per value through .apply.

for numeric_col in ('duration_s', 'latitude', 'longitude'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

In [5]:
# Keep only the rows where latitude, longitude and duration_s are all present.
# NaN in any other column does not cause a row to be dropped.
# .copy() makes the result an independent frame, so column assignments made
# on `df` afterwards do not raise SettingWithCopyWarning.

df = df.dropna(subset=['latitude', 'longitude', 'duration_s']).copy()

In [6]:
# Take the date part of the 'datetime' text (everything before the first
# space), parse it with the month/day/year format into a new 'date' column,
# and drop the original 'datetime' column.
# The .str accessor passes missing values through (they end up as NaT) instead
# of failing on a per-row x.split call, and .assign/.drop build a new frame
# rather than writing into the existing one in place.

date_text = df['datetime'].str.split(' ').str[0]
df = (
    df.assign(date=pd.to_datetime(date_text, format='%m/%d/%Y'))
      .drop(columns='datetime')
)
df.head()

Unnamed: 0,city,state,country,shape,duration_s,comments,latitude,longitude,date
0,san marcos,tx,us,cylinder,2700.0,This event took place in early fall around 194...,29.883056,-97.941111,1949-10-10
1,lackland afb,tx,,light,7200.0,1949 Lackland AFB&#44 TX. Lights racing acros...,29.38421,-98.581082,1949-10-10
2,chester (uk/england),,gb,circle,20.0,Green/Orange circular disc over Chester&#44 En...,53.2,-2.916667,1955-10-10
3,edna,tx,us,circle,20.0,My older brother and twin sister were leaving ...,28.978333,-96.645833,1956-10-10
4,kaneohe,hi,us,light,900.0,AS a Marine 1st Lt. flying an FJ4B fighter/att...,21.418056,-157.803611,1960-10-10


In [7]:
# Show the dtype of each column of df.
df.dtypes

city                  object
state                 object
country               object
shape                 object
duration_s           float64
comments              object
latitude             float64
longitude            float64
date          datetime64[ns]
dtype: object

In [8]:
# Check which shape values exist, and how many rows carry each one.

shapes = df.groupby('shape')['shape'].count()
shapes

shape
changed          1
changing      2140
chevron       1007
cigar         2241
circle        8452
cone           367
crescent         2
cross          265
cylinder      1382
delta            8
diamond       1308
disk          6005
dome             1
egg            845
fireball      6562
flare            1
flash         1472
formation     2656
hexagon          1
light        17872
other         6247
oval          4119
pyramid          1
rectangle     1416
round            2
sphere        5755
teardrop       817
triangle      8489
unknown       6319
Name: shape, dtype: int64

In [9]:
# Group together the UFO shape descriptions that we judge to belong to a
# single class. Keys are the labels being folded away, values the class kept.

shape_aliases = {
    'changed': 'changing',
    'other': 'unknown',
    'delta': 'triangle',
    'crescent': 'unknown',
    'round': 'circle',
    'egg': 'oval',
    'flare': 'light',
}
df['shape'] = df['shape'].replace(shape_aliases)

In [10]:
# Convert the duration_s column from seconds to minutes, then rename it to
# duration_m so the name matches the new unit.

SECONDS_PER_MINUTE = 60
df['duration_s'] = df['duration_s'] / SECONDS_PER_MINUTE
df.rename(columns={'duration_s': 'duration_m'}, inplace=True)

In [11]:
# Preview the first rows of df.
df.head()

Unnamed: 0,city,state,country,shape,duration_m,comments,latitude,longitude,date
0,san marcos,tx,us,cylinder,45.0,This event took place in early fall around 194...,29.883056,-97.941111,1949-10-10
1,lackland afb,tx,,light,120.0,1949 Lackland AFB&#44 TX. Lights racing acros...,29.38421,-98.581082,1949-10-10
2,chester (uk/england),,gb,circle,0.333333,Green/Orange circular disc over Chester&#44 En...,53.2,-2.916667,1955-10-10
3,edna,tx,us,circle,0.333333,My older brother and twin sister were leaving ...,28.978333,-96.645833,1956-10-10
4,kaneohe,hi,us,light,15.0,AS a Marine 1st Lt. flying an FJ4B fighter/att...,21.418056,-157.803611,1960-10-10


In [12]:
# Export the pre-processed df to a CSV file for later use; the row index is
# not written.

df.to_csv('pre_processado_completo.csv', index=False)