## Pré-processamento dos dados 

In [1]:
import pandas as pd

In [2]:
# Load the raw meteorite landings table from the CSV file in the working directory.
df = pd.read_csv('meteorite-landings.csv')

In [3]:
# Preview the first rows to check the columns and sample values.
df.head()

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,GeoLocation
0,Aachen,1,Valid,L5,21.0,Fell,1880.0,50.775,6.08333,"(50.775000, 6.083330)"
1,Aarhus,2,Valid,H6,720.0,Fell,1951.0,56.18333,10.23333,"(56.183330, 10.233330)"
2,Abee,6,Valid,EH4,107000.0,Fell,1952.0,54.21667,-113.0,"(54.216670, -113.000000)"
3,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976.0,16.88333,-99.9,"(16.883330, -99.900000)"
4,Achiras,370,Valid,L6,780.0,Fell,1902.0,-33.16667,-64.95,"(-33.166670, -64.950000)"


### Processamento do atributo "year"

In [4]:
# Inspect how pandas typed the 'year' column on load.
df['year'].dtype

dtype('float64')

In [6]:
# Cast 'year' to pandas' nullable integer type so that years display as
# whole numbers while missing entries are still allowed.
df['year'] = df['year'].astype('Int64')

In [7]:
# Confirm the dtype of 'year' after the conversion.
df['year'].dtype

Int64Dtype()

Ordenando os dados pelo ano

In [9]:
# Put the records in chronological order (ascending 'year').
df = df.sort_values(by='year')

In [10]:
# Show the earliest records now that the table is sorted by year.
df.head()

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,GeoLocation
16356,Havana,11857,Valid,"Iron, IAB complex",,Found,301,40.33333,-90.05,"(40.333330, -90.050000)"
38301,Wietrzno-Bobrka,24259,Valid,Iron,376.0,Found,601,49.41667,21.7,"(49.416670, 21.700000)"
703,Nogata,16988,Valid,L6,472.0,Fell,860,33.725,130.75,"(33.725000, 130.750000)"
678,Narni,16914,Valid,Stone-uncl,,Fell,920,42.51667,12.51667,"(42.516670, 12.516670)"
278,Elbogen,7823,Valid,"Iron, IID",107000.0,Fell,1399,50.18333,12.73333,"(50.183330, 12.733330)"


In [11]:
# List the records whose year is later than 2016.
df.loc[df['year'].gt(2016)]

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,GeoLocation
30679,Northwest Africa 7701,57150,Valid,CK6,55.0,Found,2101,0.0,0.0,"(0.000000, 0.000000)"
38188,Ur,24125,Valid,Iron,,Found,2501,30.9,46.01667,"(30.900000, 46.016670)"


In [13]:
# Remove the records dated after 2016. Rows are selected by index label,
# so records with a missing year are left untouched here.
df = df.drop(index=df.loc[df['year'].gt(2016)].index)

### Análise de nulos

In [15]:
# Flag which columns contain at least one missing value.
df.isna().any()

name           False
id             False
nametype       False
recclass       False
mass            True
fall           False
year            True
reclat          True
reclong         True
GeoLocation     True
dtype: bool

In [16]:
# Count the missing values in each column.
df.isna().sum()

name              0
id                0
nametype          0
recclass          0
mass            130
fall              0
year            288
reclat         7315
reclong        7315
GeoLocation    7315
dtype: int64

In [17]:
# Count the non-missing values in each column.
df.notna().sum()

name           45714
id             45714
nametype       45714
recclass       45714
mass           45584
fall           45714
year           45426
reclat         38399
reclong        38399
GeoLocation    38399
dtype: int64

In [19]:
# Keep only the records that have a value in every column.
df = df.dropna()

In [20]:
# Verify that no missing values remain in any column.
df.isna().sum()

name           0
id             0
nametype       0
recclass       0
mass           0
fall           0
year           0
reclat         0
reclong        0
GeoLocation    0
dtype: int64

### Análise do atributo "mass"

In [22]:
# Count the records whose recorded mass is exactly zero.
df.loc[df['mass'].eq(0)].count()

name           19
id             19
nametype       19
recclass       19
mass           19
fall           19
year           19
reclat         19
reclong        19
GeoLocation    19
dtype: int64

In [23]:
# Discard the records whose recorded mass is exactly zero.
df = df.drop(index=df.loc[df['mass'].eq(0)].index)

In [25]:
# Verify that no zero-mass records remain.
df.loc[df['mass'].eq(0)].count()

name           0
id             0
nametype       0
recclass       0
mass           0
fall           0
year           0
reclat         0
reclong        0
GeoLocation    0
dtype: int64

### Análise das coordenadas "GeoLocation"

In [27]:
# Records located at exactly (0, 0): print the per-column count, then
# display the matching rows.
print(df.loc[df['GeoLocation'].eq('(0.000000, 0.000000)')].count())
df.loc[df['GeoLocation'].eq('(0.000000, 0.000000)')]

name           6184
id             6184
nametype       6184
recclass       6184
mass           6184
fall           6184
year           6184
reclat         6184
reclong        6184
GeoLocation    6184
dtype: int64


Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,GeoLocation
26374,New York,55760,Valid,"Iron, IIIAB",2950.00,Found,1965,0.0,0.0,"(0.000000, 0.000000)"
44893,Yamato 983064,39686,Valid,L6,40.94,Found,1998,0.0,0.0,"(0.000000, 0.000000)"
44892,Yamato 983063,39685,Valid,LL3,3.38,Found,1998,0.0,0.0,"(0.000000, 0.000000)"
44891,Yamato 983056,39678,Valid,H5,362.40,Found,1998,0.0,0.0,"(0.000000, 0.000000)"
45095,Yamato 983366,39988,Valid,Eucrite-unbr,138.90,Found,1998,0.0,0.0,"(0.000000, 0.000000)"
...,...,...,...,...,...,...,...,...,...,...
30773,Northwest Africa 7857,57422,Valid,LL6,246.00,Found,2013,0.0,0.0,"(0.000000, 0.000000)"
30774,Northwest Africa 7858,57423,Valid,H4,459.00,Found,2013,0.0,0.0,"(0.000000, 0.000000)"
30776,Northwest Africa 7861,57425,Valid,L5,611.00,Found,2013,0.0,0.0,"(0.000000, 0.000000)"
30777,Northwest Africa 7862,57426,Valid,L4/5,317.00,Found,2013,0.0,0.0,"(0.000000, 0.000000)"


In [28]:
# Discard the records whose GeoLocation is exactly (0, 0).
df = df.drop(index=df.loc[df['GeoLocation'].eq('(0.000000, 0.000000)')].index)

In [31]:
# Verify that no (0, 0) records remain: the printed counts should all be
# zero and the displayed table empty.
print(df.loc[df['GeoLocation'].eq('(0.000000, 0.000000)')].count())
df.loc[df['GeoLocation'].eq('(0.000000, 0.000000)')]

name           0
id             0
nametype       0
recclass       0
mass           0
fall           0
year           0
reclat         0
reclong        0
GeoLocation    0
dtype: int64


Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,GeoLocation


### Reajustando os índices

In [32]:
# Renumber the rows from 0; the previous row labels are moved into a new
# 'index' column.
df = df.reset_index()

In [34]:
# Check the new index range; its stop value is also the number of remaining rows.
df.index

RangeIndex(start=0, stop=31912, step=1)

In [36]:
# Remove the 'index' column that holds the old row labels.
df = df.drop(columns='index')

In [37]:
# Preview the cleaned table with its renumbered rows.
df.head()

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,GeoLocation
0,Wietrzno-Bobrka,24259,Valid,Iron,376.0,Found,601,49.41667,21.7,"(49.416670, 21.700000)"
1,Nogata,16988,Valid,L6,472.0,Fell,860,33.725,130.75,"(33.725000, 130.750000)"
2,Elbogen,7823,Valid,"Iron, IID",107000.0,Fell,1399,50.18333,12.73333,"(50.183330, 12.733330)"
3,Rivolta de Bassi,22614,Valid,Stone-uncl,103.3,Fell,1490,45.48333,9.51667,"(45.483330, 9.516670)"
4,Ensisheim,10039,Valid,LL6,127000.0,Fell,1491,47.86667,7.35,"(47.866670, 7.350000)"


### Criação da nova base

In [39]:
# Save the cleaned dataset. index=False keeps the row labels out of the file,
# so it contains exactly the frame's columns and no unnamed index column
# appears when it is read back with pd.read_csv.
df.to_csv('meteorite-landings-v2.csv', index=False)