# Tratamento de dados

## - Neste notebook foram extraídos os dados existentes na coluna informações.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Location of the raw CSV loaded by the next cell.
# NOTE(review): the path suggests this file is the output of the web-scraping step — confirm.
url = 'https://raw.githubusercontent.com/robertferro/carros/main/1%20-%20Web%20scraping/dados_carros_ml_ok.csv'

In [3]:
# Load the raw listings; ',' is already pandas' default separator.
df = pd.read_csv(url)

In [4]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,preco,quilometragem,ano,infomacoes
0,16.99,104.000 Km,2008,Citroën Xsara Picasso 1.6 Glx Flex 5p
1,43.9,77.000 Km,2018,Citroën Aircross Aircross Start 1.6 Flex 16v 5...
2,163.9,0 Km,2021,Citroën Jumpy Minibus
3,34.99,65.000 Km,2013,Citroën C3 1.6 Vti 16v Exclusive Flex Aut. 5p
4,129.9,0 Km,2021,Citroën Jumpy Furgão


In [5]:
# (number of rows, number of columns) of the frame as loaded.
df.shape

(6384, 4)

### Reorganizando os dados da coluna "informacoes"

In [6]:
# Split the description once, on the first space: the first token is taken as
# the brand and the remainder is written back to the same column.
marca = df['infomacoes'].str.split(pat=" ", n=1, expand=True)
df['marca'], df['infomacoes'] = marca[0], marca[1]

In [7]:
# Check the new 'marca' column and the shortened description.
df.head(n=5)

Unnamed: 0,preco,quilometragem,ano,infomacoes,marca
0,16.99,104.000 Km,2008,Xsara Picasso 1.6 Glx Flex 5p,Citroën
1,43.9,77.000 Km,2018,Aircross Aircross Start 1.6 Flex 16v 5p Mec.,Citroën
2,163.9,0 Km,2021,Jumpy Minibus,Citroën
3,34.99,65.000 Km,2013,C3 1.6 Vti 16v Exclusive Flex Aut. 5p,Citroën
4,129.9,0 Km,2021,Jumpy Furgão,Citroën


  - Renomeando as marcas

In [8]:
# Raw brand token -> number of listings, as a plain dict.
# Not referenced again in the visible cells.
dic_marcas = dict(df['marca'].value_counts())

In [9]:
# Lookup table: raw first token of the listing -> normalised brand label.
# - Tokens mapped to the string 'nan' are the ones removed later by the
#   `marca != "nan"` filter.
# - 'Land' is the first token of two-word brand names because the brand was
#   obtained by splitting on the first space.
# NOTE(review): the labels 'volkswagem' and 'hyndai' are kept exactly as they
# were; other files may depend on these spellings — confirm before renaming.
new_dict = {
    'Mercedes-benz': 'mercedes_benz',
    'Jeep': 'jeep',
    'Honda': 'honda',
    'Volkswagen': 'volkswagem',
    'Chevrolet': 'chevrolet',
    'Nissan': 'nissan',
    'Audi': 'audi',
    'Renault': 'renault',
    'Citroën': 'citroen',
    'Ford': 'ford',
    'Toyota': 'toyota',
    'Peugeot': 'peugeot',
    'Hyundai': 'hyndai',
    'Mitsubishi': 'mitsubishi',
    'Land': 'land_rover',
    'Fiat': 'fiat',
    'Kia': 'kia',
    'Suzuki': 'suzuki',
    'Bmw': 'bmw',
    'Mini': 'nan',
    'Novo': 'nan',
    'I': 'nan',
    'Isuzu': 'nan',
    'Jaguar': 'nan',
    'Gm': 'chevrolet',
    'Rover': 'land_rover',
}

In [10]:
# Replace each raw brand token with its normalised label; tokens missing from
# the lookup table become NaN.
df['marca'] = df.marca.map(new_dict)

In [11]:
# Listings per normalised brand (still includes the 'nan' placeholder).
df['marca'].value_counts()

mercedes_benz    339
jeep             339
honda            338
volkswagem       338
chevrolet        338
hyndai           336
renault          336
ford             336
audi             336
peugeot          336
land_rover       336
toyota           336
mitsubishi       336
nissan           336
citroen          336
fiat             335
kia              333
suzuki           332
bmw              324
nan                8
Name: marca, dtype: int64

In [12]:
# Keep only rows whose brand label is not the 'nan' placeholder string.
df = df[df['marca'] != "nan"]

In [13]:
# Listings per brand after removing the 'nan' placeholder rows.
df['marca'].value_counts()

mercedes_benz    339
jeep             339
chevrolet        338
honda            338
volkswagem       338
nissan           336
toyota           336
hyndai           336
renault          336
ford             336
audi             336
peugeot          336
land_rover       336
citroen          336
mitsubishi       336
fiat             335
kia              333
suzuki           332
bmw              324
Name: marca, dtype: int64

 - Criando a coluna modelo

In [14]:
# Renumber rows 0..n-1 after the filter. The old index is kept as an 'index'
# column here (drop=False is the default) and removed in the following cell.
df = df.reset_index(drop=False)

In [15]:
# Discard the 'index' column left behind by reset_index().
df = df.drop(columns=['index'])

In [16]:
# Confirm the brand labels and the fresh 0-based index.
df.head(n=5)

Unnamed: 0,preco,quilometragem,ano,infomacoes,marca
0,16.99,104.000 Km,2008,Xsara Picasso 1.6 Glx Flex 5p,citroen
1,43.9,77.000 Km,2018,Aircross Aircross Start 1.6 Flex 16v 5p Mec.,citroen
2,163.9,0 Km,2021,Jumpy Minibus,citroen
3,34.99,65.000 Km,2013,C3 1.6 Vti 16v Exclusive Flex Aut. 5p,citroen
4,129.9,0 Km,2021,Jumpy Furgão,citroen


In [17]:
# Split the remaining description once more: the first token becomes the model
# and the rest goes to a new, correctly spelled 'informacoes' column.
# NOTE(review): for brands whose raw name has two words (e.g. 'Land' ->
# land_rover) the first remaining token is presumably the second word of the
# brand rather than the model — verify against the data.
modelo = df['infomacoes'].str.split(pat=" ", n=1, expand=True)
df = df.assign(modelo=modelo[0], informacoes=modelo[1])

In [18]:
# Remove the original misspelled column now that 'informacoes' replaces it.
df = df.drop(columns=['infomacoes'])

In [19]:
# Check the new 'modelo' and 'informacoes' columns.
df.head(n=5)

Unnamed: 0,preco,quilometragem,ano,marca,modelo,informacoes
0,16.99,104.000 Km,2008,citroen,Xsara,Picasso 1.6 Glx Flex 5p
1,43.9,77.000 Km,2018,citroen,Aircross,Aircross Start 1.6 Flex 16v 5p Mec.
2,163.9,0 Km,2021,citroen,Jumpy,Minibus
3,34.99,65.000 Km,2013,citroen,C3,1.6 Vti 16v Exclusive Flex Aut. 5p
4,129.9,0 Km,2021,citroen,Jumpy,Furgão


 - Tratando a coluna quilometragem.

In [20]:
# Separate the mileage figure from whatever follows the first space.
km = df.quilometragem.str.split(pat=" ", n=1, expand=True)

In [21]:
# Keep only the part before the first space (the figure without its unit).
df['quilometragem'] = km.loc[:, 0]

In [22]:
# Check that the unit suffix is gone from 'quilometragem'.
df.head(n=5)

Unnamed: 0,preco,quilometragem,ano,marca,modelo,informacoes
0,16.99,104.0,2008,citroen,Xsara,Picasso 1.6 Glx Flex 5p
1,43.9,77.0,2018,citroen,Aircross,Aircross Start 1.6 Flex 16v 5p Mec.
2,163.9,0.0,2021,citroen,Jumpy,Minibus
3,34.99,65.0,2013,citroen,C3,1.6 Vti 16v Exclusive Flex Aut. 5p
4,129.9,0.0,2021,citroen,Jumpy,Furgão


In [23]:
# Remove the '.' characters from the price and mileage strings.
# regex=False makes the pattern a literal dot on every pandas version: with the
# old default (regex=True) '.' is a regex metacharacter and only worked through
# a deprecated single-character special case.
# The columns remain strings; no numeric conversion is done here.
df['preco'] = df['preco'].str.replace('.', '', regex=False)
df['quilometragem'] = df['quilometragem'].str.replace('.', '', regex=False)

In [24]:
# Check 'preco' and 'quilometragem' after removing the dots.
df.head(n=5)

Unnamed: 0,preco,quilometragem,ano,marca,modelo,informacoes
0,16990,104000,2008,citroen,Xsara,Picasso 1.6 Glx Flex 5p
1,43900,77000,2018,citroen,Aircross,Aircross Start 1.6 Flex 16v 5p Mec.
2,163900,0,2021,citroen,Jumpy,Minibus
3,34990,65000,2013,citroen,C3,1.6 Vti 16v Exclusive Flex Aut. 5p
4,129900,0,2021,citroen,Jumpy,Furgão


 - Criando novo arquivo csv com os dados pré-processados.

In [25]:
# Write the pre-processed frame to the working directory, without the row index.
df.to_csv(
    'carros_ml_pre_processados.csv',
    encoding='utf-8',
    index=False,
)