# Tratamento de dados

## - Neste notebook foram extraídos dados existentes na coluna informações.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Location of the scraped listings CSV in the project's GitHub repository.
url = (
    'https://raw.githubusercontent.com/robertferro/carros_populares/main/'
    '1%20-%20Web%20scraping/carros_populares.csv'
)

In [3]:
# Load the raw listings into a DataFrame (comma-separated file).
df = pd.read_csv(url, sep=",")

In [4]:
df.head()

Unnamed: 0,modelo,preco,quilometragem,ano,infomacoes
0,chevrolet/agile/,25.99,91.000 Km,2011,Chevrolet Agile 1.4 Lt 5p
1,chevrolet/agile/,23.499,78.400 Km,2011,Chevrolet Agile 1.4 Ltz 5p
2,chevrolet/agile/,28.9,123.915 Km,2013,Chevrolet Agile 1.4 Ltz 5p
3,chevrolet/agile/,28.0,91.000 Km,2013,Chevrolet Agile 1.4 Lt 5p
4,chevrolet/agile/,31.99,106.863 Km,2014,Chevrolet Agile Agile 1.4 Ltz 8v Flex 4p Manual


In [5]:
df.shape

(12192, 5)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12192 entries, 0 to 12191
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   modelo         12192 non-null  object
 1   preco          12192 non-null  object
 2   quilometragem  12192 non-null  object
 3   ano            12192 non-null  int64 
 4   infomacoes     12192 non-null  object
dtypes: int64(1), object(4)
memory usage: 476.4+ KB


### Separando marca e modelo.

In [7]:
# The scraped "modelo" values look like "chevrolet/agile/"; split them on "/"
# (at most two splits) into separate columns.
marca = df["modelo"].str.split("/", n=2, expand=True)

In [8]:
# Piece 0 is the brand (new column); piece 1 is the model, which overwrites
# the original "brand/model/" path stored in "modelo".
df["marca"], df["modelo"] = marca[0], marca[1]

In [9]:
df.marca.value_counts()

fiat          1824
chevrolet     1632
volkswagen    1632
renault        864
ford           864
citroen        864
peugeot        864
mitsubishi     768
hyundai        672
nissan         672
honda          576
toyota         576
kia            384
Name: marca, dtype: int64

In [10]:
df.modelo.value_counts()

linea         96
C4-Picasso    96
fluence       96
polo-Sedan    96
207           96
              ..
bravo         96
argo          96
astra         96
vectra        96
etios         96
Name: modelo, Length: 127, dtype: int64

### Reorganizando os dados da coluna "informacoes"

In [11]:
# Split each description once on the first space and keep everything after
# the first word, stored under the correctly spelled column name.
informacoes = df["infomacoes"].str.split(" ", n=1, expand=True)
df["informacoes"] = informacoes[1]

In [12]:
# The misspelled source column is no longer needed once "informacoes" exists.
df = df.drop(columns=["infomacoes"])

In [13]:
df.head()

Unnamed: 0,modelo,preco,quilometragem,ano,marca,informacoes
0,agile,25.99,91.000 Km,2011,chevrolet,Agile 1.4 Lt 5p
1,agile,23.499,78.400 Km,2011,chevrolet,Agile 1.4 Ltz 5p
2,agile,28.9,123.915 Km,2013,chevrolet,Agile 1.4 Ltz 5p
3,agile,28.0,91.000 Km,2013,chevrolet,Agile 1.4 Lt 5p
4,agile,31.99,106.863 Km,2014,chevrolet,Agile Agile 1.4 Ltz 8v Flex 4p Manual


In [14]:
# tail(33) requests more rows than there are distinct brands (13 in the
# output shown for this cell), so the whole frequency table is displayed.
df.marca.value_counts().tail(33)

fiat          1824
chevrolet     1632
volkswagen    1632
renault        864
ford           864
citroen        864
peugeot        864
mitsubishi     768
hyundai        672
nissan         672
honda          576
toyota         576
kia            384
Name: marca, dtype: int64

In [None]:
# pd.set_option('display.max_rows', 1001)
# df.head(1001)

 - Tratando as colunas quilometragem e preço.

In [16]:
# Mileage strings look like "91.000 Km": split once on the space so the
# number and the unit land in separate columns.
km = df["quilometragem"].str.split(" ", n=1, expand=True)

In [17]:
# Keep only the numeric part (piece 0), discarding the unit.
df["quilometragem"] = km[0]

In [18]:
# Both columns use "." as a thousands separator (e.g. "91.000"); remove it.
# regex=False is stated explicitly: "." is a regex wildcard and the default
# value of `regex` differs between pandas versions, so relying on the default
# is ambiguous (and emits a FutureWarning on older releases).
df['preco'] = df['preco'].str.replace('.', '', regex=False)
df['quilometragem'] = df['quilometragem'].str.replace('.', '', regex=False)

In [21]:
# Convert the cleaned mileage strings to floats.
# NOTE(review): any "," is replaced by the digit "0" rather than removed or
# turned into a decimal point — confirm this is the intended handling.
df["quilometragem"] = df["quilometragem"].str.replace(",", "0").astype(float)

In [22]:
df.head()

Unnamed: 0,modelo,preco,quilometragem,ano,marca,informacoes
0,agile,25990,91000.0,2011,chevrolet,Agile 1.4 Lt 5p
1,agile,23499,78400.0,2011,chevrolet,Agile 1.4 Ltz 5p
2,agile,28900,123915.0,2013,chevrolet,Agile 1.4 Ltz 5p
3,agile,28000,91000.0,2013,chevrolet,Agile 1.4 Lt 5p
4,agile,31990,106863.0,2014,chevrolet,Agile Agile 1.4 Ltz 8v Flex 4p Manual


## Criando a coluna motor.

 - Função para pegar apenas a potência do motor.

In [23]:
def pegar_potencia_do_motor(entrada):
    """Extract the engine-size token (e.g. ``"1.4"``) from a vehicle description.

    The description is split on whitespace and the first token that is exactly
    three characters long and contains a ``"."`` is returned unchanged, as a
    string. The check is purely structural, so malformed tokens such as
    ``"2.o"`` are returned as-is and must be cleaned by the caller.

    Parameters
    ----------
    entrada : str
        Free-text vehicle description.

    Returns
    -------
    str or float
        The first matching token, or ``np.nan`` when no token matches or when
        ``entrada`` is not a string (for example a missing value).
    """
    # Missing descriptions arrive as NaN/None, which have no .split();
    # treat them as "no engine found" instead of raising AttributeError.
    if not isinstance(entrada, str):
        return np.nan

    for token in entrada.split():
        if len(token) == 3 and '.' in token:
            return token

    return np.nan

In [24]:
# Derive the "motor" column from each description, then preview the result.
df["motor"] = df["informacoes"].apply(pegar_potencia_do_motor)
df.head()

Unnamed: 0,modelo,preco,quilometragem,ano,marca,informacoes,motor
0,agile,25990,91000.0,2011,chevrolet,Agile 1.4 Lt 5p,1.4
1,agile,23499,78400.0,2011,chevrolet,Agile 1.4 Ltz 5p,1.4
2,agile,28900,123915.0,2013,chevrolet,Agile 1.4 Ltz 5p,1.4
3,agile,28000,91000.0,2013,chevrolet,Agile 1.4 Lt 5p,1.4
4,agile,31990,106863.0,2014,chevrolet,Agile Agile 1.4 Ltz 8v Flex 4p Manual,1.4


In [25]:
df.motor.value_counts()

1.6    2819
2.0    2247
1.0    1937
1.8    1255
1.4    1188
1.5     540
2.4     201
2.5     180
2.3     178
2.8     176
1.3     149
3.2     135
3.0     130
3.5      64
2.2      47
1.2      46
2.7      44
3.3      39
3.8      38
1.9      29
1.7      12
4.0       5
1.1       3
2.o       2
(n.       1
Name: motor, dtype: int64

In [26]:
# Some listings typed the letter "o" instead of zero (e.g. "2.o"); normalise it.
df["motor"] = df["motor"].str.replace("o", "0")

In [27]:
df.motor.value_counts()

1.6    2819
2.0    2249
1.0    1937
1.8    1255
1.4    1188
1.5     540
2.4     201
2.5     180
2.3     178
2.8     176
1.3     149
3.2     135
3.0     130
3.5      64
2.2      47
1.2      46
2.7      44
3.3      39
3.8      38
1.9      29
1.7      12
4.0       5
1.1       3
(n.       1
Name: motor, dtype: int64

In [28]:
# Discard the row whose extracted "motor" token is the stray text "(n.";
# rows with a missing motor value are kept.
df = df[df["motor"] != "(n."]

In [29]:
df.motor.value_counts()

1.6    2819
2.0    2249
1.0    1937
1.8    1255
1.4    1188
1.5     540
2.4     201
2.5     180
2.3     178
2.8     176
1.3     149
3.2     135
3.0     130
3.5      64
2.2      47
1.2      46
2.7      44
3.3      39
3.8      38
1.9      29
1.7      12
4.0       5
1.1       3
Name: motor, dtype: int64

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12191 entries, 0 to 12191
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   modelo         12191 non-null  object 
 1   preco          12191 non-null  object 
 2   quilometragem  12191 non-null  float64
 3   ano            12191 non-null  int64  
 4   marca          12191 non-null  object 
 5   informacoes    12191 non-null  object 
 6   motor          11464 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 761.9+ KB


In [31]:
# Cast the cleaned string columns to floats in a single pass.
df = df.astype({"preco": float, "motor": float})

In [32]:
df.head()

Unnamed: 0,modelo,preco,quilometragem,ano,marca,informacoes,motor
0,agile,25990.0,91000.0,2011,chevrolet,Agile 1.4 Lt 5p,1.4
1,agile,23499.0,78400.0,2011,chevrolet,Agile 1.4 Ltz 5p,1.4
2,agile,28900.0,123915.0,2013,chevrolet,Agile 1.4 Ltz 5p,1.4
3,agile,28000.0,91000.0,2013,chevrolet,Agile 1.4 Lt 5p,1.4
4,agile,31990.0,106863.0,2014,chevrolet,Agile Agile 1.4 Ltz 8v Flex 4p Manual,1.4


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12191 entries, 0 to 12191
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   modelo         12191 non-null  object 
 1   preco          12191 non-null  float64
 2   quilometragem  12191 non-null  float64
 3   ano            12191 non-null  int64  
 4   marca          12191 non-null  object 
 5   informacoes    12191 non-null  object 
 6   motor          11464 non-null  float64
dtypes: float64(3), int64(1), object(3)
memory usage: 761.9+ KB


In [34]:
df.columns

Index(['modelo', 'preco', 'quilometragem', 'ano', 'marca', 'informacoes',
       'motor'],
      dtype='object')

In [35]:
# Reorder the columns for the exported file.
df = df.loc[
    :,
    ["marca", "modelo", "motor", "quilometragem", "ano", "preco", "informacoes"],
]

 - Criando novo arquivo csv com os dados pré-processados.

In [36]:
df.head()

Unnamed: 0,marca,modelo,motor,quilometragem,ano,preco,informacoes
0,chevrolet,agile,1.4,91000.0,2011,25990.0,Agile 1.4 Lt 5p
1,chevrolet,agile,1.4,78400.0,2011,23499.0,Agile 1.4 Ltz 5p
2,chevrolet,agile,1.4,123915.0,2013,28900.0,Agile 1.4 Ltz 5p
3,chevrolet,agile,1.4,91000.0,2013,28000.0,Agile 1.4 Lt 5p
4,chevrolet,agile,1.4,106863.0,2014,31990.0,Agile Agile 1.4 Ltz 8v Flex 4p Manual


In [37]:
# Write the pre-processed data to disk without the index column.
df.to_csv(
    "carros_populares_manipulado.csv",
    index=False,
    encoding="utf-8",
)