In [11]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import re

In [12]:
df = pd.read_csv('train.csv')

In [13]:
Correct_Max_Power = []
for power in df['max_power']:
    if pd.notna(power):
        if isinstance(power, str):
            if 'bhp' in power:
                num_bhp = float(re.search(r'\d+.?\d', power).group())
                Correct_Max_Power.append(num_bhp)
            elif 'PS' in power:
                num_ps = float(re.search(r'\d+.?\d', power).group())
                num_bhp = round(num_ps * 0.98632, 2)
                Correct_Max_Power.append(num_bhp)
            else:
                num = re.search(r'\d+\.?\d*', power)
                Correct_Max_Power.append(float(num.group()) if num else None)
        else:
            Correct_Max_Power.append(power)
    else:
        Correct_Max_Power.append(None)


# Converter os valores não NaN para float
Correct_Max_Power = [float(value) if value is not None else value for value in Correct_Max_Power]

df['max_power'] = Correct_Max_Power


In [14]:
# Remover linhas com NaN na coluna 'max_power'
df = df.dropna(subset=['max_power'])
print(df['max_power'])

0         83.8
1         88.7
2        117.3
3        167.6
4         83.1
         ...  
13951    197.2
13952     81.8
13953    108.6
13954     88.5
13955     86.7
Name: max_power, Length: 13867, dtype: float64


In [15]:
# Change the data types of multiple columns
df = df.astype({
    
    'max_power': float,
    
})

In [16]:
# Check data types
data_types = df.dtypes
print("Data type by column:")
print(data_types)

Data type by column:
full_name             object
registered_year       object
engine_capacity       object
insurance             object
transmission_type     object
kms_driven            object
owner_type            object
fuel_type             object
max_power            float64
seats                float64
mileage               object
body_type             object
city                  object
resale_price_Lakh    float64
dtype: object


In [17]:
# create groups: max_power

# define os limites dos grupos
limites = [0, 100, 200, 300, 400, 500, 600]  
rotulos = ['0-100 bhp', '100-200 bhp', '200-300 bhp', '300-400 bhp', '400-500 bhp', '500-600 bhp']

# nova coluna 'max_power_group'
df['max_power_group'] = pd.cut(df['max_power'], bins=limites, labels=rotulos, right=False)

print(df)


                                               full_name registered_year  \
0                                2019 Tata Tiago XZ Plus        Nov 2019   
1                              2018 Honda WR-V i-VTEC VX        Apr 2018   
2                              2015 Honda City i VTEC SV        May 2015   
3      2021 Tata New Safari XZA Plus Adventure Editio...            2021   
4                        2019 Maruti Baleno 1.2 CVT Zeta        Jun 2019   
...                                                  ...             ...   
13951                          2021 Kia Carnival Premium            2021   
13952               2017 Hyundai i20 1.2 Magna Executive        Dec 2017   
13953             2017 Skoda Rapid 1.5 TDI Ambition BSIV        Jan 2018   
13954                      2017 Maruti Vitara Brezza ZDi        Apr 2017   
13955                         2015 Honda Amaze S i-Vtech        Mar 2015   

      engine_capacity              insurance transmission_type  kms_driven  \
0        

In [18]:
# unique 'max_power_group'
valores_unicos_kms_group= df['max_power_group'].unique()
print(valores_unicos_kms_group)

['0-100 bhp', '100-200 bhp', '200-300 bhp', '300-400 bhp', '500-600 bhp', '400-500 bhp']
Categories (6, object): ['0-100 bhp' < '100-200 bhp' < '200-300 bhp' < '300-400 bhp' < '400-500 bhp' < '500-600 bhp']


In [19]:
# Check ID_kms

# Sua coluna de tipos de combustível
max_power_group = df['max_power_group']

# Número de documentos (ou entradas) no DataFrame
n_docs = len(df)

idf = {}

# Usando drop_duplicates para obter valores únicos
unique_max_power_group = max_power_group.drop_duplicates()

for max_power_group in unique_max_power_group:
    k = sum(max_power_group in row for row in df['max_power_group'])  # contagem de documentos que contêm este tipo de combustível
    idf[max_power_group] = np.log10(n_docs / (k + 1))  # Adicionei +1 para evitar divisão por zero

for max_power_group, value in idf.items():
    print(f'{max_power_group:>15}: {value:>10}')




      0-100 bhp: 0.2193089476942497
    100-200 bhp: 0.4385191736695107
    200-300 bhp: 1.5737807914858088
    300-400 bhp: 2.3426419660992224
    500-600 bhp: 3.2388925285608603
    400-500 bhp: 3.1005898303945787


In [20]:
# Check ID_kms

# Sua coluna de tipos de combustível
max_power_group = df['max_power_group']

# Número de documentos (ou entradas) no DataFrame
n_docs = len(df)

idf = {}

# Usando drop_duplicates para obter valores únicos
unique_max_power_group = max_power_group.drop_duplicates()

for max_power_group in unique_max_power_group:
    k = sum(max_power_group in row for row in df['max_power_group'])  # contagem de documentos que contêm este tipo de combustível
    idf[max_power_group] = np.log10(n_docs / (k + 1))  # Adicionei +1 para evitar divisão por zero

for max_power_group, value in idf.items():
    print(f'{max_power_group:>15}: {value:>10}')


# Criar nova coluna 'idf' no DataFrame
df['id_max_power_group'] = df['max_power_group'].map(idf)

# Se desejar imprimir o DataFrame com a nova coluna
print(df)

      0-100 bhp: 0.2193089476942497
    100-200 bhp: 0.4385191736695107
    200-300 bhp: 1.5737807914858088
    300-400 bhp: 2.3426419660992224
    500-600 bhp: 3.2388925285608603
    400-500 bhp: 3.1005898303945787
                                               full_name registered_year  \
0                                2019 Tata Tiago XZ Plus        Nov 2019   
1                              2018 Honda WR-V i-VTEC VX        Apr 2018   
2                              2015 Honda City i VTEC SV        May 2015   
3      2021 Tata New Safari XZA Plus Adventure Editio...            2021   
4                        2019 Maruti Baleno 1.2 CVT Zeta        Jun 2019   
...                                                  ...             ...   
13951                          2021 Kia Carnival Premium            2021   
13952               2017 Hyundai i20 1.2 Magna Executive        Dec 2017   
13953             2017 Skoda Rapid 1.5 TDI Ambition BSIV        Jan 2018   
13954                   