In [2]:
import pandas as pd
import numpy as np
import os
import datetime
from scipy import stats
import missingno as msno

import matplotlib.pyplot as plt
import seaborn as sns

In [55]:
# Input CSV to clean; the export path at the end of the notebook is derived from this name.
filepath = 'data/standvirtual_cars_updated_200410.csv'
df = pd.read_csv(filepath_or_buffer=filepath)

# Lookup table pairing Portuguese ('port') and English ('eng') column names.
cols_trans = pd.read_csv(filepath_or_buffer='data/column_translations.csv')

In [56]:
# Portuguese-named columns that are dropped before the remaining headers are translated.
repeated_cols = [
    'Combustível',
    'Mês de Registo',
    'Ano de Registo',
    'Quilómetros',
    'Potência',
    'Valor Fixo',
    'Origem',
]

df = df.drop(repeated_cols, axis=1)

## Replace Portuguese column names with English
# For every remaining header, take the position of its first match in the 'port'
# column (IndexError if a header has no translation), then read the 'eng' name
# stored at that row label.
cols_ix = np.array([
    (cols_trans['port'] == port_name).to_numpy().nonzero()[0][0]
    for port_name in df.columns
])
df.columns = cols_trans['eng'].loc[cols_ix].values

## Clean and translate column values

In [52]:
# Recoding shared by every plain yes/blank flag column.
_YES_FLAG = {'Sim': 1}

# column -> {raw value: cleaned value}. Raw values that are not listed stay untouched.
_VALUE_RECODES = {
    'price_neg': {
        'Valor Fixo': 0,
        'Negociável': 1,
        'Negociável                                                , Valor negociável': 1,
        # Label says both "fixed" and "negotiable": recorded as missing.
        'Valor Fixo                                                 , Valor negociável': np.nan,
    },
    'advertiser': {
        # NOTE(review): 'individial' looks like a typo for 'individual'; kept
        # unchanged because downstream code may already filter on this value.
        'Particular': 'individial',
        'Profissional': 'professional',
    },
    'brand': {'Outra não listada': 'other'},
    'segment': {
        'Coupé': 'coupe',
        'Utilitário': 'utilitary',
        'Carrinha': 'van',
        'Monovolume': 'mini_van',
        'Pequeno citadino': 'city_small',
        'Citadino': 'city',
        'SUV / TT': 'suv',
    },
    'color': {
        'Branco': 'white',
        'Cinzento': 'gray',
        'Azul': 'blue',
        'Preto': 'black',
        'Prateado': 'silver',
        'Castanho': 'brown',
        'Vermelho': 'red',
        'Laranja': 'orange',
        'Verde': 'green',
        'Outra': 'other',
        'Bege': 'beige',
        'Roxo': 'purple',
        'Dourado': 'golden',
        'Amarelo': 'yellow',
    },
    'metallic': _YES_FLAG,
    'gear_type': {
        'Automática': 'automatic',
        'Manual': 'manual',
        'Semi-automática': 'semiauto',
    },
    'revisions_book_complete': _YES_FLAG,
    'non_smoker': _YES_FLAG,
    '2nd_key': _YES_FLAG,
    'open_ceiling': {
        'Tecto de Abrir Panorâmico': 'panoramic',
        'Tecto de Abrir Elétrico': 'electric',
        'Tecto de Abrir Manual': 'manual',
    },
    'alloy_wheels': {'Sim': 1, '17': 1},
    'upholstery': {
        'Estofos de Tecido': 'fabric',
        'Estofos de Pele': 'leather',
    },
    'vehicle_condition': {'Usados': 'used', 'Novos': 'new'},
    'accepts_recovery': _YES_FLAG,
    'finance_possible': _YES_FLAG,
    'electric_canopy': _YES_FLAG,
    'vat_deductable': _YES_FLAG,
    'price_without_iuc': _YES_FLAG,
    'canopy': {
        'Capota de Lona': 'canvas',
        'Capota Rígida': 'rigid',
        'Capota Hardtop': 'hardtop',
    },
    'saved': _YES_FLAG,
    'classic': {'Sim': 1, 'true': 1},
    'price_without_isv': _YES_FLAG,
    'air_conditioning': {
        'AC Automático': 'automatic',
        'AC Manual': 'manual',
        'AC Independente': 'independent',
    },
    'traction': {
        'Tracção traseira': 'back',
        'Tracção dianteira': 'front',
        'Integral': 'both',
    },
    'particle_filter': _YES_FLAG,
    'fuel_type': {
        'Gasolina': 'gasoline',
        'Híbrido (Gasolina)': 'hybrid_gasoline',
        'Híbrido (Diesel)': 'hybrid_diesel',
        'Eléctrico': 'electric',
    },
    'reg_month': {
        'Janeiro': 'january',
        'Fevereiro': 'february',
        'Março': 'march',
        'Abril': 'april',
        'Maio': 'may',
        'Junho': 'june',
        'Julho': 'july',
        'Agosto': 'august',
        'Setembro': 'september',
        'Outubro': 'october',
        'Novembro': 'november',
        'Dezembro': 'december',
    },
}

# Columns whose remaining (unmapped) values are lower-cased after recoding.
_LOWERCASED_COLUMNS = ('segment', 'fuel_type')


def _recode_values(df, column, mapping):
    """Overwrite, in place, each raw value of ``column`` listed in ``mapping``."""
    # Masks are taken from a snapshot so one replacement can never feed another.
    raw = df[column].copy()
    for old, new in mapping.items():
        df.loc[raw == old, column] = new


def _drop_substrings(text, *substrings):
    """Remove each literal substring, in the given order, from a string Series."""
    for substring in substrings:
        text = text.str.replace(substring, '', regex=False)
    return text


def _decimal_point(text):
    """Swap decimal commas for decimal points."""
    return text.str.replace(',', '.', regex=False)


def _cylinder_text(text):
    """'1 598 cm3' -> '1598'."""
    stripped = text.str.replace('cm3', '', regex=False).str.strip()
    return stripped.str.replace(' ', '', regex=False)


def _leading_decimal_text(text):
    """Keep the part before the first space, e.g. '5,6 l/100km' -> '5.6'."""
    return _decimal_point(text.str.split(' ').str[0])


def _reversed_date_text(text):
    """Reverse the '/'-separated parts and concatenate: '10/04/2020' -> '20200410'."""
    return text.str.split('/').str[::-1].str.join('')


def _kilometre_text(text):
    """'150 000 km' -> '150000'."""
    return _decimal_point(_drop_substrings(text, ' km', ' '))


def _months_compact_text(text):
    """'12 Meses' -> '12', also removing any remaining spaces."""
    return _drop_substrings(text, ' Meses', ' ')


def _months_text(text):
    """'12 Meses' -> '12' (only the unit suffix is removed)."""
    return _drop_substrings(text, ' Meses')


def _last_token_text(text):
    """Keep the part after the last space."""
    return text.str.split(' ').str[-1]


def _co2_text(text):
    """'99,5 g/km' -> '99.5'."""
    return _drop_substrings(_decimal_point(_drop_substrings(text, ' g/km')), ' ')


# column -> function turning the raw text Series into text pd.to_numeric can parse.
_NUMERIC_PARSERS = {
    'cylinder': _cylinder_text,
    'consumption_urban': _leading_decimal_text,
    'consumption_extra_urban': _leading_decimal_text,
    'consumption_combined': _leading_decimal_text,
    'alloy_wheels_size': _leading_decimal_text,
    'mechancal_guaranty_until_date': _reversed_date_text,
    'mechancal_guaranty_until_mileage': _kilometre_text,
    'stand_guaranty_in_price': _months_compact_text,
    'iuc': _leading_decimal_text,
    'inspection_validity_date': _reversed_date_text,
    'max_range': _kilometre_text,
    'stand_guaranty_not_in_price': _months_text,
    'class': _last_token_text,
    'co2_emissions': _co2_text,
}


def table_clean(df):
    """Translate categorical values to English codes and parse numeric text columns.

    Only columns present in ``df`` are processed; all others are ignored.

    * Categorical columns: the Portuguese values listed in ``_VALUE_RECODES``
      are replaced; unlisted values are kept ('segment' and 'fuel_type' are
      additionally lower-cased).
    * Measurement/date columns: units and thousands spaces are stripped,
      decimal commas become points, and the result goes through
      ``pd.to_numeric``.

    Columns that already have a numeric dtype are skipped by the text-based
    steps, so the function can be applied again to an already cleaned table
    (the ``.str`` accessor would otherwise raise AttributeError).

    Parameters
    ----------
    df : pandas.DataFrame
        Table with English column names. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenience.

    Raises
    ------
    ValueError
        From ``pd.to_numeric`` when a value is still not numeric after the
        text clean-up.
    """
    for column, mapping in _VALUE_RECODES.items():
        if column in df:
            # Always targets the single column: the contradictory 'price_neg'
            # label must blank that cell only, not the entire row.
            _recode_values(df, column, mapping)

    for column in _LOWERCASED_COLUMNS:
        if column in df and not pd.api.types.is_numeric_dtype(df[column]):
            df[column] = df[column].str.lower()

    for column, as_numeric_text in _NUMERIC_PARSERS.items():
        if column in df and not pd.api.types.is_numeric_dtype(df[column]):
            df[column] = pd.to_numeric(as_numeric_text(df[column]))

    return df


## Save

In [60]:
# Run the value-level cleaning before exporting. No other visible cell calls
# table_clean, so without this a fresh Restart & Run All would write the merely
# renamed table to the *_clean.csv file.
df = table_clean(df)

# Insert the suffix in front of the extension. str.replace('.csv', ...) would
# return the input path unchanged when it contains no '.csv', overwriting the
# source file.
savepath = '{}_clean{}'.format(*os.path.splitext(filepath))
df.to_csv(savepath, index=False)