In [1]:
import json
import os
import pandas as pd
import numpy as np
import re

current_dir = os.getcwd()
parent_dir = os.path.join(current_dir, os.pardir, os.pardir)
parent_dir = os.path.abspath(parent_dir)
bronze_dir = os.path.join(parent_dir, 'bronze')
bronze_files = os.listdir(bronze_dir)
silver_dir = os.path.join(parent_dir, 'silver')
silver_files = os.listdir(silver_dir)


In [2]:
keys = []

for file in bronze_files:
    with open(os.path.join(bronze_dir, file), 'r', encoding="utf-8") as f:
        data = json.load(f)
        # Get the keys from the first file and append them if they are not already in the list
        for key in data.keys():
            if key not in keys:
                keys.append(key)

In [3]:
# create a dataframe with all the json files
dicts = []

for file in bronze_files:
    with open(os.path.join(bronze_dir, file), 'r', encoding="utf-8") as f:
        data = json.load(f)
        dicts.append(data)

df = pd.DataFrame(dicts, columns=keys)

df.head()

Unnamed: 0,Production years,"Displacement, cc",Fuel system,"Power output, hp","Torque output, Nm",Cylinder block,Block head,"Cylinder bore, mm","Piston stroke, mm",Compression ratio,...,Optimal revs,Oil change volume including oil filter,Oil filters,"Cooling, overall volume",Dry weight (base engine),"Dry weight, kg","Combustion chamber working volume, cc","Engine lifespan, thousand km— official information",Max RPM,Type
0,2008-2015,1598.0,Common Rail,120.0,300 – 320,cast iron R4,aluminum 16v,79.5,80.5,16.5,...,,,,,,,,,,
1,2000-2002,2387.0,Common Rail,140.0,305.0,cast iron R5,aluminum 10v,82.0,90.4,18.45,...,,,,,,,,,,
2,2002-2010,1970.0,direct injection,165.0,206.0,cast iron R4,aluminum 16v,83.0,91.0,11.3,...,,,,,,,,,,
3,2003-2010,1910.0,Common Rail,150.0,305.0,cast iron R4,aluminum 16v,82.0,90.4,17.5,...,,,,,,,,,,
4,2005-2010,3195.0,direct injection,260.0,322.0,aluminum V6,aluminum 24v,89.0,85.6,11.25,...,,,,,,,,,,


In [6]:
def normalize_production_years(value):
    
    if isinstance(value, float) or isinstance(value, int):
        # Convertir números sueltos en un rango "YYYY-YYYY"
        return f"{int(value)}-{int(value)}"
    
    if not isinstance(value, str):
        return value

    # Reemplazar patrones 'since'
    value = re.sub(r'since (\d{4})', r'\1-\1', value)
    
    # Reemplazar '…' con el mismo año
    value = re.sub(r'(\d{4})-…', r'\1-\1', value)
    
    # Reemplazar espacios y guiones no estándar
    value = re.sub(r'\s+–\s+|\s*–\s*', '-', value)
    
    # Estandarizar fechas tipo '09/2005'
    match = re.match(r'(\d{2})/(\d{4})', value)
    if match:
        return f"{match.group(2)}-{match.group(2)}"

    # Detectar y corregir rangos de años fuera de rango (e.g., '19999-20014')
    match = re.match(r'(\d{4,5})-(\d{4,5})', value)
    if match:
        start_year, end_year = int(match.group(1)), int(match.group(2))
        # Validar si los años están en un rango razonable
        if start_year > 999 and start_year <= 9999 and end_year > 999 and end_year <= 9999:
            return f"{start_year}-{end_year}"
        # Corrige si los años tienen más de 4 dígitos
        if start_year > 9999:
            start_year = start_year // 10
        if end_year > 9999:
            end_year = end_year // 10
        return f"{start_year}-{end_year}"

    # Retornar el valor original si no coincide con ningún patrón conocido
    return value


In [7]:
def clean_displacement(value):
    
    if isinstance(value, float):
        return value
    
    # Extraer el valor numérico del texto
    match = re.match(r'(\d+(?:,\d+)?)(?:\s*dm3|\s*cc)?', value)
    if match:
        num_str = match.group(1)
        # Convertir coma decimal a punto decimal
        num_str = num_str.replace(',', '.')
        num_value = float(num_str)

        # Si el valor estaba en dm³, convertir a cc (1 dm³ = 1000 cc)
        if 'dm3' in value:
            num_value *= 1000
        
        # Convertir a entero si es posible (e.g., 16000.0 -> 16000)
        return int(num_value) if num_value.is_integer() else num_value
    
    return value

In [175]:
def clean_and_average_torque(value):

    if isinstance(value, str):
        # Normalize dashes
        value = re.sub(r'[\u2013\u2014\u2015\-]', '-', value)
        
        # Case 1: Handle "NNN/NNNN" pattern (torque/rpm)
        if re.search(r'\d{3}/\d{4}', value):
            numbers = re.findall(r'\b\d{3}(?=/)', value)
            numeric_values = [float(num) for num in numbers]
            avg_torque = np.mean(numeric_values)
            print('a')
        
        # Case 2: Handle "NNN /NNNN rpm" pattern there is a possible space after the slash
        elif re.match(r'\d{3} / ?\d{4} rpm', value):
            number = re.match(r'(\d{3}) / ?\d{4} rpm', value).group(1)
            avg_torque = float(number)
            print('b')
        
        # Case 3: Handle "NNNNNN (Description)" pattern
        elif re.match(r'\d{6} \(.+\)', value):
            numbers = re.findall(r'\d{3}', value[:6])
            numeric_values = [float(num) for num in numbers]
            avg_torque = np.mean(numeric_values)
            print('c')
        
        # Case 4: Handle continuous digits (multiple torque values)
        elif re.match(r'^(\d{3,4})+$', value):
            numbers = re.findall(r'\d{3,4}', value)
            numeric_values = [float(num) for num in numbers]
            avg_torque = np.mean(numeric_values)
            print('d')
        
        # Case 5: Handle specific pattern 4 torques in a row
        elif len(value) == 14  and value[-2] == '.':
            value = value.split('.')[0]
            numbers = [value[i:i+3] for i in range(0, len(value), 3)]
            numeric_values = [float(num) for num in numbers]
            avg_torque = np.mean(numeric_values)
            print('e')

        # Case 6: Handle "NN – NN NN – NN (Description)" pattern
        elif re.search(r'(\d{2,3}\s*-?\s*\d{2,3}\s*){1,2}\(', value):
            numbers = re.findall(r'\d{2,4}', value)
            print(numbers)
            length = len(numbers[0])
            numeric_values = [float(num) if len(num) == length else float(num[0:length]) for num in numbers]
            print(numeric_values)
            avg_torque = np.mean(numeric_values)

        # Case 7: Handle "NNN (numbers and text)NNN (numbers and text)" pattern
        elif re.search(r'\d{3}\s*\(.+\)\s*\d{3}\s*\(.+\)', value):
            numbers = re.findall(r'\d{3}(?=\s*\()', value)
            print(numbers)
            numeric_values = [float(num) for num in numbers]
            avg_torque = np.mean(numeric_values)
            print('k')

        # Case 8: Handle "NNN (numbers and text)"
        elif re.search(r'\d{3}\s*\(.+\)', value):
            numbers = float(re.findall(r'\d{3}', value)[0])
            avg_torque = np.mean(numbers)
            print('l')

        else:
            # Remove RPM values or any numbers followed by '/'
            value = re.sub(r'\d+/\d+', '', value)
            print('f')
            print(value)
            
            # Find all sequences of 3 to 4 digits
            numbers = re.findall(r'\b\d{3,4}\b', value)
            print(numbers)
            
            if numbers:
                # Convert all found numbers to floats
                numeric_values = [float(num) for num in numbers]
                # Calculate the average if there are multiple numbers
                avg_torque = np.mean(numeric_values)
            else:
                # If no numbers are found, append NaN
                avg_torque = np.nan
    else:
        
        if isinstance(value, float):
            # Case 4.1: Handle continuous digits (multiple torque values)
            if re.match(r'^(\d{3,4})+$', str(value)[:-2]):
                value = str(value)[:-2]
                if len(value) % 3 == 0:
                    numbers = [value[i:i+3] for i in range(0, len(value), 3)]
                elif len(value) % 4 == 0:
                    numbers = [value[i:i+4] for i in range(0, len(value), 4)]
                numeric_values = [float(num) for num in numbers]
                avg_torque = np.mean(numeric_values)
            # Case 5.1: Handle specific pattern 4 torques in a row
            elif len(str(value)) == 14  and str(value)[-2] == '.':
                str_value = str(value).split('.')[0]
                numbers = [str_value[i:i+3] for i in range(0, len(str_value), 3)]
                numeric_values = [float(num) for num in numbers]
                avg_torque = np.mean(numeric_values)

            else:
                avg_torque = value
        else:
            avg_torque = value


    return np.round(avg_torque, 1)


In [177]:
def clean_json_file(data):

    # Check if 'Production Years' field exists and normalize it
    if 'Production years' in data:
        data['Production years'] = normalize_production_years(data['Production years'])

    # Limpieza del campo 'Displacement'
    if 'Displacement' in data:
        cleaned_value = clean_displacement(data['Displacement'])
        # Asignar el valor limpio al campo 'Displacement, cc'
        if 'Displacement, cc' not in data:
            data['Displacement, cc'] = cleaned_value
        del data['Displacement']

    if 'Torque output, Nm' in data:
        data['Torque output, Nm'] = clean_and_average_torque(data['Torque output, Nm'])


    return data

In [178]:
for file in bronze_files:
    with open(os.path.join(bronze_dir, file), 'r', encoding="utf-8") as f:
        data = json.load(f)
        data = clean_json_file(data)

        # Write back the changes to the file in the silver folder
        with open(os.path.join(silver_dir, file), 'w', encoding='utf-8') as silver_file:
            json.dump(data, silver_file, indent=4, ensure_ascii=False) 
        

f
300 - 320
['300', '320']
f
340 - 350
['340', '350']
f
137 - 145
['137', '145']
f
186 - 190
['186', '190']
['180', '2203', '20']
[180.0, 220.0, 20.0]
f
250 - 450
['250', '450']
f
450 - 500
['450', '500']
b
b
b
b
b
b
b
b
b
b
b
b
c
['180', '220', '250']
k
f
220 - 250
['220', '250']
b
b
b
b
b
b
['180', '2001', '90', '210', '46']
[180.0, 200.0, 90.0, 210.0, 46.0]
f
230 - 250
['230', '250']
f
270 - 315
['270', '315']
f
235 - 240
['235', '240']
['270', '53', '30', '310', '320', '53', '30']
[270.0, 53.0, 30.0, 310.0, 320.0, 53.0, 30.0]
['400', '450']
k
['400', '450', '465']
k
b
b
b
b
f
600 - 650
['600', '650']
['600', '650', '750']
k
['780', '870', '74', '66', '800', '850', '74', '66']
[780.0, 870.0, 74.0, 66.0, 800.0, 850.0, 74.0, 66.0]
f
850 - 900
['850', '900']
b
f
152 / 154
['152', '154']
f
350 - 360
['350', '360']
f
280 - 300
['280', '300']
f
350 - 400
['350', '400']
f
175 - 185
['175', '185']
f
210 - 230
['210', '230']
f
255 - 265
['255', '265']
f
245 - 275
['245', '275']
f
350 - 355
[

In [179]:
# create a dataframe with all the json files
dicts = []

for file in silver_files:
    with open(os.path.join(silver_dir, file), 'r', encoding="utf-8") as f:
        data = json.load(f)
        if 'Displacement' in data:
            print(data)
        dicts.append(data)

df = pd.DataFrame(dicts)

df.head()

Unnamed: 0,Production years,"Displacement, cc",Fuel system,"Power output, hp","Torque output, Nm",Cylinder block,Block head,"Cylinder bore, mm","Piston stroke, mm",Compression ratio,...,Optimal revs,Oil change volume including oil filter,Oil filters,"Cooling, overall volume",Dry weight (base engine),"Dry weight, kg","Combustion chamber working volume, cc","Engine lifespan, thousand km— official information",Max RPM,Type
0,2008-2015,1598.0,Common Rail,120.0,310.0,cast iron R4,aluminum 16v,79.5,80.5,16.5,...,,,,,,,,,,
1,2000-2002,2387.0,Common Rail,140.0,305.0,cast iron R5,aluminum 10v,82.0,90.4,18.45,...,,,,,,,,,,
2,2002-2010,1970.0,direct injection,165.0,206.0,cast iron R4,aluminum 16v,83.0,91.0,11.3,...,,,,,,,,,,
3,2003-2010,1910.0,Common Rail,150.0,305.0,cast iron R4,aluminum 16v,82.0,90.4,17.5,...,,,,,,,,,,
4,2005-2010,3195.0,direct injection,260.0,322.0,aluminum V6,aluminum 24v,89.0,85.6,11.25,...,,,,,,,,,,


In [99]:
list(df.columns)

['Production years',
 'Displacement, cc',
 'Fuel system',
 'Power output, hp',
 'Torque output, Nm',
 'Cylinder block',
 'Block head',
 'Cylinder bore, mm',
 'Piston stroke, mm',
 'Compression ratio',
 'Features',
 'Hydraulic lifters',
 'Timing drive',
 'Phase regulator',
 'Turbocharging',
 'Recommended engine oil',
 'Engine oil capacity, liter',
 'Fuel type',
 'Euro standards',
 'Engine lifespan, km',
 'Weight, kg',
 'title',
 'description',
 'city_fuel_consumption',
 'highway_fuel_consumption',
 'combined_fuel_consumption',
 'Manufacturer',
 'Family',
 'Cylinder block alloy',
 'Configuration',
 'Number of cylinders',
 'Valves per cylinder',
 'Torque output, Nm / rpm',
 'Oil consumption, gr/1000 km',
 'Normal engine operating temperature, °C',
 'Also called',
 'Oil change interval, km',
 'Called',
 'Production',
 'Fuel delivery system',
 'Number of valves per cylinder',
 'Piston stroke, mm (inch)',
 'Cylinder bore, mm (inch)',
 'Displacement, cc (cu in)',
 'Power output, HP/rpm',
 'To

In [176]:
file = 'Engine Hyundai-Kia G4NE.json'
with open(os.path.join(bronze_dir, file), 'r', encoding="utf-8") as f:
    data = json.load(f)
    print(data['Torque output, Nm'])
    data = clean_json_file(data)

data

180 (245 with electrical component in 2012-2013 , 319 in 2013-2015)
l


{'Production years': '2012-2015',
 'Displacement, cc': 1999.0,
 'Fuel system': 'distributed injection',
 'Power output, hp': '150 (190 with electrical component in 2012-2013 , 177 in 2013-2015)',
 'Torque output, Nm': np.float64(180.0),
 'Cylinder block': 'aluminum R4',
 'Block head': 'aluminum 16v',
 'Cylinder bore, mm': 81.0,
 'Piston stroke, mm': 97.0,
 'Compression ratio': 12.5,
 'Features': 'Atkinson cycle',
 'Hydraulic lifters': 'yes',
 'Timing drive': 'chain',
 'Phase regulator': 'Dual CVVT',
 'Turbocharging': 'no',
 'Recommended engine oil': '5W-30',
 'Engine oil capacity, liter': 4.3,
 'Fuel type': 'petrol',
 'Euro standards': 'EURO 5',
 'Engine lifespan, km': 250000.0,
 'title': 'Engine Hyundai-Kia G4NE',
 'description': 'The company assembled the 2.0-liter Hyundai G4NE or 2.0 MPi Hybrid engine from 2012 to 2015 and installed it on the hybrid versions of the Sonata 6 and the similar Optima 3 for the Asian market. In the US market, such hybrids were equipped with a 2.4-literG4

In [171]:
filt = df['Torque output, Nm'] == 1256.7
# filt = df['title'] == 'Engine BMW M40B16'
df[filt]['title']
# df[filt]

410    Engine Hyundai-Kia G4NE
Name: title, dtype: object

In [181]:
df['Torque output, Nm / rpm'].unique()

array([nan, '160/4000170/4000170/4000174/4000174/4300',
       '215/4000226/4000222/4300', '255/3500 / 260/4000 / 275/4000',
       '290/4000305/4000310/4000', '360 /4750 rpm',
       '305 /4250 rpm320 /3600 rpm323 /3900 rpm',
       '350 /3250 rpm340 /3250 rpm', '320 /3800 rpm',
       '340 /4900 rpm350 /4900 rpm355 /4900 rpm365 /4900 rpm370 /4900 rpm',
       '500 /3800 rpm',
       '680 /1500-5650 rpm680 /1500-5750 rpm680 /1500-6000 rpm700 /1500-6000 rpm',
       '400 /3900 rpm', '520 /6200 rpm', '130/3400', '131/4200',
       '150/4000', '140/1400', 1200.0, 981.0, 579.0, 1250.0, 2100.0,
       '695,5 / 1300', '3679 at 2100 rpm',
       '5800/1200-2100…6220/1200-2100', '1000 – 1500',
       '200 / 3000240 / 2500240 / 2750260 / 2500', '210/1800', '200/4000',
       '245/4400240/4000234/4000244/4000244/4000244/5200329/3600',
       '167/4000172/4400181/4000206/3200181/5200184/5200186/5600216/3200226/3600240/4800265/3200',
       '353/4400368/4400392/4400', '156/4000156/4800167/4800', 

In [185]:
bmw_filt = df['title'].str.contains('BMW')
bmw_df = df[bmw_filt]
for desc in bmw_df['description']:
    print(desc)

The 1.5-liter BMW B38 3-cylinder engine series has been assembled since 2013 and is installed on cars with front-wheel drive like B38A15, rear-wheel drive like B38B15 and hybrids like B38K15. Also, these units are installed on the Mini: 1.2-liter B38A12A and 1.5-liter B38A15A.B38 gasoline engines made their debut as part of the i8 coupe hybrid power plant, but conventional modifications soon appeared. By design, there is an aluminum block with plasma-sprayed steel and a closed jacket, an aluminum 12-valve cylinder head equipped with hydraulic lifters and direct fuel injection, Vanos phase shifters on both camshafts, plus a Valvetronic system and a timing chain drive. The engine is supercharged by a single water-cooled Continental turbocharger. It should also be noted the presence of a balancing shaft and a Bosch MEVD 17.2.3 control unit.B38A15U0: 102 hp, 180 Nm. The engine was installed on:B38A15U1: 109 hp, 190 Nm. The engine was installed on:B38A15M0: 136 hp, 220Nm. The engine was ins