In [7]:
import json
import os
import pandas as pd
import numpy as np
import re

# The project root sits two levels above the directory the notebook runs from.
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir, os.pardir))

# --- motorlist: stage folders plus the relationships folder ---
bronze_dir = os.path.join(parent_dir, 'datasets', 'motorlist', '2_bronze')
silver_dir = os.path.join(parent_dir, 'datasets', 'motorlist', '3_silver')
motorlist_gold_dir = os.path.join(parent_dir, 'datasets', 'motorlist', '4_gold')
motorlist_relationships_dir = os.path.join(parent_dir, 'datasets', 'motorlist', 'relationships')

# --- km77: transformed input, gold output and the relationships folder ---
km77_dir = os.path.join(parent_dir, 'datasets', 'km77')
km77_data_dir = os.path.join(km77_dir, 'Transformed_data')
km77_gold_dir = os.path.join(km77_dir, '4_gold')
km77_relationships_dir = os.path.join(km77_dir, 'relationships')

# Directory listings are snapshots taken when this cell runs; they are not
# refreshed when later cells create files or folders.
bronze_files = os.listdir(bronze_dir)
silver_files = os.listdir(silver_dir)
motolist_gold_files = os.listdir(motorlist_gold_dir)  # spelling kept: other cells may use this name
km77_folders = os.listdir(km77_data_dir)
km77_gold_files = os.listdir(km77_gold_dir)

In [4]:
# Brand processed by this notebook run. It is used as the folder name under the
# km77 data/gold directories and, lower-cased, in the relationships file name.
brand = "Audi"

### 1. KM 77

In [None]:
# Locate the final relationships workbook for the brand in the km77
# relationships directory.
brand_relationships_file = os.path.join(km77_relationships_dir, f'{brand.lower()}_cars_finalizado.xlsx')

# Read every cell as text (dtype=str) instead of converting after type
# inference: a column holding only numbers plus a blank cell would otherwise be
# inferred as float and stringify as e.g. '100.0', which breaks name matching.
# The trailing astype(str) is kept so the frame is all-string exactly as before.
brand_relationships_df = pd.read_excel(brand_relationships_file, dtype=str).astype(str)
brand_relationships_df

In [8]:
# Make sure the brand folder exists in the km77 gold directory.
# exist_ok=True keeps this cell re-runnable: the previous membership test used
# km77_gold_files, a listing captured when the setup cell ran, so running this
# cell a second time (after the folder had been created) raised FileExistsError.
os.makedirs(os.path.join(km77_gold_dir, brand), exist_ok=True)

# List the brand's input files under the km77 transformed-data directory.
brand_dir = os.path.join(km77_data_dir, brand)
brand_files = os.listdir(brand_dir)

In [17]:
# For every km77 file of the brand: cast "year" to int, attach the "cleaned_name"
# that brand_relationships_df pairs with the file's "name" (matched against the
# "km77_name" column), and write the result to the brand folder in the km77 gold
# directory.
# NOTE: brand_relationships_df must hold the km77 table here; the MOTORLIST
# section rebinds the same variable to a table that has no "km77_name" column.

# Build the km77_name -> cleaned_name lookup once instead of filtering the frame
# for every file. keep='first' mirrors the earlier `.values[0]` choice when a
# km77_name appears more than once.
km77_to_cleaned = (
    brand_relationships_df
    .drop_duplicates(subset='km77_name', keep='first')
    .set_index('km77_name')['cleaned_name']
    .to_dict()
)

# Create the output folder here as well so this cell does not depend on the
# folder-creation cell having been run first.
brand_gold_dir = os.path.join(km77_gold_dir, brand)
os.makedirs(brand_gold_dir, exist_ok=True)

for file in brand_files:
    file_path = os.path.join(brand_dir, file)
    # Read and close the input before opening the output; the handles are no
    # longer nested and no longer share one variable name.
    with open(file_path, 'r', encoding='utf-8') as source:
        data = json.load(source)

    data['year'] = int(data['year'])

    # Fail with the offending file and name instead of a bare IndexError.
    if data['name'] not in km77_to_cleaned:
        raise KeyError(
            f"No 'km77_name' row matches name {data['name']!r} (file {file!r}) "
            f"in the relationships table"
        )
    data['cleaned_name'] = km77_to_cleaned[data['name']]

    with open(os.path.join(brand_gold_dir, file), 'w', encoding='utf-8') as target:
        json.dump(data, target, indent=4, ensure_ascii=False)

### 2. MOTORLIST

In [35]:
# Locate the final relationships workbook for the brand in the motorlist
# relationships directory.
brand_relationships_file = os.path.join(motorlist_relationships_dir, f'{brand.lower()}_cars_finalizado.xlsx')

# Read every cell as text (dtype=str) instead of converting after type
# inference: purely numeric names in a column that also has a blank cell would
# otherwise be inferred as float and stringify as e.g. '100.0'.
# The trailing astype(str) is kept so the frame is all-string exactly as before.
# NOTE: this rebinds brand_relationships_df, replacing the km77 table loaded in
# the KM 77 section.
brand_relationships_df = pd.read_excel(brand_relationships_file, dtype=str).astype(str)
brand_relationships_df

Unnamed: 0,cleaned_name,motorlist_name,brand
0,100,Audi 100,Audi
1,100,Audi 100 C3,Audi
2,100,Audi 100 C4,Audi
3,80,Audi 80,Audi
4,80,Audi 80 B2,Audi
5,80,Audi 80 B3,Audi
6,80,Audi 80 B4,Audi
7,90,Audi 90 B3,Audi
8,A1,Audi A1,Audi
9,A2,Audi A2,Audi


In [41]:
# For every file in the motorlist silver directory, read its "car_info" value: a
# list of lists whose first element is the car name. Each name found in the
# "motorlist_name" column of brand_relationships_df contributes that row's
# "cleaned_name" and "brand". Files with at least one match get two parallel
# lists, "cleaned_name" and "car_brands", and are written to the motorlist gold
# directory; files without any match are not written.
# NOTE(review): output goes to the root of motorlist_gold_dir, not to a per-brand
# folder, so running this cell for another brand rewrites a shared file from its
# silver version - confirm that this is intended.

# One row per motorlist_name (first occurrence wins, as `.values[0]` did before),
# converted to plain dicts so the loop does no DataFrame filtering.
relationships_by_name = (
    brand_relationships_df
    .drop_duplicates(subset='motorlist_name', keep='first')
    .set_index('motorlist_name')
)
name_to_cleaned = relationships_by_name['cleaned_name'].to_dict()
name_to_brand = relationships_by_name['brand'].to_dict()

for file in silver_files:
    file_path = os.path.join(silver_dir, file)
    # Read and close the input before (possibly) opening the output file.
    with open(file_path, 'r', encoding='utf-8') as source:
        data = json.load(source)

    # Take the first element of each entry directly rather than slicing a numpy
    # array: entries of different lengths no longer break the column slice. A
    # missing or empty "car_info" simply yields no names. str() keeps the
    # comparison textual, as the relationships table holds strings only.
    car_names = [str(entry[0]) for entry in (data.get('car_info') or []) if entry]
    matched_names = [name for name in car_names if name in name_to_cleaned]

    # Nothing from this brand's table in the file: leave it out of gold.
    if not matched_names:
        continue

    data['cleaned_name'] = [name_to_cleaned[name] for name in matched_names]
    data['car_brands'] = [name_to_brand[name] for name in matched_names]
    with open(os.path.join(motorlist_gold_dir, file), 'w', encoding='utf-8') as target:
        json.dump(data, target, indent=4, ensure_ascii=False)

