First of all, let's download the file. Thankfully wheelsaroundme allows public access to his Google Sheet spreadsheet, so we don't need to create and API key yada-yada-yada. Then we clean the data up: (check wheelsaroundme.ipynb for more details)

1. Some users put the "CC mesin" (engine cc) column in liters instaed
2. Gearbox values are wild
3. One user put "A" in the "Tahun" (model year) column 
4. "Konsumsi dalam kota (KM/L)" (city fuel consumption in km/L) and "Konsumsi luar kota (KM/L)" (highway fuel consumption in km/L) values are also wild

In [67]:
from pathlib import Path

import pandas as pd

if Path("wam.csv").exists():
    df = pd.read_csv("wam.csv")
else:
    url = "https://docs.google.com/spreadsheets/d/1ttSU56TcDoZBzcCAIp7VPEjHDIcIuyi8wtb5FO3V2bA/export?format=csv"
    df = pd.read_csv(url)
    df.to_csv('wam.csv')

# cc
df.loc[df['CC mesin'] < 100, 'CC mesin'] *= 1000
# gearbox
df["gearbox_normalized"] = "AT"
df.loc[df["GEARBOX"].isin(["4MT", "5MT", "MT"]), "gearbox_normalized"] = "MT"
df.loc[df["GEARBOX"].isin(["CVT"]), "gearbox_normalized"] = "CVT"
# Tahun (model year)
df.loc[df["Tahun"].isin(["A"]), "Tahun"] = 1992
df["Tahun"] = df["Tahun"].astype("int")
# Fuel cons
df["city"] = pd.to_numeric(df["Konsumsi dalam kota (KM/L)"], errors='coerce')
df["hwy"] = pd.to_numeric(df["Konsumsi luar kota (KM/L)"], errors='coerce')
df = df.dropna(subset=["city"])
df = df.dropna(subset=["hwy"])
df = df.query("hwy < 100 & city < 100")
# brand
df['brand'] = df['Jenis Kendaraan'].str.split().str[0].str.lower()
df.loc[df["brand"].isin(["grandmax"]), "brand"] = "daihatsu"
df.loc[df["brand"].isin(["mercedes"]), "brand"] = "mercedes-benz"
df.loc[df["brand"].isin(["opel/chevrolet"]), "brand"] = "chevrolet"
df.loc[df["brand"].isin(["karimun"]), "brand"] = "suzuki"
df.loc[df["brand"].isin(["totota"]), "brand"] = "toyota"
df.loc[df["brand"].isin(["volkswagen"]), "brand"] = "vw"
# finally
df = df.loc[:, ['Tahun', "brand", "CC mesin", "gearbox_normalized", "city", "hwy"]]
df

Unnamed: 0,Tahun,brand,CC mesin,gearbox_normalized,city,hwy
0,2011,audi,1800.0,CVT,8.0,12.0
4,2015,bmw,1500.0,AT,12.0,16.0
5,2017,bmw,1500.0,AT,12.5,19.0
6,1997,bmw,1800.0,MT,7.0,11.5
7,1990,bmw,1800.0,MT,6.0,10.0
...,...,...,...,...,...,...
1118,2012,mini,1600.0,AT,9.0,14.0
1119,2001,toyota,3000.0,AT,6.1,7.0
1121,1980,mitsubishi,1300.0,AT,6.0,9.0
1129,1991,bmw,2500.0,AT,5.0,7.0


In [68]:
import numpy as np

all_brands_list = df["brand"].unique()
all_gearboxes_list = df["gearbox_normalized"].unique()
print(all_brands_list)
print(all_gearboxes_list)

df['brand_idx'] = df['brand'].apply(lambda x: int(np.where(all_brands_list == x)[0]))
df['gearbox_idx'] = df['gearbox_normalized'].apply(lambda x: int(np.where(all_gearboxes_list == x)[0]))

cc_max = df['CC mesin'].max()
cc_min = df['CC mesin'].min()
print(cc_min, cc_max)

year_max = df['Tahun'].max()
year_min = df['Tahun'].min()
print(year_min, year_max)

fuelcons_max = max(df["city"].max(), df["hwy"].max())
fuelcons_min = min(df["city"].min(), df["hwy"].min())
print(fuelcons_min, fuelcons_max)

df['cc_normalized'] = (df['CC mesin'] - cc_min) / (cc_max - cc_min)
df['year_normalized'] = (df['Tahun'] - year_min) / (year_max - year_min)
df['city_normalized'] = (df['city'] - fuelcons_min) / (fuelcons_max - fuelcons_min)
df['hwy_normalized'] = (df['hwy'] - fuelcons_min) / (fuelcons_max - fuelcons_min)
df

['audi' 'bmw' 'cadillac' 'chevrolet' 'daihatsu' 'datsun' 'ford' 'honda'
 'hyundai' 'jaguar' 'jeep' 'kia' 'land' 'lexus' 'mazda' 'mercedes-benz'
 'mg' 'mini' 'mitsubishi' 'morris' 'nissan' 'peugeot' 'proton' 'range'
 'renault' 'subaru' 'suzuki' 'timor' 'toyota' 'volvo' 'vw' 'wuling'
 'daewoo']
['CVT' 'AT' 'MT']
849.0 7800.0
1961 2022
2.0 25.0


Unnamed: 0,Tahun,brand,CC mesin,gearbox_normalized,city,hwy,brand_idx,gearbox_idx,cc_normalized,year_normalized,city_normalized,hwy_normalized
0,2011,audi,1800.0,CVT,8.0,12.0,0,0,0.136815,0.819672,0.260870,0.434783
4,2015,bmw,1500.0,AT,12.0,16.0,1,1,0.093656,0.885246,0.434783,0.608696
5,2017,bmw,1500.0,AT,12.5,19.0,1,1,0.093656,0.918033,0.456522,0.739130
6,1997,bmw,1800.0,MT,7.0,11.5,1,2,0.136815,0.590164,0.217391,0.413043
7,1990,bmw,1800.0,MT,6.0,10.0,1,2,0.136815,0.475410,0.173913,0.347826
...,...,...,...,...,...,...,...,...,...,...,...,...
1118,2012,mini,1600.0,AT,9.0,14.0,17,1,0.108042,0.836066,0.304348,0.521739
1119,2001,toyota,3000.0,AT,6.1,7.0,28,1,0.309452,0.655738,0.178261,0.217391
1121,1980,mitsubishi,1300.0,AT,6.0,9.0,18,1,0.064883,0.311475,0.173913,0.304348
1129,1991,bmw,2500.0,AT,5.0,7.0,1,1,0.237520,0.491803,0.130435,0.217391


In [69]:
import torch
from torch import nn

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

X_columns = ['brand_idx', 'gearbox_idx'] + ['year_normalized', 'cc_normalized']
X = torch.tensor(df[X_columns].values)
y_city = torch.tensor(df["city_normalized"].values)
y_city = y_city.reshape(-1, 1)
y_hwy = torch.tensor(df["hwy_normalized"].values)
y_hwy = y_hwy.reshape(-1, 1)
X

Using cuda device


tensor([[ 0.0000,  0.0000,  0.8197,  0.1368],
        [ 1.0000,  1.0000,  0.8852,  0.0937],
        [ 1.0000,  1.0000,  0.9180,  0.0937],
        ...,
        [18.0000,  1.0000,  0.3115,  0.0649],
        [ 1.0000,  1.0000,  0.4918,  0.2375],
        [28.0000,  1.0000,  0.4426,  0.0937]], dtype=torch.float64)

Now let's try GAN. We first generate the Generator and Discriminator models

For the embedding vector, we use a common embedding vectors for both the generator and discriminator, because we simply don't have sufficient data to train such embedding layers. 

(Also mainly because I couldn't figure out how to get the gradients to pass through an argmin/argmax. Pls send help lol)

In [70]:
latent_vector_dim = 1000
embedding_dim = 10
brand_embedding = nn.Embedding(len(all_brands_list), embedding_dim)
gearbox_embedding = nn.Embedding(len(all_gearboxes_list), embedding_dim)


class Generator(nn.Module):
    def __init__(self, brand_embedding, gearbox_embedding, model_output_dim=embedding_dim + 2):
        super(Generator, self).__init__()
        self.brand_embedding = brand_embedding
        self.gearbox_embedding = gearbox_embedding
        self.model = nn.Sequential(
            nn.Linear(latent_vector_dim + 2, model_output_dim // 4),
            nn.LeakyReLU(0.2, inplace=True),
            nn.BatchNorm1d(model_output_dim // 4),
            nn.Linear(model_output_dim // 4, model_output_dim // 2),
            nn.LeakyReLU(0.2, inplace=True),
            nn.BatchNorm1d(model_output_dim // 2),
            nn.Linear(model_output_dim // 2, model_output_dim),
            nn.Sigmoid(),
        )

    def reverse_embedding(self, output, emb_layer):  # https://discuss.pytorch.org/t/reverse-nn-embedding/142623/
        emb_weights = emb_layer.weight
        emb_size = output.size(0), output.size(1), -1, -1
        out_size = -1, -1, emb_weights.size(0), -1
        return torch.argmin(
            torch.abs(output.unsqueeze(2).expand(out_size) - emb_weights.unsqueeze(0).unsqueeze(0).expand(emb_size))
            .sum(dim=3), dim=2)

    def forward(self, x):
        return_model = self.model(x)
        brand_idx = self.reverse_embedding(return_model[:, 2:].unsqueeze(0), self.brand_embedding)
        gearbox_idx = self.reverse_embedding(return_model[:, 2:].unsqueeze(0), self.gearbox_embedding)
        return torch.cat([
            brand_idx.t(),
            gearbox_idx.t(),
            return_model[:, 0].unsqueeze(1),  # year
            return_model[:, 1].unsqueeze(1),  # cc
        ], dim=1)


class Discriminator(nn.Module):
    def __init__(self, brand_embedding, gearbox_embedding):
        super(Discriminator, self).__init__()
        self.brand_embedding = brand_embedding
        self.gearbox_embedding = gearbox_embedding
        self.input_tensor_dim = embedding_dim * 2 + 4  # 4 = year + cc + dalkot + lukot
        self.model = nn.Sequential(
            nn.Linear(self.input_tensor_dim, self.input_tensor_dim // 2),
            nn.ReLU(),
            nn.Linear(self.input_tensor_dim // 2, self.input_tensor_dim // 4),
            nn.ReLU(),
            nn.Linear(self.input_tensor_dim // 4, 1),
            nn.Sigmoid()
        )

    def combine_features(self, x):
        return torch.cat([
            x[:, 0].unsqueeze(1),  # dalkot
            x[:, 1].unsqueeze(1),  # lukot
            self.brand_embedding(x[:, 2].long()),
            self.gearbox_embedding(x[:, 3].long()),
            x[:, 4].unsqueeze(1),  # year
            x[:, 5].unsqueeze(1),  # cc
        ], dim=1)

    def forward(self, x):
        x = self.combine_features(x)
        return self.model(x)


generator = Generator(brand_embedding=brand_embedding, gearbox_embedding=gearbox_embedding)
discriminator = Discriminator(brand_embedding=brand_embedding, gearbox_embedding=gearbox_embedding)

print("THE GENERATOR:", generator)
print("THE DISCRIMINATOR:", discriminator)

THE GENERATOR: Generator(
  (brand_embedding): Embedding(33, 10)
  (gearbox_embedding): Embedding(3, 10)
  (model): Sequential(
    (0): Linear(in_features=1002, out_features=3, bias=True)
    (1): LeakyReLU(negative_slope=0.2, inplace=True)
    (2): BatchNorm1d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Linear(in_features=3, out_features=6, bias=True)
    (4): LeakyReLU(negative_slope=0.2, inplace=True)
    (5): BatchNorm1d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Linear(in_features=6, out_features=12, bias=True)
    (7): Sigmoid()
  )
)
THE DISCRIMINATOR: Discriminator(
  (brand_embedding): Embedding(33, 10)
  (gearbox_embedding): Embedding(3, 10)
  (model): Sequential(
    (0): Linear(in_features=24, out_features=12, bias=True)
    (1): ReLU()
    (2): Linear(in_features=12, out_features=6, bias=True)
    (3): ReLU()
    (4): Linear(in_features=6, out_features=1, bias=True)
    (5): Sigmoid()
  )
)


Let's generate a sample data, so we get some idea what the generator would come up with

In [71]:
city_labels = y_city.detach().clone().reshape(-1, 1).float()
city_labels = (city_labels - fuelcons_min) / (fuelcons_max - fuelcons_min)
hwy_labels = y_hwy.detach().clone().reshape(-1, 1).float()
hwy_labels = (hwy_labels - fuelcons_min) / (fuelcons_max - fuelcons_min)

true_labels = torch.cat((city_labels, hwy_labels), dim=1)
noise = torch.rand(size=(len(X), latent_vector_dim))

combined = torch.cat((true_labels, noise), dim=1)

generated_data = generator(combined)
print(generated_data)  # random, untrained

tensor([[21.0000,  0.0000,  0.8621,  0.4639],
        [21.0000,  0.0000,  0.4473,  0.5300],
        [21.0000,  0.0000,  0.5833,  0.5716],
        ...,
        [21.0000,  0.0000,  0.7893,  0.4524],
        [21.0000,  0.0000,  0.5267,  0.5767],
        [21.0000,  0.0000,  0.3292,  0.5320]], grad_fn=<CatBackward0>)


Let's train both the Generator and Discriminator together

In [72]:
lr = 0.001
batch_size = len(X)
true_data = X.detach().clone().float()
true_labels_data = torch.cat((true_labels, true_data), dim=1)

real_label = torch.full((batch_size, 1), 1.)  # Discriminator output 1 = real
fake_label = torch.full((batch_size, 1), 0.)  # Discriminator output 0 = fake


def train():
    generator_optimizer = torch.optim.Adam(generator.parameters(), lr=lr)
    discriminator_optimizer = torch.optim.Adam(discriminator.parameters(), lr=lr)
    loss = nn.BCELoss()

    for epoch in range(1000):
        generator.train()
        discriminator.train()
        generator_optimizer.zero_grad()
        discriminator_optimizer.zero_grad()

        noise = torch.rand(size=(batch_size, latent_vector_dim))
        generator_input = torch.cat((true_labels, noise), dim=1)
        generated_data = generator(generator_input)
        generated_data_with_labels = torch.cat((true_labels, generated_data), dim=1)

        generator_loss = loss(discriminator(generated_data_with_labels), real_label)
        generator_loss.backward()
        generator_optimizer.step()

        discriminated_real_data = discriminator(true_labels_data)
        discriminated_generated_data = discriminator(generated_data_with_labels.detach().clone())

        discriminated_real_data_loss = loss(discriminated_real_data, real_label)
        discriminated_generated_data_loss = loss(discriminated_generated_data, fake_label)
        total_loss = (discriminated_real_data_loss + discriminated_generated_data_loss)
        total_loss.backward()
        print(epoch, total_loss)
        discriminator_optimizer.step()
    return generator


generator = train()

brand generator weights tensor([ 0.0912, -1.0191,  0.0592,  0.5726,  1.6697,  0.1411, -0.0748, -0.7189,
         0.2919,  0.7478], grad_fn=<SelectBackward0>)
gearbox generator weights tensor([ 0.3213, -0.1936, -0.0199,  0.1620, -1.6089,  0.7501,  0.5640, -0.1063,
        -0.0472, -0.3342], grad_fn=<SelectBackward0>)
brand discriminator weights tensor([ 0.0912, -1.0191,  0.0592,  0.5726,  1.6697,  0.1411, -0.0748, -0.7189,
         0.2919,  0.7478], grad_fn=<SelectBackward0>)
gearbox discriminator weights tensor([ 0.3213, -0.1936, -0.0199,  0.1620, -1.6089,  0.7501,  0.5640, -0.1063,
        -0.0472, -0.3342], grad_fn=<SelectBackward0>)
0 tensor(1.4016, grad_fn=<AddBackward0>)
1 tensor(1.4008, grad_fn=<AddBackward0>)
2 tensor(1.3999, grad_fn=<AddBackward0>)
3 tensor(1.3987, grad_fn=<AddBackward0>)
4 tensor(1.3980, grad_fn=<AddBackward0>)
5 tensor(1.3962, grad_fn=<AddBackward0>)
6 tensor(1.3951, grad_fn=<AddBackward0>)
7 tensor(1.3936, grad_fn=<AddBackward0>)
8 tensor(1.3924, grad_fn=<Ad

Now let's generate 20 made-up cars that achieves 12 km/L city and 18 km/L highway

In [73]:
generated_cars = 20

want_city = 12
want_highway = 18
want_city_normalized = (want_city - fuelcons_min) / (fuelcons_max - fuelcons_min)
want_hwy_normalized = (want_highway - fuelcons_min) / (fuelcons_max - fuelcons_min)
want_city_normalized = torch.tensor([want_city_normalized] * generated_cars).reshape(-1, 1).float()
want_hwy_normalized = torch.tensor([want_hwy_normalized] * generated_cars).reshape(-1, 1).float()
noise = torch.rand(size=(generated_cars, latent_vector_dim))
combined = torch.cat((want_city_normalized, want_hwy_normalized, noise), dim=1)

generated_data = generator(combined)
for car in generated_data:
    brand, gearbox, year, cc = car
    brand = all_brands_list[int(brand)]
    gearbox = all_gearboxes_list[int(gearbox)]
    cc = int((cc * (cc_max - cc_min)) + cc_min)
    year = int((year * (year_max - year_min)) + year_min)
    print(year, brand, cc, gearbox, "achieves", want_city, "city and", want_highway, "highway")

2021 peugeot 916 CVT achieves 12 city and 18 highway
2021 peugeot 894 CVT achieves 12 city and 18 highway
2021 peugeot 896 CVT achieves 12 city and 18 highway
2021 peugeot 922 CVT achieves 12 city and 18 highway
2021 suzuki 910 CVT achieves 12 city and 18 highway
2021 suzuki 916 CVT achieves 12 city and 18 highway
2021 suzuki 909 CVT achieves 12 city and 18 highway
2021 peugeot 903 CVT achieves 12 city and 18 highway
2021 peugeot 901 CVT achieves 12 city and 18 highway
2021 peugeot 888 CVT achieves 12 city and 18 highway
2021 peugeot 916 CVT achieves 12 city and 18 highway
2021 suzuki 913 CVT achieves 12 city and 18 highway
2021 suzuki 909 CVT achieves 12 city and 18 highway
2021 peugeot 910 CVT achieves 12 city and 18 highway
2021 peugeot 888 CVT achieves 12 city and 18 highway
2021 peugeot 917 CVT achieves 12 city and 18 highway
2021 suzuki 907 CVT achieves 12 city and 18 highway
2021 peugeot 906 CVT achieves 12 city and 18 highway
2021 peugeot 895 CVT achieves 12 city and 18 highway