In [27]:
import pandas as pd
import numpy as np


file = 'cleaned_v2.csv'

df = pd.read_csv(file)

# Step 1: Extract individual options from the 'options' column
def extract_options(option_string, option_type):
    if isinstance(option_string, str):  # Check if the option_string is a valid string
        start = option_string.find(option_type)
        if start != -1:
            start_index = start + len(option_type) + 2  # Adjust to start after ": " 
            end_index = option_string.find("\n", start_index)  # Find the end of this section
            if end_index == -1:
                end_index = len(option_string)  # Handle the case if it's the last section
            options = option_string[start_index:end_index].strip()
            return options
    return None

# Apply the function to extract options and create new columns
df['audio_technology'] = df['options'].apply(lambda x: extract_options(x, 'audio si tehnologie'))
df['comfort_equipment'] = df['options'].apply(lambda x: extract_options(x, 'confort si echipamente optionale'))
df['electronics_assistance'] = df['options'].apply(lambda x: extract_options(x, 'electronice si sisteme de asistenta'))
df['performance'] = df['options'].apply(lambda x: extract_options(x, 'performanta'))
df['safety'] = df['options'].apply(lambda x: extract_options(x, 'siguranta'))

# Step 2: Drop the original 'options' column
df = df.drop(columns=['options'])

# Step 3: Handle missing values in the new option columns by filling with 'None' (or any other placeholder)
df['audio_technology'] = df['audio_technology'].fillna('None')
df['comfort_equipment'] = df['comfort_equipment'].fillna('None')
df['electronics_assistance'] = df['electronics_assistance'].fillna('None')
df['performance'] = df['performance'].fillna('None')
df['safety'] = df['safety'].fillna('None')


df.head()

Unnamed: 0,manufacturer,model,year,km,fuel,power,engine_capacity,transmission,automatic,chassis,country of origin,description,price,currency,id,audio_technology,comfort_equipment,electronics_assistance,performance,safety
0,renault,talisman,2016.0,158271,diesel,160,1598,fata,True,sedan,franta,renault talisman 1.6 dci energy intens - 160c...,17490,eur,8,"apple carplay,android auto,bluetooth,radio,sis...","carlig remorcare,climatronic,jaluzele manuale ...","pilot automat,senzori parcare fata,senzori par...","jante aliaj 19,anvelope vara","abs,esp,franare asistata,airbag sofer,airbag s..."
1,mercedes-benz,e,2006.0,73867,diesel,190,2987,spate,True,sedan,italia,- mercedes-benz e-class e280 cdi - dric/limuzi...,23969,eur,10,"bluetooth,radio,sistem hands-free","climatronic,tapiterie stofa,scaun sofer ajusta...","pilot automat,oglinzi exterioare incalzite,lim...","jante aliaj 16,suspensie confort","abs,esp,franare asistata,airbag sofer,airbag s..."
2,skoda,octavia,2019.0,178856,diesel,183,1968,4x4 (automat),True,combi,romania,garantie 24 luni in limita a 30.000 km\n\nposi...,16789,eur,11,"apple carplay,android auto,bluetooth,radio,sis...","climatronic 2 zone,tapiterie piele,incalzire s...","pilot automat,senzori parcare fata,senzori par...",jante aliaj 17,"abs,esp,franare asistata,airbag sofer,airbag s..."
3,renault,clio,2016.0,228000,diesel,90,1461,fata,False,combi,,"renault clio iv limited\nmotor: 1.5 dci , 90 c...",7890,eur,12,,"volan reglabil electric,geamuri electrice fata","pilot automat adaptiv (distronic),faruri led,o...",jante otel,"abs,esp,airbag sofer,airbag scaun pasager,airb..."
4,ford,kuga,2012.0,216000,diesel,163,1997,4x4 (automat),True,suv,olanda,ford kuga \nfab- 2012\nmotor diesel - 163cp\na...,8700,eur,13,"bluetooth,radio,port usb,sistem navigatie,sist...","carlig remorcare,climatronic,plafon panoramic,...","pilot automat,faruri xenon,senzori parcare fat...",,"abs,esp,ebd,franare asistata,sistem avertizare..."


In [28]:
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [29]:
class Net(nn.Module):
    def __init__(self, input_size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)  # Use dynamic input size
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x
    

# Drop country of origin, description and currency columns
df = df.drop(columns=['description'])
df = df.drop(columns=['country of origin'])
df = df.drop(columns=['currency'])



In [49]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Separate categorical and numerical columns
categorical_columns = ['fuel', 'transmission', 'chassis', 'manufacturer', 'model', 
                       'audio_technology', 'comfort_equipment', 'electronics_assistance', 
                       'performance', 'safety', 'automatic']
numerical_columns = ['power', 'engine_capacity', 'km', 'year']

# Encode categorical variables using LabelEncoder
label_encoder = LabelEncoder()
for column in categorical_columns:
    df[column] = label_encoder.fit_transform(df[column])

# Normalize numerical columns using StandardScaler
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# Separate features (X) and target (y)
X = df.drop(columns=['price', 'id'])  # Features


y = df['price']  # Target

# Split into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)  # 70% train, 30% temp
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)  # 15% val, 15% test

# Scale features using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Convert data to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
X_val = torch.tensor(X_val, dtype=torch.float32)
y_val = torch.tensor(y_val.values, dtype=torch.float32).view(-1, 1)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

# Define the MLP architecture
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(X_train.shape[1], 128)  # Adjust input features based on X_train.shape[1]
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)  # Output layer for regression
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

# Initialize the model, loss function, and optimizer
model = Net()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 900
batch_size = 128
train_losses = []
val_losses = []

for epoch in range(epochs):
    # Training
    model.train()
    permutation = torch.randperm(X_train.size(0))
    train_loss = 0.0
    
    for i in range(0, X_train.size(0), batch_size):
        indices = permutation[i:i + batch_size]
        batch_X, batch_y = X_train[indices], y_train[indices]
        
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
    
    train_losses.append(train_loss / (len(X_train) // batch_size))
    
    # Validation
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val)
        val_loss = criterion(val_outputs, y_val)
        val_losses.append(val_loss.item())
    
    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

# Evaluate on test data
model.eval()
with torch.no_grad():
    y_test_pred = model(X_test)
    test_loss = criterion(y_test_pred, y_test)
    y_test_pred = y_test_pred.numpy()
    y_test_actual = y_test.numpy()

# Calculate evaluation metrics
mse = mean_squared_error(y_test_actual, y_test_pred)
mae = mean_absolute_error(y_test_actual, y_test_pred)
r2 = r2_score(y_test_actual, y_test_pred)

print(f"Test MSE: {mse}")
print(f"Test MAE: {mae}")
print(f"Test R²: {r2}")


Epoch 1/900, Train Loss: 29678210528.0000, Val Loss: 295360000.0000
Epoch 2/900, Train Loss: 28949313472.0000, Val Loss: 279123968.0000
Epoch 3/900, Train Loss: 25705989056.0000, Val Loss: 228897712.0000
Epoch 4/900, Train Loss: 18909864896.0000, Val Loss: 149159552.0000
Epoch 5/900, Train Loss: 11191127528.0000, Val Loss: 83263600.0000
Epoch 6/900, Train Loss: 6635416648.0000, Val Loss: 54847708.0000
Epoch 7/900, Train Loss: 4901728108.0000, Val Loss: 43355980.0000
Epoch 8/900, Train Loss: 4098000764.0000, Val Loss: 36258472.0000
Epoch 9/900, Train Loss: 3550751794.0000, Val Loss: 31164268.0000
Epoch 10/900, Train Loss: 3140357326.0000, Val Loss: 27266684.0000
Epoch 11/900, Train Loss: 2807301260.0000, Val Loss: 24221652.0000
Epoch 12/900, Train Loss: 2580945621.0000, Val Loss: 21912060.0000
Epoch 13/900, Train Loss: 2371289257.0000, Val Loss: 20067770.0000
Epoch 14/900, Train Loss: 2208635297.0000, Val Loss: 18589126.0000
Epoch 15/900, Train Loss: 2046742486.0000, Val Loss: 17466932.