In [60]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import joblib
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim

In [101]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [61]:
data = pd.read_csv('dataset/data.csv')
#data.head(10)

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Brand          8128 non-null   object 
 1   Year           8128 non-null   int64  
 2   Fuel           8128 non-null   object 
 3   Transmission   8128 non-null   object 
 4   Engine         7907 non-null   object 
 5   Max_Power      7913 non-null   object 
 6   Torque         7906 non-null   object 
 7   Seats          7907 non-null   float64
 8   Mileage        7907 non-null   object 
 9   KM_Driven      8128 non-null   int64  
 10  Owner          8128 non-null   object 
 11  Seller_Type    8128 non-null   object 
 12  Selling_Price  8128 non-null   int64  
dtypes: float64(1), int64(3), object(9)
memory usage: 825.6+ KB


In [None]:
data.isna().sum()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

numeric_columns = ['KM_Driven', 'Seats', 'Selling_Price']

fig, axes = plt.subplots(1, 3, figsize=(15, 4)) 

axes = axes.flatten()

for i, column in enumerate(numeric_columns):
    sns.histplot(data[column], kde=True, bins=30, color='blue', ax=axes[i])
    axes[i].set_title(f'Distribution of {column}')
    axes[i].set_xlabel(column)
    axes[i].set_ylabel('Frequency')


for j in range(len(numeric_columns), len(axes)):
    axes[j].set_visible(False)


plt.tight_layout()
plt.show()


## processing and feature eng


In [62]:

data = data.dropna()
data.columns = data.columns.str.lower()
data = data.map(lambda x: x.lower() if isinstance(x, str) else x)

data["age"] = 2024 - data["year"]
data["producer"] = data["brand"].apply(lambda x : x.split()[0])
data["engine_cc"] = data["engine"].apply(lambda x : x.split()[0])
data['mileage_kmpl'] = data['mileage'].apply(lambda x : x.split()[0])
data['max_power_bhp'] = data['max_power'].apply(lambda x : x.split()[0])

def convert_seats_to_size(x):
    if x >= 0 and x <=3:
        return 'small'
    elif x >=4 and x <=6:
        return 'medium'
    elif x>=7 and x <=9:
        return 'large'
    else:
        return 'extra_large'

data['size'] = data['seats'].apply(lambda x : convert_seats_to_size(x))

In [63]:
# encoding_cat_data
trans_encoder = LabelEncoder()
data['transmission_encoded'] = trans_encoder.fit_transform(data["transmission"])
train_label_id = dict(zip(trans_encoder.classes_, range(len(trans_encoder.classes_))))

owner_encoder = LabelEncoder()
data['owner_encoded'] = owner_encoder.fit_transform(data["owner"])
owner_label_id = dict(zip(owner_encoder.classes_, range(len(owner_encoder.classes_))))

seller_type_encoder = LabelEncoder()
data["seller_type_encode"] = seller_type_encoder.fit_transform(data["seller_type"])
seller_type_id = dict(zip(seller_type_encoder.classes_, range(len(seller_type_encoder.classes_))))

fuel_endocer = LabelEncoder()
data["fuel_encoded"] = fuel_endocer.fit_transform(data["fuel"])
fuel_id = dict(zip(fuel_endocer.classes_, range(len(fuel_endocer.classes_))))

size_encoder = LabelEncoder()
data['size_encoded'] = size_encoder.fit_transform(data['size'])
size_id = dict(zip(size_encoder.classes_, range(len(size_encoder.classes_))))

producer_encoder = LabelEncoder()
data['producer_encoded'] = producer_encoder.fit_transform(data['producer'])
producer_id = dict(zip(producer_encoder.classes_, range(len(producer_encoder.classes_))))

"""
def encode(encoder, col):
    encoded_col = encoder.fit_transform(col)
    id = dict(zip(encoder.classes_, range(len(encoder.classes_))))

    return encoded_col, id

trans_encoder = LabelEncoder()
trans_encoded, tran_id = encode(trans_encoder, data['transmission'])
data['tras_en_test'] = trans_encoded
"""


"\ndef encode(encoder, col):\n    encoded_col = encoder.fit_transform(col)\n    id = dict(zip(encoder.classes_, range(len(encoder.classes_))))\n\n    return encoded_col, id\n\ntrans_encoder = LabelEncoder()\ntrans_encoded, tran_id = encode(trans_encoder, data['transmission'])\ndata['tras_en_test'] = trans_encoded\n"

In [64]:
col = ['brand', 'year', 'km_driven', 
       'selling_price', 'age', 'engine_cc', 'mileage_kmpl',
       'max_power_bhp', 'transmission_encoded', 'owner_encoded',
       'seller_type_encode', 'fuel_encoded', 'size_encoded',
       'producer_encoded']

data = data[col]

y = data['selling_price']
X = data.drop('selling_price', axis=1)

In [65]:
X_no_brand = X.drop('brand', axis=1)

## tf-idf for brand name

In [6]:
# Apply TF-IDF on 'brand' column
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(X['brand'])

# Convert the TF-IDF matrix to a dense array
tfidf_dense = tfidf_matrix.toarray()

# Specify the numeric columns to scale
numeric_columns = ['year', 'km_driven', 'age', 'engine_cc', 'mileage_kmpl',
                   'max_power_bhp', 'transmission_encoded', 'owner_encoded',
                   'seller_type_encode', 'fuel_encoded', 'size_encoded',
                   'producer_encoded']

# Extract the numeric features
numeric_features = X[numeric_columns]

# Apply StandardScaler to the numeric features
std_scaler = StandardScaler()
scaled_numeric_features = std_scaler.fit_transform(numeric_features)

# Convert the TF-IDF dense matrix to a DataFrame
tfidf_df = pd.DataFrame(tfidf_dense, columns=vectorizer.get_feature_names_out())

# Combine the scaled numeric features with the TF-IDF DataFrame
X_features = pd.concat([
    pd.DataFrame(scaled_numeric_features, columns=numeric_columns),
    tfidf_df
], axis=1)

In [7]:
# saving sclaer and vectorizor 
joblib.dump(std_scaler, 'scalers/std_scaler.pkl')
joblib.dump(vectorizer, 'scalers/tfidf_vertorizor.pkl')


['scalers/tfidf_vertorizor.pkl']

In [71]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

class DataPreparation:
    def __init__(self, data, target, train_size, test_size, val_size):
        self.target = target
        self.data = data
        self.train_size = train_size
        self.test_size = test_size
        self.val_size = val_size
        #self.scaler =  MinMaxScaler()
        self.scaler =  StandardScaler()
    
    def split_data(self):
        train_end = int(self.data.shape[0] * self.train_size)
        test_end = train_end + int(self.data.shape[0] * self.test_size)

        X_train, y_train = self.data[:train_end], self.target[:train_end]
        X_test, y_test = self.data[train_end:test_end], self.target[train_end:test_end]
        X_val, y_val = self.data[test_end:], self.target[test_end:]

        return X_train, X_test, X_val, y_train, y_test, y_val
    
    def prepare(self):
        X_train, X_test, X_val, y_train, y_test, y_val = self.split_data()
        X_train = self.scaler.fit_transform(X_train)
        X_test = self.scaler.transform(X_test)
        X_val = self.scaler.transform(X_val)

        return X_train, X_test, X_val, y_train, y_test, y_val




In [75]:
# split in train test val
prepare = DataPreparation(X_no_brand,y,.7,.10,.20)
X_train, X_test, X_val, y_train, y_test, y_val = prepare.prepare()

y_train = np.log1p(y_train)  # log1p applies log(1 + y), which handles 0 values safely
y_test = np.log1p(y_test)

#X_train, X_test, y_train, y_test = train_test_split(X_no_brand, y, test_size=.2, random_state=42)

no_brand_scaler = MinMaxScaler()
X_train = no_brand_scaler.fit_transform(X_train)
X_test = no_brand_scaler.transform(X_test)
X_val = no_brand_scaler.transform(X_val)

X_train_tensors = torch.tensor(X_train,dtype=torch.float32)
X_test_tensors = torch.tensor(X_test, dtype=torch.float32)
y_train_tensors = torch.tensor(y_train.to_numpy(), dtype=torch.float32).reshape(-1,1)
y_test_tensors = torch.tensor(y_test.to_numpy(), dtype=torch.float32).reshape(-1,1)

X_val_tensors = torch.tensor(X_val,dtype=torch.float32)
y_val_tensors = torch.tensor(y_val.to_numpy(), dtype=torch.float32).reshape(-1,1)


In [76]:
class MakeDataset(Dataset):
    def __init__(self, features, targets) -> None:
        self.features = features
        self.targets = targets
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, index):
        return self.features[index], self.targets[index]

train_tensors = MakeDataset(X_train_tensors, y_train_tensors)
test_tensors = MakeDataset(X_test_tensors, y_test_tensors)

train_loader = DataLoader(train_tensors, batch_size=64)
test_loader = DataLoader(test_tensors, batch_size=64)

In [77]:
# define netowork

class Model(nn.Module):
    def __init__(self, input_dim):
        super(Model, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            #nn.BatchNorm1d(512),
            nn.Linear(512, 256),
            nn.ReLU(),
            #nn.BatchNorm1d(256),
            nn.Linear(256, 128),
            nn.ReLU(),
            #nn.BatchNorm1d(128),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
    
    def forward(self, X):
        return self.network(X)


In [78]:
input_dim = X_test.shape[1]
model = Model(input_dim)

loss_func = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001)

In [79]:
epochs = 100

for epoch in range(epochs):
    model.train()
    train_loss = 0.0

    for feature, target in train_loader:
        optimizer.zero_grad()
        output = model(feature)
        loss = loss_func(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    train_loss /= len(train_loader)

    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for feature, target in test_loader:
            val_output = model(feature)
            v_loss = loss_func(val_output, target)
            val_loss += v_loss.item()
    val_loss /= len(test_loader)

    print(f"Epoch {epoch+1}/{epochs}, Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

Epoch 1/100, Training Loss: 25.7319, Validation Loss: 0.2715
Epoch 2/100, Training Loss: 0.1761, Validation Loss: 0.1434
Epoch 3/100, Training Loss: 0.1259, Validation Loss: 0.1191
Epoch 4/100, Training Loss: 0.1108, Validation Loss: 0.1057
Epoch 5/100, Training Loss: 0.1029, Validation Loss: 0.0973
Epoch 6/100, Training Loss: 0.0985, Validation Loss: 0.0919
Epoch 7/100, Training Loss: 0.0957, Validation Loss: 0.0907
Epoch 8/100, Training Loss: 0.0937, Validation Loss: 0.0909
Epoch 9/100, Training Loss: 0.0911, Validation Loss: 0.0849
Epoch 10/100, Training Loss: 0.0887, Validation Loss: 0.0790
Epoch 11/100, Training Loss: 0.0869, Validation Loss: 0.0783
Epoch 12/100, Training Loss: 0.0855, Validation Loss: 0.0865
Epoch 13/100, Training Loss: 0.0841, Validation Loss: 0.0990
Epoch 14/100, Training Loss: 0.0832, Validation Loss: 0.1057
Epoch 15/100, Training Loss: 0.0836, Validation Loss: 0.1030
Epoch 16/100, Training Loss: 0.0843, Validation Loss: 0.0932
Epoch 17/100, Training Loss: 0.0

In [82]:
model.eval()

with torch.no_grad():
    y_pred = model(X_val_tensors)


y_pred = np.expm1(y_pred.numpy()) 

In [97]:
y_pred

array([[ 380252.62],
       [1025058.75],
       [ 419833.4 ],
       ...,
       [ 274217.6 ],
       [ 377513.38],
       [ 377513.38]], dtype=float32)

In [99]:
predict_df = pd.DataFrame({
    'y_true' : y_val,
    'y_pred' : y_pred.squeeze()
})

predict_df.head(10)


predict_df['off'] = predict_df['y_pred'] - predict_df['y_true']

In [102]:
predict_df.head(300)

Unnamed: 0,y_true,y_pred,off
6504,280000,380252.6,100252.6
6505,800000,1025059.0,225058.8
6506,220000,419833.4,199833.4
6507,620000,689644.2,69644.19
6508,300000,570521.9,270521.9
6509,180000,167656.2,-12343.84
6510,250000,300330.0,50329.97
6511,480000,656291.2,176291.2
6512,885000,1194994.0,309993.8
6513,550000,984550.1,434550.1
