In [58]:
# Import library
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
import tensorflow as tf
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import plotly.express as px
from sklearn.ensemble import RandomForestRegressor
import joblib
import pickle

In [59]:
# Load data
data = pd.read_csv('/content/latpopp.csv')
print(data)

      brand                                               name  \
0        HP                   Victus 15-fb0157AX Gaming Laptop   
1        HP                                15s-fq5007TU Laptop   
2      Acer                               One 14 Z8-415 Laptop   
3    Lenovo               Yoga Slim 6 14IAP8 82WU0095IN Laptop   
4     Apple                    MacBook Air 2020 MGND3HN Laptop   
..      ...                                                ...   
886    Asus                     TUF A15 FA577RM-HQ032WS Laptop   
887    Asus  ROG Zephyrus G14 2023 GA402XV-N2034WS Gaming L...   
888    Asus  TUF Gaming F15 2023 FX507VU-LP083WS Gaming Laptop   
889    Asus  TUF Gaming A15 2023 FA577XU-LP041WS Gaming Laptop   
890     NaN                                                NaN   

                         processor   Ram Ram_type  Storage Storage_type  \
0        5th Gen AMD Ryzen 5 5600H   8.0     DDR4    512.0          SSD   
1     12th Gen Intel Core i3 1215U   8.0     DDR4    512.

In [60]:
# Clean outliers
def clean_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return data[(data[column] >= lower_bound) & (data[column] <= upper_bound)]

# Apply the function to the 'Price' column
data = clean_outliers_iqr(data, 'Price')

In [61]:
# Preprocess data
import json
import joblib
import pickle

def preprocess_data(data):
  numeric_features = ['Ram', 'Storage', 'display_size', 'resolution_width', 'resolution_height']
  categorical_features = ['processor','brand', 'Storage_type', 'GPU', 'OS']


  # Numeric Features Preprocess
  numeric_preprocessor = StandardScaler()
  X_numeric = numeric_preprocessor.fit_transform(data[numeric_features])
  X_numeric = pd.DataFrame(X_numeric, columns=numeric_features)



  # Categorical Features Preprocess
  categorical_preprocessors = {}
  X_categorical_encoded = {}

  # Process each categorical feature
  for feature in categorical_features:
      # TextVectorization layer
      vectorizer = tf.keras.layers.TextVectorization(
          output_mode='multi_hot',
          max_tokens=None,  # Adjust if needed based on your data
          standardize='lower_and_strip_punctuation'
      )

      vectorizer.adapt(data[feature].astype(str))

      categorical_preprocessors[feature] = vectorizer

      encoded = vectorizer(data[feature].astype(str))
      X_categorical_encoded[feature] = encoded.numpy()
      print(f"{feature} vocabulary size: {len(vectorizer.get_vocabulary())}")


  # Combine all features
  X_processed = X_numeric.copy()

  for feature, encoded_values in X_categorical_encoded.items():
      # Get vocabulary size for this feature
      vocab_size = len(categorical_preprocessors[feature].get_vocabulary())

      encoded_columns = [f"{feature}_{i}" for i in range(vocab_size)]


      # Convert to DataFrame and concatenate
      encoded_df = pd.DataFrame(encoded_values, columns=encoded_columns)
      X_processed = pd.concat([X_processed, encoded_df], axis=1)


  # Create train-test split
  y = data['Price']
  X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)


  return X_train, X_test, y_train, y_test

In [62]:
X_train, X_test, y_train, y_test = preprocess_data(data)

X_train

processor vocabulary size: 138
brand vocabulary size: 30
Storage_type vocabulary size: 3
GPU vocabulary size: 66
OS vocabulary size: 12


Unnamed: 0,Ram,Storage,display_size,resolution_width,resolution_height,processor_0,processor_1,processor_2,processor_3,processor_4,...,OS_2,OS_3,OS_4,OS_5,OS_6,OS_7,OS_8,OS_9,OS_10,OS_11
774,-0.909064,-0.279544,0.518533,-0.151373,-0.330371,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
526,0.797580,-0.279544,0.518533,-0.151373,-0.330371,0,1,0,0,0,...,1,1,0,0,0,0,0,0,0,0
655,0.797580,-0.279544,1.067015,-0.151373,-0.330371,0,1,1,1,0,...,1,1,0,0,0,0,0,0,0,0
581,-0.909064,-0.279544,0.518533,-1.772220,-1.478619,0,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0
380,0.797580,-0.279544,0.518533,-0.151373,-0.330371,0,1,1,1,1,...,1,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,-0.909064,-0.279544,0.518533,-0.151373,-0.330371,0,1,0,0,0,...,1,1,0,0,0,0,0,0,0,0
106,-0.909064,-0.279544,0.518533,-0.151373,-0.330371,0,1,1,1,1,...,1,1,0,0,0,0,0,0,0,0
270,0.797580,-0.279544,-1.236609,5.466003,4.527598,0,1,1,1,0,...,1,1,0,0,0,0,0,0,0,0
435,0.797580,-0.279544,0.518533,-0.151373,-0.330371,0,1,1,1,0,...,1,1,0,0,0,0,0,0,0,0


In [63]:
# Random Forest
model_rf = RandomForestRegressor()
model_rf.fit(X_train, y_train)
model_rf.score(X_test, y_test)
# y_pred_rf = model_rf.predict(X_test)
# mae_rf = mean_absolute_error(y_test, y_pred_rf)
# print("Random Forest Mean Absolute Error:", mae_rf)

0.7606257752216669

In [64]:
# Define quantiles
quantiles = [0.25, 0.5, 0.75]

In [65]:
from tensorflow.keras.regularizers import l2

qr = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, input_dim=X_train.shape[1], activation='relu', kernel_regularizer=l2(0.01)),
    tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=l2(0.01)),
    tf.keras.layers.Dense(16, activation='relu', kernel_regularizer=l2(0.01)),
    tf.keras.layers.Dense(8, activation='relu', kernel_regularizer=l2(0.01)),
    tf.keras.layers.Dense(8, activation='relu', kernel_regularizer=l2(0.01)),
    tf.keras.layers.Dense(len(quantiles))

])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [66]:

# Custom quantile loss function
def quantile_loss(q, y_true, y_pred):
    # Cast y_true to float32 to match y_pred's type
    y_true = tf.cast(y_true, tf.float32)
    error = y_true - y_pred
    return tf.reduce_mean(tf.maximum(q * error, (q - 1) * error))
# Build a model for quantile regression
input_shape = X_train.shape[1]
# Custom training loop for multiple quantiles
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

@tf.function
def train_step(X, y):
    with tf.GradientTape() as tape:
        y_pred = qr(X, training=True)
        losses = [quantile_loss(q, y, y_pred[:, i]) for i, q in enumerate(quantiles)]
        total_loss = tf.reduce_sum(losses)
    gradients = tape.gradient(total_loss, qr.trainable_weights)
    optimizer.apply_gradients(zip(gradients, qr.trainable_weights))
    return total_loss

# Training loop
batch_size = 32
epochs = 100
dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(batch_size)

for epoch in range(epochs):
    for step, (X_batch, y_batch) in enumerate(dataset):
        loss = train_step(X_batch, y_batch)
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.numpy()}")

# Predict intervals
y_pred = qr.predict(X_test)
intervals = pd.DataFrame(y_pred, columns=[f"Quantile {q}" for q in quantiles])
print(intervals)
qr.save('mbahlaptop_quantile_V0.1.h5')

Epoch 0, Loss: 16314006.0
Epoch 10, Loss: 1981589.875
Epoch 20, Loss: 1540376.75
Epoch 30, Loss: 1149276.125
Epoch 40, Loss: 935464.625
Epoch 50, Loss: 831258.5
Epoch 60, Loss: 787798.75
Epoch 70, Loss: 730185.375
Epoch 80, Loss: 688855.8125
Epoch 90, Loss: 601105.125
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step




     Quantile 0.25  Quantile 0.5  Quantile 0.75
0        6445123.0     6739229.0      7069937.0
1        9698063.0    10140609.0     10638230.0
2       14263419.0    14914303.0     15646173.0
3        6691398.0     6996742.0      7340086.5
4       13439815.0    14053108.0     14742724.0
..             ...           ...            ...
159     10822537.0    11316413.0     11871721.0
160     11947070.0    12492249.0     13105266.0
161     16811412.0    17578578.0     18441180.0
162     16425737.0    17175290.0     18018112.0
163     16338263.0    17083832.0     17922162.0

[164 rows x 3 columns]


In [67]:
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

with open('mbahlaptop_quantile_V0.1.tflite', 'wb') as f:
  f.write(tflite_model)

Saved artifact at '/tmp/tmpril0347n'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 254), dtype=tf.float32, name=None)
Output Type:
  List[TensorSpec(shape=(None, 1), dtype=tf.float32, name=None), TensorSpec(shape=(None, 1), dtype=tf.float32, name=None), TensorSpec(shape=(None, 1), dtype=tf.float32, name=None)]
Captures:
  139305058165408: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139305057512336: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139305057515328: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139305057506176: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139305057516032: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139305057504064: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139305057510576: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139305057504592: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139305057501776: TensorSpec(shape=(), dtype=tf.res