In [None]:
# Import library
import pickle

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import tensorflow as tf
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
# name_of_module.__version__

In [None]:
# Load data
# Read the CSV into `data` and echo the frame's plain-text repr.
data = pd.read_csv(filepath_or_buffer='/content/latpopp.csv')
print(data)

      brand                                               name  \
0        HP                   Victus 15-fb0157AX Gaming Laptop   
1        HP                                15s-fq5007TU Laptop   
2      Acer                               One 14 Z8-415 Laptop   
3    Lenovo               Yoga Slim 6 14IAP8 82WU0095IN Laptop   
4     Apple                    MacBook Air 2020 MGND3HN Laptop   
..      ...                                                ...   
886    Asus                     TUF A15 FA577RM-HQ032WS Laptop   
887    Asus  ROG Zephyrus G14 2023 GA402XV-N2034WS Gaming L...   
888    Asus  TUF Gaming F15 2023 FX507VU-LP083WS Gaming Laptop   
889    Asus  TUF Gaming A15 2023 FA577XU-LP041WS Gaming Laptop   
890     NaN                                                NaN   

                         processor   Ram Ram_type  Storage Storage_type  \
0        5th Gen AMD Ryzen 5 5600H   8.0     DDR4    512.0          SSD   
1     12th Gen Intel Core i3 1215U   8.0     DDR4    512.

In [None]:
# Clean outliers
def clean_outliers_iqr(data, column, factor=1.5):
    """Return the rows of ``data`` whose ``column`` value lies within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1 and Q3 are the 25th and 75th percentiles of
    ``column``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the numeric column used to compute the fences.
    factor : float, default 1.5
        Multiplier applied to the IQR; larger values keep more rows.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the retained rows, keeping their original
        index labels. Rows whose ``column`` value is NaN are dropped, because
        NaN never falls between the fences.
    """
    values = data[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    # `between` is inclusive on both ends by default, matching `>=` / `<=`.
    # The explicit copy lets callers add or modify columns on the result
    # without triggering pandas' SettingWithCopyWarning.
    return data[values.between(lower_bound, upper_bound)].copy()

# Keep only the rows whose 'Price' lies inside the IQR fences
data = clean_outliers_iqr(data=data, column='Price')

In [None]:
# def preprocess_data(data):
#     numeric_features = ['Ram', 'Storage', 'display_size', 'resolution_width', 'resolution_height']
#     categorical_features = ['processor', 'brand', 'Storage_type', 'GPU', 'OS']

#     # Scale numeric data
#     scaler = StandardScaler()
#     data[numeric_features] = scaler.fit_transform(data[numeric_features])

#     # TensorFlow text preprocessing for categorical features
#     def tf_categorical_vectorization(data, categorical_features):
#         layers = {}
#         for col in categorical_features:
#             vocab = data[col].unique()
#             layer = tf.keras.layers.StringLookup(vocabulary=vocab, mask_token=None)
#             layers[col] = layer
#         return layers

#     cat_layers = tf_categorical_vectorization(data, categorical_features)

#     for col, layer in cat_layers.items():
#         data[col] = layer(data[col])

#     # Splitting features and target
#     X = data[numeric_features + categorical_features]
#     y = data['Price']
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#     return X_train, X_test, y_train, y_test


In [None]:
# Preprocess data
import json
import joblib
import pickle

def preprocess_data(data):
  """Build the model feature matrix from ``data`` and split it into train/test sets.

  Numeric columns are standardized with a ``StandardScaler``; each categorical
  column is turned into multi-hot token indicators by its own
  ``TextVectorization`` layer. The encoded blocks are concatenated column-wise
  and split 80/20 with ``random_state=42``.

  Side effects (files are written to the current working directory):
    * ``numeric_preprocessor.pkl`` - the fitted scaler, via ``joblib.dump``.
    * ``<feature>_vocabulary.json`` - one token list per categorical feature.
  Each categorical feature's vocabulary size is also printed.

  Args:
    data: DataFrame holding the numeric and categorical feature columns listed
      below plus the ``'Price'`` target column.

  Returns:
    ``(X_train, X_test, y_train, y_test)``. The feature frames carry the same
    index labels as ``data`` so they stay label-aligned with the target Series.

  NOTE(review): the scaler and the vocabularies are fitted on every row before
  the split, so held-out rows influence the preprocessing statistics. Confirm
  whether that is acceptable for how the test score is reported.
  """
  numeric_features = ['Ram', 'Storage', 'display_size', 'resolution_width', 'resolution_height']
  categorical_features = ['processor','brand', 'Storage_type', 'GPU', 'OS']

  # Numeric Features Preprocess
  numeric_preprocessor = StandardScaler()
  # fit_transform returns a bare array; re-attach data's index so the features
  # share labels with y even when data's index is not a clean 0..n-1 range.
  X_numeric = pd.DataFrame(
      numeric_preprocessor.fit_transform(data[numeric_features]),
      columns=numeric_features,
      index=data.index,
  )

  joblib.dump(numeric_preprocessor, 'numeric_preprocessor.pkl')

  # Categorical Features Preprocess
  feature_blocks = [X_numeric]
  for feature in categorical_features:
      text_values = data[feature].astype(str)

      # TextVectorization layer: one indicator column per vocabulary token
      vectorizer = tf.keras.layers.TextVectorization(
          output_mode='multi_hot',
          max_tokens=None,  # Adjust if needed based on your data
          standardize='lower_and_strip_punctuation'
      )
      vectorizer.adapt(text_values)

      vocabulary = vectorizer.get_vocabulary()
      print(f"{feature} vocabulary size: {len(vocabulary)}")

      encoded_columns = [f"{feature}_{i}" for i in range(len(vocabulary))]
      feature_blocks.append(
          pd.DataFrame(
              vectorizer(text_values).numpy(),
              columns=encoded_columns,
              index=data.index,
          )
      )

      # Save vocabulary to JSON for Android compatibility
      with open(f'{feature}_vocabulary.json', 'w') as f:
          json.dump(vocabulary, f)

  # Combine all features in one concat: numeric block first, then each
  # categorical block in declaration order.
  X_processed = pd.concat(feature_blocks, axis=1)

  # Create train-test split
  y = data['Price']
  X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

  return X_train, X_test, y_train, y_test

In [None]:
# Run the preprocessing once, then display the training feature frame.
(X_train, X_test, y_train, y_test) = preprocess_data(data=data)

X_train

processor vocabulary size: 138
brand vocabulary size: 30
Storage_type vocabulary size: 3
GPU vocabulary size: 66
OS vocabulary size: 12


Unnamed: 0,Ram,Storage,display_size,resolution_width,resolution_height,processor_0,processor_1,processor_2,processor_3,processor_4,...,OS_2,OS_3,OS_4,OS_5,OS_6,OS_7,OS_8,OS_9,OS_10,OS_11
774,-0.909064,-0.279544,0.518533,-0.151373,-0.330371,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
526,0.797580,-0.279544,0.518533,-0.151373,-0.330371,0,1,0,0,0,...,1,1,0,0,0,0,0,0,0,0
655,0.797580,-0.279544,1.067015,-0.151373,-0.330371,0,1,1,1,0,...,1,1,0,0,0,0,0,0,0,0
581,-0.909064,-0.279544,0.518533,-1.772220,-1.478619,0,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0
380,0.797580,-0.279544,0.518533,-0.151373,-0.330371,0,1,1,1,1,...,1,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,-0.909064,-0.279544,0.518533,-0.151373,-0.330371,0,1,0,0,0,...,1,1,0,0,0,0,0,0,0,0
106,-0.909064,-0.279544,0.518533,-0.151373,-0.330371,0,1,1,1,1,...,1,1,0,0,0,0,0,0,0,0
270,0.797580,-0.279544,-1.236609,5.466003,4.527598,0,1,1,1,0,...,1,1,0,0,0,0,0,0,0,0
435,0.797580,-0.279544,0.518533,-0.151373,-0.330371,0,1,1,1,0,...,1,1,0,0,0,0,0,0,0,0


In [None]:
# Linear Regression
# Ridge (L2-penalised least squares) instead of plain OLS: the unregularised
# LinearRegression fit scored roughly -1.3e21 R^2 on the test split, presumably
# because the multi-hot token columns are collinear and the solution blows up.
model_lr = Ridge(alpha=1.0)
model_lr.fit(X_train, y_train)
model_lr.score(X_test, y_test)
# y_pred = model_lr.predict(X_test)
# mae = mean_absolute_error(y_test, y_pred)
# print("Linear Regression Mean Absolute Error:", mae)

-1.3294388925090822e+21

In [None]:
# Random Forest
# Seed the forest so the fitted model and its test R^2 are reproducible across
# kernel restarts; 42 is the same seed the train/test split uses.
model_rf = RandomForestRegressor(random_state=42)
model_rf.fit(X_train, y_train)
model_rf.score(X_test, y_test)
# y_pred_rf = model_rf.predict(X_test)
# mae_rf = mean_absolute_error(y_test, y_pred_rf)
# print("Random Forest Mean Absolute Error:", mae_rf)

0.7524784468840988

In [None]:
# # Temp Quantile Regression
# class QuantileRegression(tf.keras.Model):
#     def __init__(self, num_quantiles):
#         super().__init__()
#         self.dense1 = tf.keras.layers.Dense(128, activation='relu')
#         self.dense2 = tf.keras.layers.Dense(64, activation='relu')
#         self.output_layer = tf.keras.layers.Dense(num_quantiles)

#     def call(self, inputs):
#         x = self.dense1(inputs)
#         x = self.dense2(x)
#         return self.output_layer(x)

# def quantile_loss(quantile):
#     def loss(y_true, y_pred):
#         error = y_true - y_pred
#         return tf.reduce_mean(tf.maximum(quantile * error, (quantile - 1) * error))
#     return loss


In [None]:
# Create model
from tensorflow.keras.regularizers import l2

def create_model(input_dim=None):
  """Build the (uncompiled) dense regression network.

  The stack is six ReLU ``Dense`` layers of 64, 32, 16, 16, 8 and 8 units, each
  with an L2 kernel penalty of 0.01, followed by a single linear output unit.

  Args:
    input_dim: Number of input features. When omitted it falls back to
      ``X_train.shape[1]`` from the notebook's global scope, which is the only
      behaviour the function had before this parameter existed.

  Returns:
    An uncompiled ``tf.keras.models.Sequential`` model.
  """
  if input_dim is None:
    input_dim = X_train.shape[1]

  model = tf.keras.models.Sequential()

  # Declare the input with an explicit Input object. Passing `input_dim` to the
  # first Dense layer is deprecated in recent Keras and emits a UserWarning
  # when the model is built.
  model.add(tf.keras.Input(shape=(input_dim,)))

  for units in (64, 32, 16, 16, 8, 8):
    model.add(tf.keras.layers.Dense(units, activation='relu', kernel_regularizer=l2(0.01)))
  model.add(tf.keras.layers.Dense(1, activation='linear'))

  return model

In [None]:
# Instantiate the network and configure training: RMSprop (lr=0.001),
# Huber loss, with MAE reported as a metric.
model = create_model()

model.compile(
    loss='huber',
    metrics=['mae'],
    optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.001),
)

model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Model Train
# Stop once val_loss has not improved for 10 epochs and restore the best weights.
# NOTE(review): the test split doubles as the validation set here, so early
# stopping is tuned on the same rows later used for evaluation - confirm intent.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=800,
    batch_size=32,
    callbacks=[early_stop],
)
# history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=800, batch_size=32)

Epoch 1/800
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - loss: 12311953.0000 - mae: 12311952.0000 - val_loss: 12066505.0000 - val_mae: 12066505.0000
Epoch 2/800
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 12237143.0000 - mae: 12237143.0000 - val_loss: 12066498.0000 - val_mae: 12066498.0000
Epoch 3/800
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 12457755.0000 - mae: 12457756.0000 - val_loss: 12066472.0000 - val_mae: 12066473.0000
Epoch 4/800
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 12429850.0000 - mae: 12429850.0000 - val_loss: 12066398.0000 - val_mae: 12066398.0000
Epoch 5/800
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 12159593.0000 - mae: 12159593.0000 - val_loss: 12066230.0000 - val_mae: 12066230.0000
Epoch 6/800
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 12413547.00

In [None]:
# Persist the trained Keras model to a .h5 file.
model.save(filepath='mbahlaptop_V0.1.h5')



In [None]:
# Convert the in-memory Keras model to TFLite and write the bytes to disk.
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

with open('mbahlaptop.tflite', mode='wb') as f:
  f.write(tflite_model)

Saved artifact at '/tmp/tmpbepj3d2x'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 254), dtype=tf.float32, name='keras_tensor_72')
Output Type:
  TensorSpec(shape=(None, 1), dtype=tf.float32, name=None)
Captures:
  134279335676912: TensorSpec(shape=(), dtype=tf.resource, name=None)
  134279335686240: TensorSpec(shape=(), dtype=tf.resource, name=None)
  134279491161600: TensorSpec(shape=(), dtype=tf.resource, name=None)
  134279491169344: TensorSpec(shape=(), dtype=tf.resource, name=None)
  134279501239696: TensorSpec(shape=(), dtype=tf.resource, name=None)
  134279490976432: TensorSpec(shape=(), dtype=tf.resource, name=None)
  134279517832288: TensorSpec(shape=(), dtype=tf.resource, name=None)
  134279491676368: TensorSpec(shape=(), dtype=tf.resource, name=None)
  134279492476720: TensorSpec(shape=(), dtype=tf.resource, name=None)
  134279490524032: TensorSpec(shape=(), dtype=tf.resource, name=None)
  134279525452432: Te