In [None]:
import pandas as pd
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler
import numpy as np



In [None]:
# Read the Flipkart smartphone listings from CSV and preview the first rows
file_path = 'flipkart_smartphones.csv'
data = pd.read_csv(filepath_or_buffer=file_path)
data.head(n=5)

Unnamed: 0,brand,model,colour,original_price,discounted_price,ratings,rating_count,reviews,memory,storage,processor,rear_camera,front_camera,display_size,battery_capacity,battery_type
0,VIVO,VIVO T1 44W,Starry Sky,19990,14499,4.5,87331,6044,4.0,128.0,Qualcomm Snapdragon 680,50MP + 2MP + 2MP,16MP,16.36,5000.0,Lithium
1,APPLE,APPLE IPHONE 11,White,48900,47199,4.6,184191,10818,,128.0,A Bionic Chip,12MP + 12MP,12MP,15.49,,
2,VIVO,VIVO T1 44W,Midnight Galaxy,20990,15999,4.4,51365,3750,6.0,128.0,Qualcomm Snapdragon 680,50MP + 2MP + 2MP,16MP,16.36,5000.0,Lithium
3,XIAOMI,POCO M4 5G,Power Black,15999,11999,4.2,53448,4185,4.0,64.0,Mediatek Dimensity 700,50MP + 2MP,8MP,16.71,5000.0,Lithium Polymer
4,XIAOMI,REDMI 10,Caribbean Green,14999,9299,4.3,187787,12084,4.0,64.0,Qualcomm Snapdragon 680,50MP + 2MP,5MP,17.02,6000.0,Lithium Polymer


In [None]:
# Summarize the frame (per-column non-null counts and dtypes) to see which
# columns contain missing values before imputing them.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 836 entries, 0 to 835
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   brand             836 non-null    object 
 1   model             836 non-null    object 
 2   colour            831 non-null    object 
 3   original_price    836 non-null    int64  
 4   discounted_price  836 non-null    int64  
 5   ratings           836 non-null    float64
 6   rating_count      836 non-null    int64  
 7   reviews           836 non-null    int64  
 8   memory            757 non-null    float64
 9   storage           820 non-null    float64
 10  processor         698 non-null    object 
 11  rear_camera       836 non-null    object 
 12  front_camera      707 non-null    object 
 13  display_size      836 non-null    float64
 14  battery_capacity  778 non-null    float64
 15  battery_type      477 non-null    object 
dtypes: float64(5), int64(4), object(7)
memory us

In [None]:
# Fill missing values for numerical columns with the median.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the `data[column]` selection: that chained in-place
# form emits a FutureWarning on pandas 2.x and leaves `data` unchanged once
# Copy-on-Write is active (the default from pandas 3.0).
numerical_columns = ['memory', 'storage', 'battery_capacity']
for column in numerical_columns:
    data[column] = data[column].fillna(data[column].median())

# Fill missing values for categorical columns with the mode.
# mode() can return several values on ties; [0] picks the first one.
categorical_columns = ['colour', 'processor', 'front_camera', 'battery_type']
for column in categorical_columns:
    data[column] = data[column].fillna(data[column].mode()[0])

# Exchange rate from INR to Euro (fixed, hard-coded approximation)
exchange_rate = 0.012

# Convert the original and discounted prices from INR to Euro
data['price_euro'] = data['original_price'] * exchange_rate
data['discounted_price_euro'] = data['discounted_price'] * exchange_rate

# Normalize popularity features to [0, 1] using Min-Max scaling.
# Output column order mirrors the input order:
# reviews -> normalized_review_count, rating_count -> normalized_rating_count.
scaler = MinMaxScaler()
data[['normalized_review_count', 'normalized_rating_count']] = scaler.fit_transform(data[['reviews', 'rating_count']])

data.head()

Unnamed: 0,brand,model,colour,original_price,discounted_price,ratings,rating_count,reviews,memory,storage,processor,rear_camera,front_camera,display_size,battery_capacity,battery_type,price_euro,discounted_price_euro,normalized_review_count,normalized_rating_count
0,VIVO,VIVO T1 44W,Starry Sky,19990,14499,4.5,87331,6044,4.0,128.0,Qualcomm Snapdragon 680,50MP + 2MP + 2MP,16MP,16.36,5000.0,Lithium,239.88,173.988,0.049358,0.074533
1,APPLE,APPLE IPHONE 11,White,48900,47199,4.6,184191,10818,6.0,128.0,A Bionic Chip,12MP + 12MP,12MP,15.49,5000.0,Lithium,586.8,566.388,0.088344,0.157199
2,VIVO,VIVO T1 44W,Midnight Galaxy,20990,15999,4.4,51365,3750,6.0,128.0,Qualcomm Snapdragon 680,50MP + 2MP + 2MP,16MP,16.36,5000.0,Lithium,251.88,191.988,0.030624,0.043838
3,XIAOMI,POCO M4 5G,Power Black,15999,11999,4.2,53448,4185,4.0,64.0,Mediatek Dimensity 700,50MP + 2MP,8MP,16.71,5000.0,Lithium Polymer,191.988,143.988,0.034176,0.045616
4,XIAOMI,REDMI 10,Caribbean Green,14999,9299,4.3,187787,12084,4.0,64.0,Qualcomm Snapdragon 680,50MP + 2MP,5MP,17.02,6000.0,Lithium Polymer,179.988,111.588,0.098683,0.160268


In [None]:
# Text normalization helper used before embedding
def normalize_text(text):
    """Lower-case ``text`` and collapse every whitespace run to one space.

    Leading and trailing whitespace is removed; the remaining tokens are
    re-joined with single spaces.
    """
    lowered = text.lower()
    tokens = lowered.strip().split()
    return " ".join(tokens)

# Normalize and combine relevant text columns for embedding with labels and units.
# Each row becomes one labelled, lower-cased description string.
# Unit notes:
#   * rear_camera / front_camera values already include the "MP" suffix
#     (e.g. "50MP + 2MP + 2MP", "16MP"), so no extra " MP" is appended.
#   * display_size values (e.g. 16.36, 15.49) are smartphone diagonals in
#     centimetres (15.49 cm is roughly 6.1 in), so they are labelled "cm".
#   * rating_count counts ratings, not written reviews.
data['combined_features'] = data.apply(lambda row: normalize_text(
    f"Brand: {row['brand']}, this is the brand of the smartphone. "
    f"Model: {row['model']}, this is the model of the smartphone. "
    f"Color: {row['colour']}, the color of the smartphone is {row['colour']}. "
    f"Processor: {row['processor']}, which powers the smartphone. "
    f"Rear Camera: {row['rear_camera']}, the resolution of the rear camera. "
    f"Front Camera: {row['front_camera']}, the resolution of the front camera. "
    f"Display Size: {row['display_size']} cm, this is the size of the smartphone display. "
    f"Memory: {row['memory']} GB, the amount of RAM. "
    f"Storage: {row['storage']} GB, the internal storage capacity. "
    f"Battery Capacity: {row['battery_capacity']} mAh, this is the battery capacity. "
    f"Battery Type: {row['battery_type']}, the type of battery used. "
    f"Price: {row['price_euro']:.2f} Euro, this is the price in Euro. "
    f"Ratings: {row['ratings']}, the average user rating. "
    f"Rating Count: {row['rating_count']} ratings, the number of user ratings. "
    f"Review Count: {row['reviews']} reviews, the number of user reviews."
), axis=1)


# Sentence-BERT checkpoint used to embed the product descriptions
model = SentenceTransformer('bert-base-nli-mean-tokens')

# Encode every row's combined description into a dense vector
embeddings = model.encode(data['combined_features'].to_list())

# Persist the embedding matrix so it can be reloaded without re-encoding
np.save(file='smartphone_embeddings.npy', arr=embeddings)

# Sanity check: prints (number of rows, embedding dimension)
print(embeddings.shape)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

(836, 768)
