In [5]:
import pandas as pd
import os
import re
import nltk
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)

In [6]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english')) & set(stopwords.words('russian'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [8]:
df_amazon = pd.read_csv('data/amazon.csv')
df_amazon.columns

Index(['product_id', 'product_name', 'category', 'discounted_price',
       'actual_price', 'discount_percentage', 'rating', 'rating_count',
       'about_product', 'user_id', 'user_name', 'review_id', 'review_title',
       'review_content', 'img_link', 'product_link'],
      dtype='object')

In [9]:
df_amazon = df_amazon.rename(columns={'product_id': 'product_id', 'product_name': 'product_name', 'about_product': 'product_description'})
df_amazon.head(2)

Unnamed: 0,product_id,product_name,category,discounted_price,actual_price,discount_percentage,rating,rating_count,product_description,user_id,user_name,review_id,review_title,review_content,img_link,product_link
0,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,Computers&Accessories|Accessories&Peripherals|...,₹399,"₹1,099",64%,4.2,24269,High Compatibility : Compatible With iPhone 12...,"AG3D6O4STAQKAY2UVGEUV46KN35Q,AHMY5CWJMMK5BJRBB...","Manav,Adarsh gupta,Sundeep,S.Sayeed Ahmed,jasp...","R3HXWT0LRP0NMF,R2AJM3LFTLZHFO,R6AQJGUP6P86,R1K...","Satisfied,Charging is really fast,Value for mo...",Looks durable Charging is fine tooNo complains...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...
1,B098NS6PVG,Ambrane Unbreakable 60W / 3A Fast Charging 1.5...,Computers&Accessories|Accessories&Peripherals|...,₹199,₹349,43%,4.0,43994,"Compatible with all Type C enabled devices, be...","AECPFYFQVRUWC3KGNLJIOREFP5LQ,AGYYVPDD7YG7FYNBX...","ArdKn,Nirbhay kumar,Sagar Viswanathan,Asp,Plac...","RGIQEG07R9HS2,R1SMWZQ86XIN8U,R2J3Y1WL29GWDE,RY...","A Good Braided Cable for Your Type C Device,Go...",I ordered this cable to connect my phone to An...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Ambrane-Unbreakable-Char...


In [10]:
df_nike = pd.read_csv('data/NikeProductDescriptions.csv')
df_nike['product_id'] = range(1, len(df_nike) + 1)
df_nike.columns

Index(['Title', 'Subtitle', 'Product Description', 'product_id'], dtype='object')

In [11]:
df_nike = df_nike.rename(columns={'Title': 'product_name', 'Product Description': 'product_description'})
df_nike.head(2)

Unnamed: 0,product_name,Subtitle,product_description,product_id
0,Nike Air Force 1 '07,Men's Shoes,It doesn't get more legendary than this. Desig...,1
1,Nike Air Max Dawn SE,Men's Shoes,Find out what moves you with the Air Max Dawn....,2


In [12]:
important_columns = ['product_id','product_name','product_description']
dfs = [df_amazon[important_columns], df_nike[important_columns]]
all_data = pd.concat(dfs, ignore_index=True)
all_data

Unnamed: 0,product_id,product_name,product_description
0,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,High Compatibility : Compatible With iPhone 12...
1,B098NS6PVG,Ambrane Unbreakable 60W / 3A Fast Charging 1.5...,"Compatible with all Type C enabled devices, be..."
2,B096MSW6CT,Sounce Fast Phone Charging Cable & Data Sync U...,【 Fast Charger& Data Sync】-With built-in safet...
3,B08HDJ86NZ,boAt Deuce USB 300 2 in 1 Type-C & Micro USB S...,The boAt Deuce USB 300 2 in 1 cable is compati...
4,B08CF3B7N1,Portronics Konnect L 1.2M Fast Charging 3A 8 P...,[CHARGE & SYNC FUNCTION]- This cable comes wit...
...,...,...,...
1860,396,Nike Yoga Dri-FIT ADV,"Enjoy the soft, breathable experience of the N..."
1861,397,Nike Epic Fast,"Keep running, with the Nike Epic Fast Mid-rise..."
1862,398,Nike Sportswear A.I.R. Essential,We teamed up with Chicago-based artist Cody Hu...
1863,399,Nike ACG Dri-FIT One,Psst! Let's go and find that hidden treasure! ...


In [13]:
all_data.describe()

Unnamed: 0,product_id,product_name,product_description
count,1865,1865,1865
unique,1751,1591,1693
top,B07JW9H4J1,Nike Sportswear,[CHARGE & SYNC FUNCTION]- This cable comes wit...
freq,3,41,6


In [14]:
all_data = all_data.drop_duplicates(['product_name'])
all_data.describe()

Unnamed: 0,product_id,product_name,product_description
count,1591,1591,1591
unique,1591,1591,1538
top,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,Bluetooth Calling- Wave Call comes with a prem...
freq,1,1,4


In [15]:
all_data_numpy=all_data.sample(5).to_numpy()

for ind,name, desc in all_data_numpy:
    print(ind)

B0BMGB3CH9
B095X38CJS
B07QMRHWJD
B083GKDRKR
B0B6F8HHR6


In [16]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s.!?-]', '', text)
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [23]:
all_data.loc[:,'cleaned_description'] = all_data['product_description'].apply(preprocess_text)

In [18]:
all_data.sample(5)

Unnamed: 0,product_id,product_name,product_description,cleaned_description
526,B0B2DJ5RVQ,WeCool B1 Mobile Holder for Bikes or Bike Mobi...,Mobile Holder for Bike is an innovative soluti...,mobile holder for bike is an innovative soluti...
827,B09RKFBCV7,"Fire-Boltt Ninja Calling 1.69"" Bluetooth Calli...",Fire-Boltt is India' No 1 Wearable Watch Brand...,fire-boltt is india no 1 wearable watch brand ...
1419,B09FPP3R1D,"Glen 3 in 1 Electric Multi Cooker - Steam, Coo...",A compact multifunction appliance perfect for ...,a compact multifunction appliance perfect for ...
678,B08QJJCY2Q,Tizum Mouse Pad/ Computer Mouse Mat with Anti-...,9.4 Inches X 7.9 Inches) 240mm x 200mm x 2mm S...,9.4 inches x 7.9 inches 240mm x 200mm x 2mm si...
1201,B0BQ3K23Y1,"Oratech Coffee Frother electric, milk frother ...",-Make delicious milk foam creamer for your dri...,-make delicious milk foam creamer for your dri...


In [19]:
names = all_data['product_name'].apply(preprocess_text).tolist()
embeddings_names = model.encode(names, convert_to_tensor=True)

In [20]:
cosine_sim = cosine_similarity(embeddings_names, embeddings_names)

In [24]:
from sklearn.cluster import KMeans
num_clusters = 10
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(embeddings_names)

all_data.loc[:,'cluster'] = kmeans.labels_

for cluster_id in range(num_clusters):
    print(f"Кластер {cluster_id}:")
    cluster_titles = all_data[all_data['cluster'] == cluster_id]['product_name']
    for title in cluster_titles.head(10):
        print(f"  - {title}")
    print("\n")

Кластер 0:
  - MI 80 cm (32 inches) 5A Series HD Ready Smart Android LED TV L32M7-5AIN (Black)
  - LG 80 cm (32 inches) HD Ready Smart LED TV 32LM563BPTC (Dark Iron Gray)
  - Samsung 80 cm (32 Inches) Wondertainment Series HD Ready LED Smart TV UA32T4340BKXXL (Glossy Black)
  - Acer 80 cm (32 inches) I Series HD Ready Android Smart LED TV AR32AR2841HDFL (Black)
  - OnePlus 80 cm (32 inches) Y Series HD Ready LED Smart Android TV 32Y1 (Black)
  - OnePlus 126 cm (50 inches) Y Series 4K Ultra HD Smart Android LED TV 50Y1S Pro (Black)
  - Mi 108 cm (43 inches) Full HD Android LED TV 4C | L43M6-INC (Black)
  - VW 80 cm (32 inches) Frameless Series HD Ready LED TV VW32A (Black)
  - OnePlus 80 cm (32 inches) Y Series HD Ready Smart Android LED TV 32 Y1S (Black)
  - Samsung 108 cm (43 inches) Crystal 4K Neo Series Ultra HD Smart LED TV UA43AUE65AKXXL (Black)


Кластер 1:
  - Nike Air Force 1 '07
  - Nike Air Max Dawn SE
  - Nike SB Dunk Low Pro Premium
  - Nike Air Force 1 Mid '07 LX
  - Nike 

In [22]:
all_data[['product_id',	'product_name',	'product_description', 'cleaned_description']].to_csv('data/concat_data.csv', index=False)