In [15]:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Load the product listings; the 'product' column (used below) holds the text to cluster
# Replace 'Product listing.csv' with the actual file path if the file lives elsewhere
data = pd.read_csv('Product listing.csv')

# Data preprocessing
def preprocess_text(text):
    """Lowercase *text*, split it on whitespace and drop common stopwords.

    Args:
        text: The raw string to clean. ``None`` and float NaN (how pandas
            represents an empty CSV cell) are treated as missing; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The remaining tokens joined by single spaces, or ``''`` when the
        input is missing or contains only stopwords/whitespace.
    """
    # Missing cells arrive as None or NaN, which have no .lower(); NaN is the
    # only float that is not equal to itself, so no pandas call is needed here.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Convert text to lowercase (str() guards against numeric cells)
    text = str(text).lower()
    # Tokenization can be done using regex or libraries like NLTK or spaCy
    # Here, a simple split on whitespace is used
    tokens = text.split()
    # Remove stopwords (you may need to download the stopwords list for your language)
    stopwords = {'the', 'and', 'is', 'in', 'to', 'it', 'this', 'of', 'for', 'with', 'as'}
    return ' '.join(token for token in tokens if token not in stopwords)

# Clean every product string and keep the result alongside the raw text
cleaned_products = data['product'].apply(preprocess_text)
data['clean_text'] = cleaned_products

# TF-IDF vectorization
# max_features caps the vocabulary size; adjust it as needed
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_products)

# Clustering with K-means
k = 5  # Number of clusters (you can adjust this)
# n_init is pinned explicitly: its default changed from 10 to 'auto' in
# scikit-learn 1.4, so leaving it unset makes the clusters depend on the
# installed version (and triggers a FutureWarning on 1.2/1.3).
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
kmeans.fit(tfidf_matrix)

# Assign cluster labels to each product listing
data['cluster_label'] = kmeans.labels_

# Evaluate clustering using silhouette score
# (ranges from -1 to 1; values near 0 indicate overlapping clusters)
silhouette_avg = silhouette_score(tfidf_matrix, kmeans.labels_)
print(f"Silhouette Score: {silhouette_avg}")

# Display some products from each cluster
samples_per_cluster = 5  # Upper bound on how many rows to show per cluster
for cluster_id in range(k):
    cluster_rows = data[data['cluster_label'] == cluster_id]
    # DataFrame.sample raises ValueError when asked for more rows than exist,
    # so cap the request at the cluster size (small clusters are shown whole).
    cluster_samples = cluster_rows.sample(min(samples_per_cluster, len(cluster_rows)))
    print(f"\nCluster {cluster_id}:")
    # Only the 'product' column is printed, so iterate it directly
    for product in cluster_samples['product']:
        print(product)
        print('-' * 50)

# You can further analyze the clusters and refine the process as needed


Silhouette Score: 0.057004055728191866

Cluster 0:
Xifo EL y30 4G Volte Smartphone (3GB, 32GB) in Black
--------------------------------------------------
Logitech M90 Wired USB Mouse, 1000 DPI Optical Tracking, Ambidextrous PC/Mac/Laptop - Black
--------------------------------------------------
Redgear A-20 Wired Gaming Mouse with RGB and Upto 4800 dpi for Windows PC Gamers.
--------------------------------------------------
Adcom AD-12526 USB Wired 3D Optical Mouse (Black/Orange, Small Size)
--------------------------------------------------
Trend Micro Maximum Security 2020 - Global Version (Windows/Mac/Android/iOS) - 3 User, 1 Year (Email Delivery in 2 Hours - No CD)
--------------------------------------------------

Cluster 1:
Decdeal Mini 58mm Embedded Thermal Printer Module POS Receipt Ticket Barcode Printer Support ESC/POS Print Command with USB/RS232/TTL Interface
--------------------------------------------------
Redmi SonicBass Wireless Earphones with Dual-Mic Noise Cancel