In [None]:
!pip install sentence_transformers textstat

Collecting sentence_transformers
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Collecting textstat
  Downloading textstat-0.7.4-py3-none-any.whl.metadata (14 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.16.0-py3-none-any.whl.metadata (3.2 kB)
Downloading sentence_transformers-3.1.1-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.3/245.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading textstat-0.7.4-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.16.0-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat, sentence_transformers
Successfully installed pyphen-0.16.0 sentence_transformers-3.1.1 textstat-0.7.4


In [None]:
import pandas as pd
# Read the first review set and drop its 'Unnamed: 0' column
# (presumably an index that was written out when the CSV was saved).
df = pd.read_csv('orgsupp (1).csv').drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,product_name,rating,product_review
0,Mag-G Tablets Magnesium Gluconate Dietary Supp...,5,This order came very promptly and there were n...
1,Mag-G Tablets Magnesium Gluconate Dietary Supp...,5,"Very pleased with this product,received on tim..."
2,ACTIF Bariatric Probiotic Maximum Strength,5,I am new to this but I love love their aminos ...
3,ACTIF Bariatric Probiotic Maximum Strength,5,This product works wonders and tastes amazing ...
4,ACTIF Bariatric Probiotic Maximum Strength,5,Lo recomiendo 100% ! Todos los sabores me han ...


In [None]:
# Read the second review set and rename its columns to the snake_case
# names used by `df`, so the two frames can be joined on shared keys.
df2 = pd.read_csv('gensupp.csv').rename(
    columns={
        'Review': 'product_review',
        'Product Name': 'product_name',
        'Rating': 'rating',
    }
)
df2.head()

Unnamed: 0,product_name,rating,product_review
0,Chewdia The Hoodia Gum,4,This gum is a game-changer! I've been trying t...
1,Major Vitamin D3 2000 IU,2,"Okay, here are two new reviews reflecting a 2-..."
2,Preventix Liposomal Vitamin D3 K2,4,Noticed a difference in my energy levels since...
3,Super Brain Complex,3,"It's not bad, but it's not a miracle worker ei..."
4,Micellized Vitamin A,4,This vitamin A is easy to take and I’ve notice...


In [None]:
# Inner-join the two review sets on (product_name, rating). Rows sharing a key are
# paired in every combination, and the overlapping `product_review` column is kept
# twice, suffixed `_df` (left frame) and `_df2` (right frame).
merged_df = df.merge(
    df2,
    on=['product_name', 'rating'],
    suffixes=('_df', '_df2'),
)  # 362

In [None]:
# Give the suffixed review columns their final names: the left frame's text stays
# `product_review`, the right frame's text becomes `synthetic_review`.
merged_df = merged_df.rename(
    columns={
        'product_review_df': 'product_review',
        'product_review_df2': 'synthetic_review',
    }
)

In [None]:
import pandas as pd
# NOTE(review): this rebinds `df`, replacing the frame loaded from 'orgsupp (1).csv'.
# Re-running the merge cell after this one would join different data — confirm this is intended.
df = pd.read_csv('rev3.csv')
# Copy `product_review` into `synthetic_review`, so both columns hold identical text.
# NOTE(review): presumably a self-comparison baseline — confirm.
df['synthetic_review'] = df['product_review']

In [None]:
import pandas as pd
import textstat
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Pre-trained sentence-embedding model; `compute_semantic_similarity` reads it
# through the module-level name `model`.
MODEL_NAME = 'paraphrase-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)

# Function to add a whitespace-delimited word count for each text
def compute_sentence_length(df, text_column):
    """Add a ``sentence_length`` column holding the word count of each text.

    Each value of ``df[text_column]`` is converted with ``str()`` and split on
    whitespace; the number of resulting tokens is stored.

    Args:
        df: DataFrame to annotate. It is modified in place.
        text_column: Name of the column containing the text.

    Returns:
        The same ``df`` object, with the ``sentence_length`` column added.
    """
    def _word_count(value):
        # str() keeps non-string cells (numbers, missing values) from raising.
        tokens = str(value).split()
        return len(tokens)

    word_counts = df[text_column].apply(_word_count)
    df['sentence_length'] = word_counts
    return df

# Function to compute the Flesch Reading Ease score of each text
def compute_readability(df, text_column):
    """Add a ``readability_score`` column with each text's Flesch Reading Ease.

    The score comes from ``textstat.flesch_reading_ease`` (the reading-ease
    score, not the Flesch-Kincaid grade level), applied to ``str(value)`` for
    every value in ``df[text_column]``.

    Args:
        df: DataFrame to annotate. It is modified in place.
        text_column: Name of the column containing the text.

    Returns:
        The same ``df`` object, with the ``readability_score`` column added.
    """
    def _reading_ease(value):
        # str() keeps non-string cells from raising inside textstat.
        return textstat.flesch_reading_ease(str(value))

    scores = df[text_column].apply(_reading_ease)
    df['readability_score'] = scores
    return df

# Function to compute semantic similarity between original and synthetic texts
def compute_semantic_similarity(df, original_text_column, synthetic_text_column):
    """Add a ``semantic_similarity`` column: row-wise cosine similarity of embeddings.

    For every row, the text in ``original_text_column`` and the text in
    ``synthetic_text_column`` are embedded with the module-level ``model`` and
    the cosine similarity of the two vectors is stored.

    Only the n matching pairs are compared; no n x n similarity matrix is
    built, so memory and time grow linearly with the number of rows.

    Args:
        df: DataFrame to annotate. It is modified in place.
        original_text_column: Name of the column holding the original texts.
        synthetic_text_column: Name of the column holding the synthetic texts.

    Returns:
        The same ``df`` object, with the ``semantic_similarity`` column added.
        An empty ``df`` gets an empty float column.
    """
    # Local import: the notebook's import cell does not import numpy itself.
    import numpy as np

    # Coerce with str(), as the other helpers do, so non-string cells
    # (e.g. missing values) do not crash the tokenizer.
    original_texts = [str(value) for value in df[original_text_column]]
    synthetic_texts = [str(value) for value in df[synthetic_text_column]]

    # Nothing to encode: still create the column so callers can rely on it.
    if not original_texts:
        df['semantic_similarity'] = np.array([], dtype=float)
        return df

    original_embeddings = np.asarray(
        model.encode(original_texts, convert_to_numpy=True), dtype=float
    )
    synthetic_embeddings = np.asarray(
        model.encode(synthetic_texts, convert_to_numpy=True), dtype=float
    )

    # Row-wise cosine: dot(a_i, b_i) / (|a_i| * |b_i|).
    dot_products = (original_embeddings * synthetic_embeddings).sum(axis=1)
    norm_products = (
        np.linalg.norm(original_embeddings, axis=1)
        * np.linalg.norm(synthetic_embeddings, axis=1)
    )
    # A zero-length vector yields similarity 0 instead of a division-by-zero NaN.
    similarities = np.divide(
        dot_products,
        norm_products,
        out=np.zeros_like(dot_products),
        where=norm_products > 0,
    )

    df['semantic_similarity'] = similarities
    return df

# Function to compute statistics for each product_name-rating pair and compare
def evaluate_and_compare_datasets(df, product_name_col, rating_col, review_col, synthetic_review_col):
    """Summarise original vs. synthetic reviews per (product, rating) group.

    For every group of rows sharing ``product_name_col`` and ``rating_col``,
    this computes the mean word count and mean readability score of the
    original and the synthetic review columns, and the mean row-wise semantic
    similarity between them.

    Args:
        df: DataFrame containing the key and review columns. It is not modified;
            every helper works on a copy of the group.
        product_name_col: Name of the product-name column (first grouping key).
        rating_col: Name of the rating column (second grouping key).
        review_col: Name of the column holding the original review text.
        synthetic_review_col: Name of the column holding the synthetic review text.

    Returns:
        A DataFrame with one row per group and the columns ``product_name``,
        ``rating``, ``original_avg_sentence_length``,
        ``synthetic_avg_sentence_length``, ``original_avg_readability_score``,
        ``synthetic_avg_readability_score`` and ``avg_semantic_similarity``.
        When ``df`` has no groups the result is empty but keeps these columns.
    """
    # Fixed output schema, so an empty input still produces a usable frame.
    result_columns = [
        'product_name',
        'rating',
        'original_avg_sentence_length',
        'synthetic_avg_sentence_length',
        'original_avg_readability_score',
        'synthetic_avg_readability_score',
        'avg_semantic_similarity',
    ]

    results = []

    # Iterate through each (product_name, rating) group and compute statistics
    for (product_name, rating), group in df.groupby([product_name_col, rating_col]):
        # The helpers write fixed column names in place, so each measurement
        # gets its own copy of the group; the caller's frame is never touched.
        group_original = compute_sentence_length(group.copy(), review_col)
        group_original = compute_readability(group_original, review_col)

        group_synthetic = compute_sentence_length(group.copy(), synthetic_review_col)
        group_synthetic = compute_readability(group_synthetic, synthetic_review_col)

        # Copy here as well: writing onto the groupby slice would trigger
        # pandas' SettingWithCopy warning.
        group_similarity = compute_semantic_similarity(group.copy(), review_col, synthetic_review_col)

        results.append({
            'product_name': product_name,
            'rating': rating,
            'original_avg_sentence_length': group_original['sentence_length'].mean(),
            'synthetic_avg_sentence_length': group_synthetic['sentence_length'].mean(),
            'original_avg_readability_score': group_original['readability_score'].mean(),
            'synthetic_avg_readability_score': group_synthetic['readability_score'].mean(),
            'avg_semantic_similarity': group_similarity['semantic_similarity'].mean(),
        })

    # Return results as a DataFrame
    return pd.DataFrame(results, columns=result_columns)

# Example usage
# Assuming the dataframe has columns: 'product_name', 'rating', 'product_review', 'synthetic_review'
# df = pd.DataFrame({
#    'product_name': ['Product A', 'Product A', 'Product B', 'Product B'],
#    'rating': [5, 5, 3, 3],
#    'product_review': ['This product is excellent!', 'Loved it!', 'It’s okay.', 'Not bad.'],
#    'synthetic_review': ['This is a great product!', 'Really liked it!', 'It’s fine.', 'Good enough.']
# })
# result_df = evaluate_and_compare_datasets(df, 'product_name', 'rating', 'product_review', 'synthetic_review')
# print(result_df)


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Compare original and synthetic reviews for every (product_name, rating) group.
result_df = evaluate_and_compare_datasets(
    merged_df,
    product_name_col='product_name',
    rating_col='rating',
    review_col='product_review',
    synthetic_review_col='synthetic_review',
)

In [None]:
# Display the comparison table returned by evaluate_and_compare_datasets.
result_df

Unnamed: 0,product_name,rating,original_avg_sentence_length,synthetic_avg_sentence_length,original_avg_readability_score,synthetic_avg_readability_score,avg_semantic_similarity
0,21st Century Niacinamide 500 mg Tablets,5,48.0,16.5,61.73,80.025,0.073628
1,ACTIF Bariatric Probiotic Maximum Strength,5,30.25,38.0,71.4375,59.975,0.259224
2,Aulterra Enhance Supplement to Neutralize EMF's,4,17.0,14.5,62.68,73.51,0.178562
3,BioRebalance Restore,2,180.0,49.5,77.57,60.415,0.445584
4,BioRebalance Restore,3,42.0,62.5,82.65,67.35,0.4335
5,BioRebalance Restore,4,136.75,44.5,73.6875,71.22,0.429825
6,BioRebalance Restore,5,140.2,44.0,73.25,66.995,0.409263
7,Gold Medal Swiss Placenta 450mg,2,5.0,60.0,49.48,80.445,0.196032
8,Gold Medal Swiss Placenta 450mg,5,14.5,44.0,78.585,76.875,0.303509
9,Mag-G Tablets Magnesium Gluconate Dietary Supp...,5,10.5,17.5,77.74,66.825,0.181221
