In [None]:
from collections import defaultdict

import pandas as pd
from google.colab import drive
from google.colab import files
from transformers import pipeline

In [None]:
# Mount Google Drive at /content/drive so files stored there (such as the
# dataset CSV read in the next cell) are reachable through the Colab filesystem.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the dataset
dataset_path = '/content/drive/My Drive/Datasets/Walmart_Sparkathon/products_sales_reviews_diverse.csv'
df = pd.read_csv(dataset_path)

# Fail fast with a readable message if a column the analysis selects later is
# absent, instead of surfacing a bare KeyError further down the notebook.
required_columns = ['ProductID', 'ProductName', 'ReviewText', 'Category']
missing_columns = [column for column in required_columns if column not in df.columns]
if missing_columns:
    raise KeyError(f"{dataset_path} is missing required columns: {missing_columns}")

# Sanity check that the data is being accessed properly: a bare last
# expression gets the notebook's rich table rendering (print() only gives text).
df.head()

   ProductID          ProductName     Category  Price  QuantitySold  \
0          1  Wireless Headphones  Electronics  59.99           150   
1          2    Organic Green Tea    Beverages  14.99           200   
2          3             Yoga Mat       Sports  24.99           120   
3          4        LED Desk Lamp         Home  39.99            80   
4          5     Smartphone Stand  Accessories   9.99           300   

   TotalSales  ReviewRating                                         ReviewText  
0      8998.5             5  Absolutely love this product! Exceeded all my ...  
1      2998.0             4       Very good, but there's room for improvement.  
2      2998.8             4    Great product, but could use some enhancements.  
3      3199.2             3  Average quality, expected a bit more for the p...  
4      2997.0             2      Below average quality, wouldn't recommend it.  


In [None]:
# Select necessary columns
# .copy() makes the result an independent frame rather than a slice of the
# loaded data, so adding columns to it later does not raise pandas'
# SettingWithCopyWarning ("A value is trying to be set on a copy of a slice").
df = df[['ProductID', 'ProductName', 'ReviewText', 'Category']].copy()

# Preprocess data
def preprocess_data(text):
    """Normalise a single review before sentiment scoring.

    The text is lower-cased and every character that is neither alphanumeric
    nor whitespace is dropped (so "wouldn't" becomes "wouldnt").

    Args:
        text: The raw review. Normally a str; missing values (None, NaN,
            pd.NA) and other scalars are tolerated.

    Returns:
        str: The cleaned review. Missing values yield '' and other non-string
        scalars are converted with str() first, so a blank or numeric cell in
        the review column no longer raises AttributeError on .lower().
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)
    text = text.lower()
    return ''.join(char for char in text if char.isalnum() or char.isspace())

# Clean every review element-wise and keep the result next to the raw text.
df['cleaned_Review'] = df['ReviewText'].map(preprocess_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_Review'] = df['ReviewText'].apply(preprocess_data)


In [None]:
# Initialize sentiment analysis pipeline
# The checkpoint and revision are pinned explicitly; they are the ones the
# library reported falling back to when no model was supplied, so the scores
# stay the same while the notebook no longer depends on the task default.
# NOTE(review): newer transformers releases reportedly deprecate
# return_all_scores in favour of top_k=None; it is kept here because downstream
# code relies on the output shape it produces - confirm before switching.
sentiment_pipeline = pipeline(
    'sentiment-analysis',
    model='distilbert/distilbert-base-uncased-finetuned-sst-2-english',
    revision='af0f99b',
    return_all_scores=True,
)

# Pull the positive and negative class scores out of one pipeline result
def extract_scores(sentiment):
    """Return (positive_score, negative_score) for one scored text.

    Args:
        sentiment: Iterable of dicts, each holding a 'label' and a 'score'.

    Returns:
        tuple: The score stored under 'POSITIVE' and the score stored under
        'NEGATIVE'; a label that does not occur contributes 0. If a label
        occurs more than once, its last score is used.
    """
    positive_score = 0
    negative_score = 0
    for entry in sentiment:
        label, score = entry['label'], entry['score']
        if label == 'POSITIVE':
            positive_score = score
        elif label == 'NEGATIVE':
            negative_score = score
    return positive_score, negative_score

# Score all cleaned reviews with a single pipeline call instead of one call per
# row. For a list input the pipeline returns one list of label/score dicts per
# text, which is the shape extract_scores consumes.
review_texts = df['cleaned_Review'].tolist()

# truncation=True asks the tokenizer to cut over-long reviews down to the
# model's input limit instead of failing on them. The emptiness guard avoids
# calling the model with nothing to score.
raw_sentiments = sentiment_pipeline(review_texts, truncation=True) if review_texts else []
score_pairs = [extract_scores(sentiment) for sentiment in raw_sentiments]

# Assign the two columns separately: unpacking zip(*pairs) into two targets
# raises ValueError when there are no rows.
df['positive_score'] = [positive for positive, _ in score_pairs]
df['negative_score'] = [negative for _, negative in score_pairs]

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



In [None]:
# Net sentiment: twice the positive score minus one.
df['net_sentiment_score'] = df['positive_score'].mul(2).sub(1)

# Rescale the net score linearly onto a 1-10 scale:
# shift by +1, halve, stretch by 9, then shift by +1.
df['sentiment_score_1_10'] = df['net_sentiment_score'].add(1).div(2).mul(9).add(1)

# Collect the per-review scores under their product id (averaged later)
product_sentiment = defaultdict(list)

# Walk the needed columns in lockstep, one tuple per review row.
review_rows = zip(
    df['ProductID'],
    df['ProductName'],
    df['Category'],
    df['positive_score'],
    df['negative_score'],
    df['net_sentiment_score'],
    df['sentiment_score_1_10'],
)
for review_product_id, review_name, review_category, review_pos, review_neg, review_net, review_scaled in review_rows:
    review_entry = {
        'ProductName': review_name,
        'Category': review_category,
        'PositiveScore': review_pos,
        'NegativeScore': review_neg,
        'NetSentimentScore': review_net,
        'SentimentScore1_10': review_scaled,
    }
    product_sentiment[review_product_id].append(review_entry)

In [None]:
# Average each score over all reviews of a product.
# Each value is a tuple: (name, category, avg positive, avg negative,
# avg net sentiment, avg 1-10 score).
avg_product_sentiments = {}
averaged_keys = ('PositiveScore', 'NegativeScore', 'NetSentimentScore', 'SentimentScore1_10')
for product, entries in product_sentiment.items():
    review_count = len(entries)
    averages = tuple(
        sum(entry[key] for entry in entries) / review_count
        for key in averaged_keys
    )
    # Name and category are taken from the product's first recorded review.
    first_entry = entries[0]
    avg_product_sentiments[product] = (first_entry['ProductName'], first_entry['Category'], *averages)

In [None]:
# Order products from the highest to the lowest average 1-10 sentiment score
# (the last element of each product's tuple).
ranked_products = list(avg_product_sentiments.items())
ranked_products.sort(key=lambda item: item[1][-1], reverse=True)

# Flatten (product id, stats tuple) pairs into one row per product
ranked_columns = [
    'ProductId',
    'ProductName',
    'Category',
    'AvgPositiveScore',
    'AvgNegativeScore',
    'NetSentimentScore',
    'SentimentScore1_10',
]
ranked_rows = []
for product, (product_name, category, pos_score, neg_score, net_score, scale_score) in ranked_products:
    ranked_rows.append((product, product_name, category, pos_score, neg_score, net_score, scale_score))
ranked_df = pd.DataFrame(ranked_rows, columns=ranked_columns)

# Per-category mean of the per-product averages: every row of ranked_df
# contributes equally to its category's mean.
category_score_columns = ['AvgPositiveScore', 'AvgNegativeScore', 'NetSentimentScore', 'SentimentScore1_10']
category_means = ranked_df.groupby('Category')[category_score_columns].mean()
category_scores = category_means.reset_index()

In [None]:
# Write both result tables to CSV (without the index column)
csv_outputs = (
    (ranked_df, 'ranked_products_with_net_sentiments.csv'),
    (category_scores, 'category_avg_net_sentiments.csv'),
)
for result_frame, csv_name in csv_outputs:
    result_frame.to_csv(csv_name, index=False)

# Show the ten best-ranked products, then the per-category averages
print("Ranked Products based on 1-10 Sentiment Scores:", ranked_df.head(10), sep="\n")

print("\nCategory Average Sentiment Scores:", category_scores, sep="\n")

Ranked Products based on 1-10 Sentiment Scores:
   ProductId  ProductName     Category  AvgPositiveScore  AvgNegativeScore  \
0         14   Product 14         Home          0.999877          0.000123   
1         26   Product 26         Toys          0.999877          0.000123   
2         79   Product 79  Accessories          0.999877          0.000123   
3         89   Product 89        Books          0.999877          0.000123   
4        128  Product 128      Fashion          0.999877          0.000123   
5        138  Product 138         Home          0.999877          0.000123   
6        166  Product 166       Sports          0.999877          0.000123   
7        174  Product 174        Books          0.999877          0.000123   
8        182  Product 182  Accessories          0.999877          0.000123   
9        249  Product 249    Beverages          0.999877          0.000123   

   NetSentimentScore  SentimentScore1_10  
0           0.999754            9.998895  
1      

In [None]:
# Download files
# `files` is imported with the other google.colab imports in the notebook's
# first cell, so every dependency is declared in one place.
for csv_name in ('ranked_products_with_net_sentiments.csv', 'category_avg_net_sentiments.csv'):
    # Ask Colab to send the CSV written earlier to the user's browser.
    files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>