In [None]:
# Mount Google Drive at /content/drive (uses the Colab-specific google.colab
# helper) so files under /content/drive/MyDrive can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import time
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
#from datasets import Dataset
import torch


# Checkpoint used for sentiment scoring; its labels look like "5 stars"
# (see the parsing in get_sentiment_scores_bert).
# Languages noted for this model: English, Dutch, German, French, Spanish, and Italian.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Pipeline device index: 0 selects the first CUDA GPU, -1 means CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Sentiment-analysis pipeline; inputs longer than 512 tokens are truncated.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=device,
    batch_size=8,
    truncation=True,
    max_length=512,
)

# Shorten a text so that it fits the model's token limit
def truncate_text(text, max_length=512):
    """Return ``text`` cut down to at most ``max_length`` tokens.

    The number of special tokens the module-level ``tokenizer`` adds to a
    single sequence is subtracted from ``max_length`` first, so the limit
    applies to the final encoded input rather than to the raw tokens.

    Args:
        text: The string to shorten.
        max_length: Token limit, special tokens included.

    Returns:
        The string rebuilt from the tokens that were kept. NOTE(review): the
        text is reconstructed from tokens, so it may not be character-identical
        to the input even when nothing was cut.
    """
    # Room left for ordinary tokens once the special tokens are reserved.
    token_budget = max_length - tokenizer.num_special_tokens_to_add(pair=False)

    # Slicing is a no-op when the text already fits within the budget.
    kept_tokens = tokenizer.tokenize(text)[:token_budget]

    return tokenizer.convert_tokens_to_string(kept_tokens)


# Function to get sentiment scores with logging every 10,000 rows
def get_sentiment_scores_bert(df, classifier=None, log_every=10000):
    """Score every row of ``df['clean_comment']`` as an integer star rating.

    Texts are handed to the classifier as lists (one list per chunk of
    ``log_every`` rows) instead of one string per call, so a pipeline that was
    built with a ``batch_size`` can actually batch its forward passes.

    Missing comments (``None`` / NaN / ``pd.NA``) are not sent to the
    classifier; their score is ``None``. Any other non-string value is
    converted with ``str()`` before scoring.

    Args:
        df: DataFrame containing a ``'clean_comment'`` column.
        classifier: Callable that takes a list of strings and returns one dict
            per string, each with a ``'label'`` such as ``"5 stars"``.
            Defaults to the module-level ``sentiment_pipeline``.
        log_every: Rows per chunk; a progress line is printed after each full
            chunk. Must be at least 1.

    Returns:
        A list with one entry per row of ``df`` in row order: the integer
        parsed from the label, or ``None`` where the comment was missing.

    Raises:
        ValueError: If ``log_every`` is smaller than 1.
        RuntimeError: If the classifier returns a different number of results
            than the number of texts it was given.
    """
    if log_every < 1:
        raise ValueError("log_every must be at least 1")
    if classifier is None:
        # Resolved at call time so the function can be defined before the pipeline.
        classifier = sentiment_pipeline

    comments = df['clean_comment'].tolist()
    sentiment_scores = [None] * len(comments)
    start_time = time.time()

    for chunk_start in range(0, len(comments), log_every):
        chunk_end = min(chunk_start + log_every, len(comments))

        # Collect the scorable texts of this chunk together with their row positions.
        positions = []
        texts = []
        for position in range(chunk_start, chunk_end):
            value = comments[position]
            if not isinstance(value, str):
                if pd.isna(value):
                    continue  # missing comment: leave the score as None
                value = str(value)
            positions.append(position)
            texts.append(value)

        if texts:
            results = classifier(texts)
            if len(results) != len(texts):
                raise RuntimeError(
                    f"Classifier returned {len(results)} results for {len(texts)} texts"
                )
            for position, result in zip(positions, results):
                # Convert "5 stars" -> 5
                sentiment_scores[position] = int(result['label'].split()[0])

        # Only full chunks end on a multiple of log_every, matching the
        # "every N rows" progress cadence.
        if chunk_end % log_every == 0:
            elapsed_time = time.time() - start_time
            print(f"Processed {chunk_end} rows. Time elapsed: {elapsed_time:.2f} seconds")

    return sentiment_scores

# CSV on the mounted Drive; it is both the input and, at the end of this
# cell, the output (the file is overwritten with the extra score column).
file_path = '/content/drive/MyDrive/full_cleaned.csv'

# Load the comments and show the column summary
filter_df = pd.read_csv(file_path)
filter_df.info()

# Add the BERT star rating for every comment as a new column
filter_df = filter_df.assign(
    sentiment_score_bert=get_sentiment_scores_bert(filter_df)
)

# Show the scored frame
print(filter_df)

# Persist the scored frame back to the same path, without the index column
filter_df.to_csv(file_path, index=False)

In [None]:
import pandas as pd

# Uses `filter_df` from the scoring cell, which must contain:
# 'stars' - actual ratings (numeric)
# 'sentiment_score_bert' - generated sentiment scores (numeric)

# Correlation matrix restricted to the two rating columns
rating_columns = ['stars', 'sentiment_score_bert']
correlation = filter_df[rating_columns].corr()

# Show the full 2x2 matrix
print(correlation)

# Pull out the single off-diagonal value: actual vs. generated ratings
pearson_corr = correlation.at['stars', 'sentiment_score_bert']
print(f"Pearson correlation between actual stars and BERT-generated stars: {pearson_corr}")

                         stars  sentiment_score_bert
stars                 1.000000              0.514099
sentiment_score_bert  0.514099              1.000000
Pearson correlation between actual stars and BERT-generated stars: 0.5140989053352806
