In [None]:
!pip install pandas vaderSentiment textblob transformers torch nltk


Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata 

In [None]:
# Install required libraries (uncomment if needed)
# !pip install pandas vaderSentiment textblob transformers torch nltk

import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from transformers import pipeline

# Load dataset
input_file = "filtered_deepseek_english_data.csv"  # Change to your file name

# Read the CSV, drop rows whose 'text' is missing, then normalise 'text' to a
# stripped string so the scoring functions always receive a str.
df = (
    pd.read_csv(input_file)
    .dropna(subset=["text"])
    .assign(text=lambda frame: frame["text"].astype(str).str.strip())
)

# Initialize sentiment analysis models
vader_analyzer = SentimentIntensityAnalyzer()

# Pin the checkpoint explicitly so re-runs keep using the same model. These are
# the model name and revision that the unpinned pipeline reported resolving to
# in this notebook's recorded output.
bert_analyzer = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="714eb0f",
)

# Function for VADER sentiment analysis
def get_vader_score(text):
    """Return the "compound" entry of VADER's polarity scores for ``text``."""
    polarity = vader_analyzer.polarity_scores(text)
    return polarity["compound"]

# Function for TextBlob sentiment analysis
def get_textblob_score(text):
    """Return the polarity component of TextBlob's sentiment for ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

# Function for BERT sentiment analysis
def get_bert_score(text):
    """Return a signed confidence score for ``text`` from the transformer classifier.

    Args:
        text: The string to classify.

    Returns:
        The classifier's score when the predicted label is "positive", the
        negated score when it is "negative", and 0.0 for any other label or
        for empty / whitespace-only input.
    """
    text = text.strip()
    if not text:  # nothing to classify
        return 0.0

    # Let the tokenizer truncate to the model's maximum length. Slicing the
    # string (text[:512]) limits characters, not tokens, so it neither matches
    # the model's limit nor guarantees the input fits.
    result = bert_analyzer(text, truncation=True)[0]
    label = result["label"].lower()
    score = result["score"]

    if label == "positive":
        return score
    if label == "negative":
        return -score
    return 0.0

# Score every text with each model; one new column per scorer, in this order
scorers = {
    "VADER Score": get_vader_score,
    "TextBlob Score": get_textblob_score,
    "BERT Score": get_bert_score,
}
for column_name, scorer in scorers.items():
    df[column_name] = df["text"].apply(scorer)

# Row-wise mean of the three model scores
score_columns = list(scorers)
df["Average Sentiment"] = df[score_columns].mean(axis=1)

# Restrict the frame to the identifier, the text and the scores
df = df[["pseudo_id", "text", *score_columns, "Average Sentiment"]]

# Write the scored frame to a new CSV file (without the index column)
output_file = "final_sentiment_analysis.csv"
df.to_csv(output_file, index=False)

print(f"Sentiment analysis completed and saved in '{output_file}'.")


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Sentiment analysis completed and saved in 'final_sentiment_analysis.csv'.


In [None]:
import pandas as pd

# Read back the per-model scores and their average from the scoring step's CSV
input_file = "final_sentiment_analysis.csv"
df = pd.read_csv(filepath_or_buffer=input_file)

# Define function to classify sentiment based on the average score
def classify_sentiment(score, threshold=0.05):
    """Map a numeric sentiment score to a label.

    Args:
        score: The sentiment score to classify.
        threshold: Half-width of the neutral band around zero. Scores strictly
            above ``threshold`` are "Positive", scores strictly below
            ``-threshold`` are "Negative". Defaults to 0.05.

    Returns:
        "Positive", "Negative" or "Neutral". A score exactly on either
        boundary is "Neutral", as is any score for which both comparisons are
        false (e.g. NaN).
    """
    if score > threshold:
        return "Positive"
    if score < -threshold:
        return "Negative"
    return "Neutral"

# Derive one label per row from its average score
sentiment_labels = df["Average Sentiment"].map(classify_sentiment)
df["Sentiment Label"] = sentiment_labels

# Write the labelled frame to its own CSV (index column omitted)
output_file = "final_sentiment_analysis_with_labels.csv"
df.to_csv(output_file, index=False)

print(f"Sentiment classification added and saved in '{output_file}'.")


Sentiment classification added and saved in 'final_sentiment_analysis_with_labels.csv'.


In [None]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

# Load dataset
input_file = "filtered_deepseek_english_data.csv"  # Update with your file name

# Drop rows with no text before scoring: the later astype(str) cast would
# otherwise turn a missing value into the literal string "nan" and score it.
# This also matches the three-model scoring cell, which drops the same rows.
df = pd.read_csv(input_file).dropna(subset=["text"])

# Initialize sentiment analyzer
vader_analyzer = SentimentIntensityAnalyzer()

def get_vader_score(text):
    """Score ``text`` with VADER and return the "compound" value."""
    all_scores = vader_analyzer.polarity_scores(text)
    compound = all_scores["compound"]
    return compound

def get_textblob_score(text):
    """Score ``text`` with TextBlob and return its sentiment polarity."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

# Score each text with both lexicon-based models (cast to str once, reuse for both)
texts = df["text"].astype(str)
df["VADER Score"] = texts.apply(get_vader_score)
df["TextBlob Score"] = texts.apply(get_textblob_score)

# Row-wise mean of the two scores
score_columns = ["VADER Score", "TextBlob Score"]
df["Average Sentiment"] = df[score_columns].mean(axis=1)

# Function to classify sentiment labels
def classify_sentiment(score):
    """Label ``score`` as "Positive" (> 0.05), "Negative" (< -0.05) or "Neutral" (otherwise)."""
    label = "Neutral"
    if score > 0.05:
        label = "Positive"
    elif score < -0.05:
        label = "Negative"
    return label

# Label every row from its VADER/TextBlob average
df["Sentiment Label"] = df["Average Sentiment"].map(classify_sentiment)

# Columns to export, in output order
export_columns = [
    "pseudo_id",
    "text",
    "VADER Score",
    "TextBlob Score",
    "Average Sentiment",
    "Sentiment Label",
]
df = df.loc[:, export_columns]

# Write the result to CSV without the index column
output_file = "sentiment_comparison.csv"
df.to_csv(output_file, index=False)

print(f"Sentiment analysis with VADER and TextBlob completed. Results saved to '{output_file}'.")


Sentiment analysis with VADER and TextBlob completed. Results saved to 'sentiment_comparison.csv'.


In [None]:
import pandas as pd

# Load the two CSV files
final_sentiment_file = "final_sentiment_analysis_with_labels.csv"
comparison_file = "sentiment_comparison.csv"

df_final = pd.read_csv(final_sentiment_file)
df_comparison = pd.read_csv(comparison_file)

# Merge datasets on 'pseudo_id'. Columns present in both files get the
# "_Final" (three-model) or "_V&T" (VADER + TextBlob) suffix.
# NOTE(review): this uses merge's default (inner) join and presumes each
# pseudo_id occurs once per file; if ids repeat, rows are paired in every
# combination. Confirm pseudo_id is unique in the input data.
merged_df = df_final.merge(df_comparison, on="pseudo_id", suffixes=("_Final", "_V&T"))

# Keep relevant columns and rename the retained text column for clarity
merged_df = merged_df[
    [
        "pseudo_id",
        "text_Final",
        "VADER Score_Final", "TextBlob Score_Final", "BERT Score", "Sentiment Label_Final",
        "VADER Score_V&T", "TextBlob Score_V&T", "Sentiment Label_V&T"
    ]
].rename(columns={"text_Final": "text"})

# Add column to compare sentiment labels. The comparison is done column-wise
# rather than with a row-wise apply, so it always yields one value per row.
labels_agree = merged_df["Sentiment Label_Final"].eq(merged_df["Sentiment Label_V&T"])
merged_df["Labels Match"] = labels_agree.map({True: "Match", False: "Different"})

# Save comparison results
output_file = "sentiment_comparison_with_labels_match.csv"
merged_df.to_csv(output_file, index=False)

print(f"Comparison completed and saved in '{output_file}'.")


Comparison completed and saved in 'sentiment_comparison_with_labels_match.csv'.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
from docx import Document
from docx.shared import Inches
import nltk
import os

# Download stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords

# --- Setup ---
report_file = "Sentiment_Analysis_Report.docx"
input_csv = "final_sentiment_analysis_with_labels.csv"
output_dir = "sentiment_report_visuals"
os.makedirs(output_dir, exist_ok=True)  # no error if the folder already exists

# Load the labelled results
df = pd.read_csv(input_csv)
total = df.shape[0]

# Tally rows per sentiment label; a label missing from the tally counts as 0
sentiment_counts = df["Sentiment Label"].value_counts()
positive_count, neutral_count, negative_count = (
    sentiment_counts.get(label, 0) for label in ("Positive", "Neutral", "Negative")
)

# Label with the highest row count
dominant_sentiment = sentiment_counts.idxmax()

# --- Generate Bar Chart ---
sentiment_order = ["Positive", "Neutral", "Negative"]
sns.set(style="whitegrid")
plt.figure(figsize=(8, 5))
# Passing `palette` without `hue` is deprecated in seaborn (see the warning in
# this cell's recorded output). Assign the x variable to `hue` and hide the
# redundant legend; `hue_order` keeps the colours in the same bar order.
sns.countplot(
    data=df,
    x="Sentiment Label",
    hue="Sentiment Label",
    order=sentiment_order,
    hue_order=sentiment_order,
    palette="pastel",
    legend=False,
)
plt.title("Sentiment Distribution (Bar Chart)")
plt.xlabel("Sentiment")
plt.ylabel("Count")
plt.tight_layout()
bar_chart_path = f"{output_dir}/sentiment_bar_chart.png"
plt.savefig(bar_chart_path)
plt.close()

# --- Generate Pie Chart ---
# Tie each colour to its sentiment label. The slices follow the order of
# `sentiment_counts`, so a positional colour list would put the first colour on
# whichever label comes first there, not necessarily "Positive".
sentiment_colors = {"Positive": "#8fd694", "Neutral": "#f7d794", "Negative": "#ff6b6b"}
colors = [sentiment_colors.get(label, "#cccccc") for label in sentiment_counts.index]  # grey for any unexpected label
plt.figure(figsize=(6, 6))
plt.pie(sentiment_counts, labels=sentiment_counts.index, autopct="%1.1f%%", startangle=140, colors=colors)
plt.axis("equal")
plt.title("Sentiment Distribution (Pie Chart)")
pie_chart_path = f"{output_dir}/sentiment_pie_chart.png"
plt.savefig(pie_chart_path)
plt.close()

# --- Generate Word Clouds ---
stop_words = set(stopwords.words("english"))

def generate_wordcloud(sentiment_label, color_map):
    """Render a word cloud for one sentiment class and save it as a PNG.

    Args:
        sentiment_label: Value of the "Sentiment Label" column to select rows by.
        color_map: Matplotlib colormap name passed to WordCloud.

    Returns:
        Path of the saved image inside ``output_dir``. If no word cloud can be
        built for the label (e.g. no rows carry it), a placeholder image with a
        short message is saved instead so callers always get a usable path.
    """
    # Concatenate every non-missing text carrying this label into one string
    text_data = " ".join(df[df["Sentiment Label"] == sentiment_label]["text"].dropna().astype(str))

    try:
        wordcloud = WordCloud(
            width=800,
            height=400,
            background_color="white",
            stopwords=STOPWORDS.union(stop_words),  # wordcloud's list plus NLTK's English list
            colormap=color_map
        ).generate(text_data)
    except ValueError:
        # WordCloud raises ValueError when there are no words to plot
        wordcloud = None

    plt.figure(figsize=(10, 5))
    if wordcloud is not None:
        plt.imshow(wordcloud, interpolation="bilinear")
    else:
        plt.text(0.5, 0.5, f"No words available for {sentiment_label}", ha="center", va="center")
    plt.axis("off")
    plt.title(f"Word Cloud - {sentiment_label}")
    cloud_path = f"{output_dir}/wordcloud_{sentiment_label.lower()}.png"
    plt.savefig(cloud_path)
    plt.close()
    return cloud_path

positive_wc = generate_wordcloud("Positive", "Greens")
neutral_wc = generate_wordcloud("Neutral", "Blues")
negative_wc = generate_wordcloud("Negative", "Reds")

# --- Create DOCX Report ---
doc = Document()
doc.add_heading("Sentiment Analysis Report", 0)

# Overview
doc.add_heading("Overview", level=1)
doc.add_paragraph(
    f"This report summarizes sentiment analysis results from the file '{input_csv}', "
    f"which includes {total} text entries. Sentiment labels were generated using VADER, "
    f"TextBlob, and BERT scores. The final label is based on the average sentiment score."
)

# Distribution Charts
doc.add_heading("Sentiment Distribution", level=1)
doc.add_picture(bar_chart_path, width=Inches(5.5))
doc.add_paragraph("Bar chart showing the count of Positive, Neutral, and Negative sentiments.")

doc.add_picture(pie_chart_path, width=Inches(5.5))
doc.add_paragraph("Pie chart showing sentiment proportions.")

# Word Clouds: a caption paragraph followed by the image, one pair per sentiment
doc.add_heading("Word Clouds", level=1)

for caption, image_path in (
    ("Word Cloud - Positive", positive_wc),
    ("Word Cloud - Neutral", neutral_wc),
    ("Word Cloud - Negative", negative_wc),
):
    doc.add_paragraph(caption)
    doc.add_picture(image_path, width=Inches(5.5))

# Observations
doc.add_heading("Key Observations", level=1)

# Markdown-style **...** is not interpreted in a Word document and would appear
# as literal asterisks, so the dominant label is emphasised with a bold run.
dominant_paragraph = doc.add_paragraph("• Most of the texts were classified as ")
dominant_paragraph.add_run(str(dominant_sentiment)).bold = True
dominant_paragraph.add_run(".")

doc.add_paragraph(f"• Positive: {positive_count}")
doc.add_paragraph(f"• Neutral: {neutral_count}")
doc.add_paragraph(f"• Negative: {negative_count}")
doc.add_paragraph("• The combination of three sentiment models ensures a more reliable classification.")

# Save Report
doc.save(report_file)
print(f"✅ DOCX report saved as '{report_file}'")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(data=df, x="Sentiment Label", order=["Positive", "Neutral", "Negative"], palette="pastel")


✅ DOCX report saved as 'Sentiment_Analysis_Report.docx'
