# Sentiment Analysis Report

This notebook analyzes survey responses using a state-of-the-art RoBERTa-based sentiment analysis model.

## Instructions
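1. Place your survey data in a CSV file whose response text is in a column named 'text', 'response', 'comment', 'feedback', or 'answer' (if none of these exists, the notebook will prompt you for the column name)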
2. Update the `DATA_FILE` variable below with your file path
3. Run all cells to generate the sentiment analysis report


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
import torch
from tqdm import tqdm
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation notices from the libraries imported above.
# Consider narrowing the filter when debugging unexpected behaviour.
warnings.filterwarnings('ignore')

# Set style for better visualizations
# Seaborn "whitegrid" theme plus a (12, 6) default figure size; cells that
# pass their own figsize to plt.subplots() are not affected by the default.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Reaching this line means every import above resolved.
print("Libraries imported successfully!")


In [None]:
# Configuration
# Path to the CSV file holding the survey responses -- edit before running.
DATA_FILE = 'survey_data.csv'
# Hugging Face model identifier handed to the pipeline loader.
MODEL_NAME = 'cardiffnlp/twitter-roberta-base-sentiment-latest'

# Device index for the pipeline: 0 when CUDA is available, otherwise -1 (CPU).
if torch.cuda.is_available():
    device = 0
else:
    device = -1

device_description = 'GPU (CUDA)' if device == 0 else 'CPU'
print(f"Using device: {device_description}")


In [None]:
# Build the sentiment pipeline once; the analysis cells below reuse it.
print("Loading sentiment analysis model...")

# The same identifier is used for both the model weights and the tokenizer.
# return_all_scores=True asks for the score of every class, so callers get a
# list of per-class dicts for each input and pick the best one themselves.
# NOTE(review): newer transformers releases reportedly deprecate
# return_all_scores in favour of top_k=None -- confirm before upgrading, since
# the cells below index the result as result[0].
pipeline_options = dict(
    model=MODEL_NAME,
    tokenizer=MODEL_NAME,
    device=device,
    return_all_scores=True,
)
sentiment_pipeline = pipeline("sentiment-analysis", **pipeline_options)

print("Model loaded successfully!")


In [None]:
# Load survey data
# Reads DATA_FILE into `df`. When the file is missing, setup guidance is
# printed and the error is re-raised so that "Run All" stops at this cell
# instead of failing later with a NameError on the undefined `df`.
try:
    df = pd.read_csv(DATA_FILE)
except FileNotFoundError:
    print(f"Error: File '{DATA_FILE}' not found.")
    print("Please update the DATA_FILE variable with the correct path to your survey data.")
    print("\nExpected CSV format:")
    print("- Column named 'text' or 'response' containing the survey responses")
    print("- Additional columns (optional): date, respondent_id, question_id, etc.")
    raise
else:
    # Only reached when the read succeeded; kept outside the try body so an
    # unrelated failure while displaying is not mistaken for a missing file.
    print(f"Data loaded successfully! Found {len(df)} responses.")
    print(f"\nColumns in dataset: {list(df.columns)}")
    print("\nFirst few rows:")
    display(df.head())


In [None]:
# Identify the text column
# The first of the well-known column names present in the data wins; if none
# is present the user is asked for the column name.
text_column = None
for col in ['text', 'response', 'comment', 'feedback', 'answer']:
    if col in df.columns:
        text_column = col
        break

if text_column is None:
    print("Available columns:", list(df.columns))
    text_column = input("Enter the name of the column containing text responses: ").strip()

# A mistyped name would otherwise surface as a bare KeyError further down.
if text_column not in df.columns:
    raise KeyError(
        f"Column '{text_column}' not found in the data. Available columns: {list(df.columns)}"
    )

print(f"Using column '{text_column}' for sentiment analysis")

# Clean the data - remove empty responses
# The blank check goes through astype(str) so a column that pandas parsed as
# numeric (or that is entirely NaN) does not make the .str accessor raise.
# Missing values are still dropped by the notna() term.
responses = df[text_column]
has_content = responses.notna() & (responses.astype(str).str.strip() != '')
df_clean = df[has_content].copy()
print(f"\nCleaned data: {len(df_clean)} valid responses (removed {len(df) - len(df_clean)} empty responses)")


In [None]:
# Perform sentiment analysis
# Scores every cleaned response and records the top label and its confidence
# in two lists that are attached to df_clean as new columns.
print("Analyzing sentiment for all responses...")
print("This may take a few minutes depending on the number of responses...")

# Upper bound on tokens sent to the model. Truncation is delegated to the
# tokenizer: slicing the raw string by characters both discards text the
# model could still read and does not guarantee the token limit, because a
# single character may encode to several tokens.
# NOTE(review): 512 is the usual limit for RoBERTa-base encoders -- confirm
# against the checkpoint's configuration.
MAX_INPUT_TOKENS = 512

sentiments = []
scores = []

for text in tqdm(df_clean[text_column], desc="Processing responses"):
    try:
        # str() lets non-string cells (e.g. numbers) be scored instead of
        # always landing in the error branch.
        result = sentiment_pipeline(str(text), truncation=True, max_length=MAX_INPUT_TOKENS)

        # Accept both output shapes for a single input: a nested list
        # [[{label, score}, ...]] or a flat list [{label, score}, ...].
        class_scores = result[0] if isinstance(result[0], list) else result

        # Extract the highest confidence sentiment
        best_sentiment = max(class_scores, key=lambda x: x['score'])
        sentiments.append(best_sentiment['label'])
        scores.append(best_sentiment['score'])
    except Exception as e:
        # Best effort: one bad row must not abort the run. The row is kept and
        # marked so it stays visible in the results.
        print(f"\nError processing text: {str(e)}")
        sentiments.append('ERROR')
        scores.append(0.0)

# Add results to dataframe
df_clean['sentiment'] = sentiments
df_clean['sentiment_score'] = scores

print("\nSentiment analysis complete!")


In [None]:
# Map sentiment labels to standard format (POSITIVE, NEGATIVE, NEUTRAL)
# Depending on the checkpoint's configuration the pipeline emits either
# generic ids (LABEL_0 / LABEL_1 / LABEL_2) or readable names such as
# 'negative' / 'neutral' / 'positive'. Every later cell compares against the
# upper-case names, so both forms are normalised here.

# Get a sample to understand the label format
sample_result = sentiment_pipeline("I love this product!")
sample_scores = sample_result[0] if isinstance(sample_result[0], list) else sample_result
print("Sample sentiment labels:")
for item in sample_scores:
    print(f"  {item['label']}: {item['score']:.4f}")

# Mapping for checkpoints that emit generic ids:
# LABEL_0 (negative), LABEL_1 (neutral), LABEL_2 (positive)
label_mapping = {
    'LABEL_0': 'NEGATIVE',
    'LABEL_1': 'NEUTRAL',
    'LABEL_2': 'POSITIVE'
}

# Apply mapping
# Upper-casing first means readable names ('negative', 'Positive', ...) end up
# as NEGATIVE / POSITIVE even though they are not keys of label_mapping;
# anything unmapped (including the 'ERROR' marker) is kept as-is.
normalized_labels = df_clean['sentiment'].astype(str).str.strip().str.upper()
df_clean['sentiment_label'] = normalized_labels.map(label_mapping).fillna(normalized_labels)

print("\nSentiment distribution:")
print(df_clean['sentiment_label'].value_counts())


In [None]:
# Summary Statistics
# Prints the label distribution, mean/median confidence and the
# positive-to-negative ratio for the analysed responses.
banner = "=" * 60
total_responses = len(df_clean)
label_series = df_clean['sentiment_label']
score_series = df_clean['sentiment_score']

print(banner)
print("SENTIMENT ANALYSIS SUMMARY")
print(banner)
print(f"Total Responses Analyzed: {total_responses}")
print("\nSentiment Distribution:")
sentiment_counts = label_series.value_counts()
for label, n_rows in sentiment_counts.items():
    share = (n_rows / total_responses) * 100
    print(f"  {label}: {n_rows} ({share:.2f}%)")

print(f"\nAverage Sentiment Score: {score_series.mean():.4f}")
print(f"Median Sentiment Score: {score_series.median():.4f}")

# Per-category totals; these names are read again by the report cell.
positive_count = int((label_series == 'POSITIVE').sum())
negative_count = int((label_series == 'NEGATIVE').sum())
neutral_count = int((label_series == 'NEUTRAL').sum())

# The ratio is undefined without negatives, so it is only reported when
# at least one negative response exists.
if negative_count > 0:
    pos_neg_ratio = positive_count / negative_count
    print(f"\nPositive to Negative Ratio: {pos_neg_ratio:.2f}")

print(banner)


In [None]:
# Visualization 1: Sentiment Distribution Pie Chart
# Left panel: share of each label. Right panel: absolute counts.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
pie_ax, bar_ax = axes

sentiment_counts = df_clean['sentiment_label'].value_counts()

# Fixed colour per standard label; any other label falls back to blue.
colors = {'POSITIVE': '#2ecc71', 'NEGATIVE': '#e74c3c', 'NEUTRAL': '#95a5a6'}
pie_colors = []
for sent in sentiment_counts.index:
    pie_colors.append(colors.get(sent, '#3498db'))

# Pie chart
pie_ax.pie(
    sentiment_counts.values,
    labels=sentiment_counts.index,
    autopct='%1.1f%%',
    colors=pie_colors,
    startangle=90,
)
pie_ax.set_title('Sentiment Distribution', fontsize=14, fontweight='bold')

# Bar chart (same colour order as the pie)
bar_ax.bar(sentiment_counts.index, sentiment_counts.values, color=pie_colors)
bar_ax.set_title('Sentiment Counts', fontsize=14, fontweight='bold')
bar_ax.set_ylabel('Number of Responses')
bar_ax.set_xlabel('Sentiment')

plt.tight_layout()
plt.show()


In [None]:
# Visualization 2: Sentiment Score Distribution
# Left panel: histogram of the top-label confidence with its mean marked.
# Right panel: confidence spread per standard sentiment label.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Histogram
confidence = df_clean['sentiment_score']
mean_confidence = confidence.mean()
axes[0].hist(confidence, bins=30, edgecolor='black', alpha=0.7)
axes[0].set_title('Distribution of Sentiment Confidence Scores', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Confidence Score')
axes[0].set_ylabel('Frequency')
axes[0].axvline(mean_confidence, color='red', linestyle='--',
                label=f'Mean: {mean_confidence:.3f}')
axes[0].legend()

# Box plot by sentiment
# Only labels that actually occur are plotted, in a fixed display order.
sentiment_order = ['POSITIVE', 'NEUTRAL', 'NEGATIVE']
sentiment_order = [s for s in sentiment_order if s in df_clean['sentiment_label'].values]
df_clean_sorted = df_clean[df_clean['sentiment_label'].isin(sentiment_order)]

box_data = [df_clean_sorted[df_clean_sorted['sentiment_label'] == sent]['sentiment_score'].values
            for sent in sentiment_order]
if box_data:
    # Tick labels are set on the axis rather than through boxplot's `labels`
    # keyword, which newer matplotlib releases deprecate.
    axes[1].boxplot(box_data)
    axes[1].set_xticklabels(sentiment_order)
else:
    # With no standard label present (e.g. every row is ERROR) boxplot would
    # raise on the empty input, so show a placeholder message instead.
    axes[1].text(0.5, 0.5, 'No POSITIVE / NEUTRAL / NEGATIVE responses to plot',
                 ha='center', va='center', transform=axes[1].transAxes)
axes[1].set_title('Sentiment Score Distribution by Category', fontsize=14, fontweight='bold')
axes[1].set_ylabel('Confidence Score')
axes[1].set_xlabel('Sentiment')

plt.tight_layout()
plt.show()


In [None]:
# Sample responses by sentiment
# For each standard label that occurs, prints the three responses with the
# highest confidence, shortened to a 100-character preview.
print("=" * 60)
print("SAMPLE RESPONSES BY SENTIMENT")
print("=" * 60)

for sentiment in ['POSITIVE', 'NEUTRAL', 'NEGATIVE']:
    if sentiment not in df_clean['sentiment_label'].values:
        continue

    print(f"\n{sentiment} Responses (Top 3 by confidence):")
    matches_label = df_clean['sentiment_label'] == sentiment
    top_rows = df_clean[matches_label].nlargest(3, 'sentiment_score')

    for response, confidence in zip(top_rows[text_column], top_rows['sentiment_score']):
        full_text = str(response)
        # An ellipsis is appended only when the text was actually cut.
        if len(full_text) > 100:
            text_preview = full_text[:100] + "..."
        else:
            text_preview = full_text
        print(f"  [{confidence:.3f}] {text_preview}")


In [None]:
# Export results to CSV
# The file name carries today's date (YYYYMMDD); the index is not written.
date_stamp = pd.Timestamp.now().strftime("%Y%m%d")
output_file = 'sentiment_analysis_results_' + date_stamp + '.csv'
df_clean.to_csv(output_file, index=False)

exported_rows = len(df_clean)
print(f"Results exported to: {output_file}")
print(f"\nFile contains {exported_rows} rows with sentiment analysis results.")


In [None]:
# Generate a summary report text file
# Writes the same headline figures as the on-screen summary to a dated .txt
# file. All figures are derived from df_clean inside this cell, so the report
# cannot pick up stale counts (or hit a NameError) when the summary cell was
# skipped or df_clean changed since it ran.
generated_at = pd.Timestamp.now()  # single timestamp for file name and body
report_file = f'sentiment_report_{generated_at.strftime("%Y%m%d")}.txt'

sentiment_labels = df_clean['sentiment_label']
sentiment_counts = sentiment_labels.value_counts()
positive_count = int((sentiment_labels == 'POSITIVE').sum())
negative_count = int((sentiment_labels == 'NEGATIVE').sum())

# Explicit encoding keeps the output independent of the platform default.
with open(report_file, 'w', encoding='utf-8') as f:
    f.write("SENTIMENT ANALYSIS REPORT\n")
    f.write("=" * 60 + "\n\n")
    f.write(f"Report Generated: {generated_at.strftime('%Y-%m-%d %H:%M:%S')}\n\n")
    f.write(f"Total Responses Analyzed: {len(df_clean)}\n\n")
    f.write("Sentiment Distribution:\n")
    for sentiment, count in sentiment_counts.items():
        percentage = (count / len(df_clean)) * 100
        f.write(f"  {sentiment}: {count} ({percentage:.2f}%)\n")
    f.write(f"\nAverage Sentiment Score: {df_clean['sentiment_score'].mean():.4f}\n")
    f.write(f"Median Sentiment Score: {df_clean['sentiment_score'].median():.4f}\n")

    # The ratio is only written when there is at least one negative response.
    if negative_count > 0:
        pos_neg_ratio = positive_count / negative_count
        f.write(f"\nPositive to Negative Ratio: {pos_neg_ratio:.2f}\n")

print(f"Summary report saved to: {report_file}")
