# Sentiment Analysis 

In [1]:
#%pip install openpyxl

In [2]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# =============================
# CONFIGURATION
# =============================
# Excel workbook that is read, and the CSV file the results are written to.
INPUT_FILE = "Sentiment_Analysis_Dataset.xlsx"
OUTPUT_FILE = "Sentiment_Analysis_Results.csv"

# Free-text survey questions to analyse (modify as needed).
# Each entry has to match a column header of the input file exactly.
QUESTION_COLS = [
    "How satisfied are you with your current role and responsibilities?",
    "How supported do you feel by your manager and team?",
    "How do you feel about your opportunities for career growth and learning?",
    "How was your overall work environment this month?",
    "Do you get enough breaks or downtime during work?",
]

# Cut-offs for the averaged sentiment score: strictly above the positive
# threshold is POSITIVE, strictly below the negative one is NEGATIVE,
# anything in between is NEUTRAL.
POSITIVE_THRESHOLD = 0.2
NEGATIVE_THRESHOLD = -0.2

# =============================
# HELPER FUNCTIONS
# =============================
def map_stars_to_sentiment(stars):
    """
    Translate a star rating into a (label, score) pair.

    Ratings of 2 or lower are "NEGATIVE", a rating of exactly 3 is
    "NEUTRAL", and everything else is "POSITIVE". The numeric score is
    (stars - 3) / 2, so 1 star maps to -1.0, 3 stars to 0.0 and 5 stars
    to 1.0.
    """
    if stars <= 2:
        label = "NEGATIVE"
    else:
        label = "NEUTRAL" if stars == 3 else "POSITIVE"

    return label, (stars - 3) / 2.0


def extract_stars_from_label(label):
    """
    Turn a classifier label into a star rating between 1 and 5.

    The first whitespace-separated token is parsed as an integer and
    clamped to the 1..5 range (e.g. "4 stars" -> 4). If that fails, the
    label is scanned for hints instead: a "5" or "pos" gives 5, a "1" or
    "neg" gives 1, and anything else falls back to the neutral 3.
    """
    try:
        leading_token = label.split()[0]
        parsed = int(leading_token)
    except Exception:
        lowered = label.lower()
        # The positive hint is checked first, so it wins if both match.
        if "5" in label or "pos" in lowered:
            return 5
        if "1" in label or "neg" in lowered:
            return 1
        return 3

    return min(5, max(1, parsed))


def determine_overall_sentiment(avg_score):
    """
    Classify an averaged sentiment score against the module thresholds.

    Scores strictly greater than POSITIVE_THRESHOLD are "POSITIVE",
    scores strictly less than NEGATIVE_THRESHOLD are "NEGATIVE", and
    everything else (including a score exactly on a threshold) is
    "NEUTRAL".
    """
    if avg_score > POSITIVE_THRESHOLD:
        return "POSITIVE"
    if avg_score < NEGATIVE_THRESHOLD:
        return "NEGATIVE"
    return "NEUTRAL"


# =============================
# MAIN ANALYSIS FUNCTION
# =============================
def _classify_answer(classifier, answer):
    """Return (sentiment label, sentiment score, model confidence) for one answer."""
    # Only the first 1000 characters of the answer are sent to the model.
    result = classifier(answer[:1000])[0]
    label = result['label']
    confidence = result['score']
    sentiment_text, sentiment_score = map_stars_to_sentiment(
        extract_stars_from_label(label)
    )
    return sentiment_text, sentiment_score, confidence


def _unique_column_prefix(question, taken):
    """Return a column-name prefix for *question* that is not yet in *taken*."""
    base = question[:30].replace(" ", "_").replace("?", "")
    prefix = base
    n = 2
    # Questions sharing their first 30 characters would otherwise silently
    # overwrite each other's result columns; later ones get a numeric suffix.
    while prefix in taken:
        prefix = f"{base}_{n}"
        n += 1
    taken.add(prefix)
    return prefix


def analyze_employee_sentiments(df, classifier, question_cols):
    """
    Analyze sentiment for each employee across all questions.

    Args:
        df: DataFrame holding every column named in ``question_cols``.
            An 'Employee_id' column, if present, is used only to label
            error messages. The input frame is left unmodified.
        classifier: Callable that takes a string and returns a sequence
            whose first item is a mapping with 'label' and 'score' keys.
        question_cols: Names of the free-text answer columns to analyse.

    Returns:
        A new DataFrame with the input columns, followed by
        ``<prefix>_sentiment``, ``<prefix>_score`` and
        ``<prefix>_confidence`` for each question (prefix = first 30
        characters of the question with spaces replaced by underscores
        and '?' removed), and finally 'Overall_Sentiment',
        'Avg_Sentiment_Score' and 'Avg_Confidence_Score'.

    Blank answers (empty, "nan" or "none", case-insensitive) and answers
    whose classification raises are recorded as None and excluded from
    the averages. An employee with no usable answer is reported as
    "NEUTRAL" with both averages set to 0.0.
    """
    # Work on a copy so adding result columns is not a side effect on the
    # caller's DataFrame.
    df = df.copy()

    final_sentiments = []
    avg_sentiment_scores = []
    avg_confidence_scores = []

    detailed_results = {col: {'sentiment': [], 'score': [], 'confidence': []}
                       for col in question_cols}

    print("\n" + "="*60)
    print("ANALYZING EMPLOYEE SENTIMENTS")
    print("="*60)

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing employees"):
        sentiment_scores = []
        model_confidences = []

        for col in question_cols:
            # Stays all-None for blank answers and failed classifications so
            # every per-question list gains exactly one entry per row.
            outcome = (None, None, None)

            answer = str(row[col]).strip()
            if answer and answer.lower() not in ("nan", "none"):
                try:
                    outcome = _classify_answer(classifier, answer)
                except Exception as e:
                    # Best effort: one bad answer must not abort the whole run.
                    print(f"\nError processing Employee {row.get('Employee_id', idx)} - {col}: {e}")
                else:
                    sentiment_scores.append(outcome[1])
                    model_confidences.append(outcome[2])

            for key, value in zip(('sentiment', 'score', 'confidence'), outcome):
                detailed_results[col][key].append(value)

        if sentiment_scores:
            avg_sentiment = sum(sentiment_scores) / len(sentiment_scores)
            avg_confidence = sum(model_confidences) / len(model_confidences)
            overall_sentiment = determine_overall_sentiment(avg_sentiment)
        else:
            avg_sentiment = 0.0
            avg_confidence = 0.0
            overall_sentiment = "NEUTRAL"

        final_sentiments.append(overall_sentiment)
        avg_sentiment_scores.append(round(avg_sentiment, 4))
        avg_confidence_scores.append(round(avg_confidence, 4))

    # Add results to dataframe
    df["Overall_Sentiment"] = final_sentiments
    df["Avg_Sentiment_Score"] = avg_sentiment_scores
    df["Avg_Confidence_Score"] = avg_confidence_scores

    taken_prefixes = set()
    for col in question_cols:
        short_name = _unique_column_prefix(col, taken_prefixes)
        df[f"{short_name}_sentiment"] = detailed_results[col]['sentiment']
        df[f"{short_name}_score"] = detailed_results[col]['score']
        df[f"{short_name}_confidence"] = detailed_results[col]['confidence']

    # Reorder columns to keep the final three columns at the end
    summary_cols = ["Overall_Sentiment", "Avg_Sentiment_Score", "Avg_Confidence_Score"]
    cols = [c for c in df.columns if c not in summary_cols]
    df = df[cols + summary_cols]

    return df


# =============================
# MAIN EXECUTION
# =============================
def main():
    """
    Run the full sentiment analysis: load, classify, save and summarise.

    Reads INPUT_FILE, checks that every column in QUESTION_COLS exists,
    loads the sentiment model, runs analyze_employee_sentiments, writes
    the result to OUTPUT_FILE as CSV and prints summary statistics.
    A failure while loading the data, validating the columns, loading
    the model or saving the file is printed and ends the run early.
    """
    print("\n" + "="*60)
    print("EMPLOYEE SENTIMENT ANALYSIS - BERT MODEL")
    print("="*60 + "\n")

    print(f"Loading dataset from '{INPUT_FILE}'...")
    try:
        df = pd.read_excel(INPUT_FILE, engine="openpyxl")
    except Exception as e:
        print(f"Error loading file: {e}")
        return
    print(f"Loaded {len(df)} employee records\n")

    print("Verifying question columns...")
    missing_cols = [col for col in QUESTION_COLS if col not in df.columns]
    if missing_cols:
        print(f"Missing columns: {missing_cols}")
        print(f"Available columns: {list(df.columns)}")
        return
    print("All question columns found\n")

    print("Loading BERT sentiment model (nlptown/bert-base-multilingual-uncased-sentiment)...")
    try:
        # device=-1 selects the CPU.
        classifier = pipeline(
            "sentiment-analysis",
            model="nlptown/bert-base-multilingual-uncased-sentiment",
            device=-1
        )
    except Exception as e:
        print(f"Error loading model: {e}")
        return
    print("Model loaded successfully\n")

    df_results = analyze_employee_sentiments(df, classifier, QUESTION_COLS)

    print(f"Saving results to '{OUTPUT_FILE}'...")
    try:
        df_results.to_csv(OUTPUT_FILE, index=False)
    except Exception as e:
        print(f"Error saving file: {e}")
        return
    print("Results saved successfully!\n")

    print("="*60)
    print("SUMMARY STATISTICS")
    print("="*60)
    sentiment_counts = df_results['Overall_Sentiment'].value_counts()
    print("\nOverall Sentiment Distribution:")
    for sentiment, count in sentiment_counts.items():
        percentage = (count / len(df_results)) * 100
        print(f"   {sentiment:12s}: {count:3d} employees ({percentage:5.1f}%)")

    print(f"\nAverage Sentiment Score: {df_results['Avg_Sentiment_Score'].mean():.4f} (range: -1 to +1)")
    print(f"Average Confidence Score: {df_results['Avg_Confidence_Score'].mean():.4f} (range: 0 to 1)")

    print("\nPreview of Results:")
    preview_cols = ['Overall_Sentiment', 'Avg_Sentiment_Score', 'Avg_Confidence_Score']
    # analyze_employee_sentiments falls back to the row index when there is no
    # 'Employee_id' column, so its absence must not crash the preview after
    # the results have already been saved.
    if 'Employee_id' in df_results.columns:
        preview_cols.insert(0, 'Employee_id')
    print(df_results[preview_cols].head(10))

    print("\n" + "="*60)
    print("ANALYSIS COMPLETED SUCCESSFULLY")
    print("="*60 + "\n")


# =============================
# RUN PROGRAM
# =============================
# Run the analysis only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()



EMPLOYEE SENTIMENT ANALYSIS - BERT MODEL

Loading dataset from 'Sentiment_Analysis_Dataset.xlsx'...
Loaded 93 employee records

Verifying question columns...
All question columns found

Loading BERT sentiment model (nlptown/bert-base-multilingual-uncased-sentiment)...


Device set to use cpu


Model loaded successfully


ANALYZING EMPLOYEE SENTIMENTS


Processing employees: 100%|██████████| 93/93 [02:40<00:00,  1.73s/it]

Saving results to 'Sentiment_Analysis_Results.csv'...
Results saved successfully!

SUMMARY STATISTICS

Overall Sentiment Distribution:
   NEUTRAL     :  52 employees ( 55.9%)
   POSITIVE    :  30 employees ( 32.3%)
   NEGATIVE    :  11 employees ( 11.8%)

Average Sentiment Score: 0.1215 (range: -1 to +1)
Average Confidence Score: 0.6388 (range: 0 to 1)

Preview of Results:
   Employee_id Overall_Sentiment  Avg_Sentiment_Score  Avg_Confidence_Score
0            3          POSITIVE                  0.4                0.7543
1           15          POSITIVE                  0.3                0.6320
2           34           NEUTRAL                  0.1                0.6297
3           46          POSITIVE                  0.3                0.6717
4           51           NEUTRAL                 -0.2                0.6473
5           52          POSITIVE                  0.6                0.6816
6           90          POSITIVE                  0.5                0.6865
7          101  




In [3]:
# Interactive variant (currently commented out): prompts for an answer to each question and runs BERT sentiment analysis on each employee's feedback.

# from transformers import pipeline

# # Questions
# # ----------------------------- 
# QUESTIONS = [
#     "1) How satisfied are you with your current role and responsibilities?",
#     "2) How do you feel about the support and communication from your manager/team leads?",
#     "3) How satisfied are you with opportunities for career growth and learning here?",
#     "4) How would you describe the overall work environment this month?",
#     "5) Do you feel you have enough breaks or downtime during work?"
# ]

# # Input function
# # -----------------------------
# def read_multiline_input(prompt):
#     """
#     Read either single-line or multi-line input.
#     Double Enter ends input for paragraphs, single Enter works for short answers.
#     """
#     print(prompt + "\n(Type your answer. Press Enter twice to finish a paragraph, or Enter once if short.)")
#     lines = []
#     while True:
#         try:
#             line = input()
#         except EOFError:
#             break
#         # empty line immediately after prompt -> no input
#         if not line and not lines:
#             return ""
#         # empty line after text ends paragraph
#         if not line and lines:
#             break
#         lines.append(line)
#     return " ".join(lines).strip()

# # Map BERT stars to sentiment
# # -----------------------------
# def map_stars_to_sentiment(stars):
#     """
#     Map 1-5 stars to sentiment:
#         1-2 -> NEGATIVE
#         3   -> NEUTRAL
#         4-5 -> POSITIVE
#     Also compute numeric score in [-1, +1] where -1=most negative, 0=neutral, +1=most positive
#     """
#     if stars <= 2:
#         sentiment = "NEGATIVE"
#     elif stars == 3:
#         sentiment = "NEUTRAL"
#     else:
#         sentiment = "POSITIVE"
#     score = (stars - 3) / 2.0  # -1 .. +1
#     return sentiment, score

# # Main program
# # -----------------------------
# def main():
#     print("Loading sentiment model (BERT)... this may take a moment.")
#     classifier = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment")

#     answers = []
#     for q in QUESTIONS:
#         ans = read_multiline_input(q)
#         if ans == "":
#             print("No input recorded for this question. Saving as empty string.\n")
#         answers.append(ans)

#     print("\n\nPredicting sentiments...\n")
#     results = []

#     for i, ans in enumerate(answers):
#         if ans.strip() == "":
#             print(f"Q{i+1}: No answer provided — skipping model inference.\n")
#             results.append({"label": None, "score": None, "raw": None})
#             continue

#         raw = classifier(ans[:1000])  # first 1000 chars for safety
#         # raw example: [{'label': '4 stars', 'score': 0.635}]
#         if not raw or not isinstance(raw, list):
#             print(f"Q{i+1}: Model returned unexpected output: {raw}")
#             results.append({"label": None, "score": None, "raw": raw})
#             continue

#         item = raw[0]
#         label = item.get("label", "")
#         try:
#             stars = int(label.split()[0])
#         except Exception:
#             lbl = label.upper()
#             if "POS" in lbl:
#                 stars = 5
#             elif "NEG" in lbl:
#                 stars = 1
#             else:
#                 stars = 3

#         sentiment_text, numeric_score = map_stars_to_sentiment(stars)

#         print(f"Q{i+1}: {QUESTIONS[i]}")
#         print(f"Answer: {ans}\n")
#         print(f"Model raw label: {label}, model confidence: {item.get('score'):.3f}")
#         print(f"Mapped sentiment: {sentiment_text}, Sentiment score (normalized -1..+1): {numeric_score:.3f}")
#         print("-" * 60 + "\n")

#         results.append({
#             "label": sentiment_text,
#             "score": numeric_score,
#             "raw": item
#         })

#     # Average sentiment
#     # -----------------------------
#     numeric_list = [r["score"] for r in results if r["score"] is not None]
#     if numeric_list:
#         avg = sum(numeric_list) / len(numeric_list)
#         print(f"Average sentiment score across answers: {avg:.3f}  (range -1 negative .. +1 positive)")
#     else:
#         print("No numeric sentiment scores to average.")

#     # Quick rule-of-thumb for employee retention
#     threshold = 0.2
#     if numeric_list:
#         if avg > threshold:
#             print("Quick rule-of-thumb: overall sentiment is positive → employee likely to stay.")
#         elif avg < -threshold:
#             print("Quick rule-of-thumb: overall sentiment is negative → employee may be likely to leave.")
#         else:
#             print("Quick rule-of-thumb: overall sentiment is neutral/uncertain.")
#     print("\nDone.")

# # -----------------------------
# if __name__ == "__main__":
#     main()
