# Employee Sentiment Analysis Project

# --------------------------
# 1. Imports and Setup
# --------------------------

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from transformers import pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import os
from transformers import pipeline

# --------------------------
# 📂 2. Load Data
# --------------------------

In [54]:
df = pd.read_csv("datasets/test.csv")  # Ensure file is in correct path
print("Data loaded successfully.")
df['employee'] = df['from'].apply(lambda x: ' '.join(x.split('@')[0].split('.')))
df.head()

Data loaded successfully.


Unnamed: 0,Subject,body,date,from,employee
0,EnronOptions Update!,EnronOptions Announcement\n\n\nWe have updated...,5/10/2010,sally.beck@enron.com,sally beck
1,(No Subject),"Marc,\n\nUnfortunately, today is not going to ...",7/29/2010,eric.bass@enron.com,eric bass
2,Phone Screen Interview - Shannon L. Burnham,"When: Wednesday, June 06, 2001 10:00 AM-11:00 ...",7/25/2011,sally.beck@enron.com,sally beck
3,RE: My new work email,we were thinking papasitos (we can meet somewh...,3/25/2010,johnny.palmer@enron.com,johnny palmer
4,Bet,Since you never gave me the $20 for the last t...,5/21/2011,lydia.delgado@enron.com,lydia delgado


In [55]:
df.tail(5)

Unnamed: 0,Subject,body,date,from,employee
2186,Re: Resume,Thanks for the resume. She has had some good ...,6/17/2011,johnny.palmer@enron.com,johnny palmer
2187,"Final Schedule - Wednesday, May 2, 2001 - Jesu...",Attached please find the following documents:\...,1/20/2011,johnny.palmer@enron.com,johnny palmer
2188,(No Subject),Good to finally hear from. Judging from your ...,1/2/2011,don.baughman@enron.com,don baughman
2189,League is Set,It looks like we have our 12 teams. We will p...,3/11/2011,rhonda.denton@enron.com,rhonda denton
2190,AirCard Activation,"We will need this, so I am sending it to you a...",10/30/2010,johnny.palmer@enron.com,johnny palmer


# --------------------------
# EDA - Structure
# --------------------------

In [56]:
print(df.info())
print(df.isnull().sum())
df['timestamp'] = pd.to_datetime(df['date'])
df['message_length'] = df['Subject'].apply(len)
df['word_count'] = df['Subject'].apply(lambda x: len(x.split()))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2191 entries, 0 to 2190
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Subject   2191 non-null   object
 1   body      2191 non-null   object
 2   date      2191 non-null   object
 3   from      2191 non-null   object
 4   employee  2191 non-null   object
dtypes: object(5)
memory usage: 85.7+ KB
None
Subject     0
body        0
date        0
from        0
employee    0
dtype: int64


# --------------------------
# Task 1 - Sentiment Labeling
# --------------------------

In [57]:
classifier = pipeline("sentiment-analysis",framework = "pt")


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


In [58]:
# Helper function to map result to label
def get_sentiment(text):
    result = classifier(text[:512])[0]  # limit input size
    if result['label'] == 'POSITIVE':
        return 1
    elif result['label'] == 'NEGATIVE':
        return -1
    else:
        return 0

In [59]:
df['sentiment'] = df['Subject'].apply(get_sentiment)
df.to_csv("datasets/labeled_data.csv", index=False)

# --------------------------
# Task 2 - EDA Visualization
# --------------------------

In [60]:
sns.countplot(data=df, x='sentiment')
plt.title("Sentiment Distribution")
plt.savefig("datasets/sentiment_distribution.png")
plt.clf()

<Figure size 640x480 with 0 Axes>

In [61]:
sentiment_over_time = df.groupby(df['timestamp'].dt.to_period('M'))['sentiment'].mean()
sentiment_over_time.plot(kind='line')
plt.title("Average Sentiment Over Time")
plt.savefig("datasets/sentiment_over_time.png")
plt.clf()

<Figure size 640x480 with 0 Axes>

# --------------------------
# Task 3 - Employee Monthly Sentiment Score
# --------------------------

In [62]:
df['month'] = df['timestamp'].dt.to_period('M')
df['score'] = df['sentiment']  # already 1, 0, -1
monthly_scores = df.groupby(['employee', 'month'])['score'].sum().reset_index()
monthly_scores.to_csv("datasets/monthly_scores.csv", index=False)

In [63]:
rankings = []
for period, group in monthly_scores.groupby('month'):
    top = group.sort_values(by=['score', 'employee'], ascending=[False, True]).head(3)
    bottom = group.sort_values(by=['score', 'employee']).head(3)
    rankings.append((str(period), top, bottom))

# Output example ranking
print("Top 3 Positive for a sample month:")
print(rankings[0][1])

Top 3 Positive for a sample month:
          employee    month  score
144  lydia delgado  2010-01      5
192  rhonda denton  2010-01      1
0    bobette riner  2010-01      0


# --------------------------
# Task 5 - Flight Risk Identification
# --------------------------

In [64]:
negative_msgs = df[df['sentiment'] == -1].copy()
negative_msgs.sort_values(['employee', 'timestamp'], inplace=True)

In [65]:
from datetime import timedelta
def identify_risks(df):
    flagged = set()
    for eid, group in df.groupby('employee'):
        times = list(group['timestamp'])
        for i in range(len(times)-3):
            if (times[i+3] - times[i]) <= timedelta(days=30):
                flagged.add(eid)
                break
    return list(flagged)

In [66]:
flight_risks = identify_risks(negative_msgs)
print("Flight Risk Employees:", flight_risks)

Flight Risk Employees: ['john arnold', 'patti thompson', 'eric bass', 'kayne coulter', 'rhonda denton', 'lydia delgado', 'johnny palmer', 'don baughman', 'bobette riner', 'sally beck']


# --------------------------
# Task 6 - Predictive Modeling
# --------------------------

In [67]:
features = df.groupby(by=['from', 'month']).agg({
    'Subject': 'count',
    'message_length': 'mean',
    'word_count': 'sum',
    'score': 'sum'
}).rename(columns={
    'Subject': 'msg_count',
    'message_length': 'avg_msg_length',
    'word_count': 'total_word_count',
    'score': 'sentiment_score'
}).reset_index()

In [68]:
X = features[['msg_count', 'avg_msg_length', 'total_word_count']]
y = features['sentiment_score']

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
pred = model.predict(X_test)
mae = mean_absolute_error(y_test, pred)
print(f"MAE of sentiment prediction model: {mae:.2f}")

MAE of sentiment prediction model: 2.58


# --------------------------
# 10. Summary 
# --------------------------

In [70]:
print("Top 3 Positive Employees in First Month:")
print(rankings[0][1]['employee'].values)

Top 3 Positive Employees in First Month:
['lydia delgado' 'rhonda denton' 'bobette riner']


In [72]:
print("Top 3 Negative Employees in First Month:")
print(rankings[0][2]['employee'].values)

Top 3 Negative Employees in First Month:
['kayne coulter' 'johnny palmer' 'don baughman']


In [73]:
print("Flight Risk Employees:")
print(flight_risks)

Flight Risk Employees:
['john arnold', 'patti thompson', 'eric bass', 'kayne coulter', 'rhonda denton', 'lydia delgado', 'johnny palmer', 'don baughman', 'bobette riner', 'sally beck']
