<a href="https://colab.research.google.com/github/shreyanaarla/ShreyaNaarla_AI-project-submission/blob/main/ShreyaNaarla_AI_project_submission.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Task 1

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the dataset from 'test.csv'; the path is relative, so the file must sit in
# the notebook's current working directory.
data = pd.read_csv('test.csv')

In [None]:
data.head()

In [None]:
!pip install transformers torch
from transformers import pipeline

# Initialize the sentiment analysis pipeline
sentiment_analyzer = pipeline('sentiment-analysis')

# Define a function to get the sentiment label
def get_sentiment_label(text, analyzer=None, min_confidence=0.5):
  """Classify one piece of text as 'Positive', 'Negative' or 'Neutral'.

  Args:
    text: The text to classify. Anything that is not a non-blank string
      (e.g. NaN from a missing CSV cell) is labelled 'Neutral' without
      calling the model.
    analyzer: Callable with the pipeline interface, i.e. it returns
      ``[{'label': ..., 'score': ...}]``. Defaults to the notebook-level
      ``sentiment_analyzer``.
    min_confidence: Minimum score the winning label needs to be accepted;
      below it the text is labelled 'Neutral'.

  Returns:
    One of 'Positive', 'Negative', 'Neutral'.

  Note:
    If the analyzer is a two-class softmax classifier (presumably the case
    for an SST-2 checkpoint; confirm for other models), the winning label's
    score is always >= 0.5, so the default threshold never yields 'Neutral'
    for real text. Raise ``min_confidence`` to get a genuine neutral band.
  """
  # Missing or blank bodies carry no sentiment and would make the tokenizer fail.
  if not isinstance(text, str) or not text.strip():
    return 'Neutral'

  if analyzer is None:
    analyzer = sentiment_analyzer

  # truncation=True clips inputs longer than the model's maximum sequence
  # length instead of letting the forward pass raise on long messages.
  result = analyzer(text, truncation=True)
  label = result[0]['label']
  score = result[0]['score']
  if label == 'POSITIVE' and score >= min_confidence:
    return 'Positive'
  elif label == 'NEGATIVE' and score >= min_confidence:
    return 'Negative'
  else:
    return 'Neutral'

# Label every message: the classifier is run over the 'body' column one row at
# a time and the result is stored in a new 'sentiment' column.
data['sentiment'] = data['body'].map(get_sentiment_label)

# Preview the frame, now including the 'sentiment' column
print(data.head())

In [None]:
data.head()

Task 2

In [None]:
data.shape

In [None]:
data.info()

In [None]:
data.isnull().sum()

In [None]:
# Count of messages under each sentiment label
ax = sns.histplot(data['sentiment'])
ax.set_title('Distribution of Sentiment')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Parse the 'date' column into pandas datetimes (kept as parsed; later cells
# bucket it by month).
data['date'] = pd.to_datetime(data['date'])

# Build a timezone-aware UTC copy of the timestamps.
# utc=True interprets naive timestamps as UTC and converts tz-aware ones to UTC,
# whereas .dt.tz_localize('UTC') raises TypeError on values that already carry a
# timezone.
data['date_UTC'] = pd.to_datetime(data['date'], utc=True)

# Print the DataFrame head to show the new UTC column
print(data.head())

In [None]:
data.head()

In [None]:
# Daily message counts per sentiment label:
# rows are calendar days of 'date_UTC', columns are sentiment labels, and
# day/label combinations with no messages are filled with 0.
daily_bucket = pd.Grouper(key='date_UTC', freq='D')
counts_by_day_and_label = data.groupby([daily_bucket, 'sentiment']).size()
sentiment_over_time = counts_by_day_and_label.unstack(fill_value=0)

# One line per sentiment label
ax = sentiment_over_time.plot(kind='line', figsize=(12, 6))
ax.set_title('Sentiment Trends Over Time')
ax.set_xlabel('Date (UTC)')
ax.set_ylabel('Number of Posts')
ax.grid(True)
plt.show()

In [None]:
# Message counts per sender ('from'), split by sentiment label, to show whether
# some senders lean towards a particular sentiment.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=data, x='from', hue='sentiment', ax=ax)
ax.set_title('Sentiment Distribution by Sender')
ax.set_xlabel('Sender')
ax.set_ylabel('Count')
# Slant the sender names so long labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

# Message length vs. sentiment.
# Each body is stringified before measuring, so non-string entries are measured
# by the length of their str() form.
data['body_length'] = [len(str(body)) for body in data['body']]

fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=data, x='sentiment', y='body_length', ax=ax)
ax.set_title('Message Length vs. Sentiment')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Message Length')
plt.show()

Task 3

In [None]:
# Define a function to assign scores based on sentiment
def assign_score(sentiment):
  """Map a sentiment label to its numeric score.

  'Positive' -> 1, 'Negative' -> -1, anything else -> 0.
  """
  for label, value in (('Positive', 1), ('Negative', -1)):
    if sentiment == label:
      return value
  return 0

# Translate each sentiment label into its numeric score
data['score'] = data['sentiment'].map(assign_score)

# Total score per sender ('from'), as a flat frame with columns 'from' and 'score'
employee_scores = data.groupby('from', as_index=False)['score'].sum()

# Show the per-employee totals
print("\nEmployee Sentiment Scores:")
employee_scores

In [None]:
# Monthly period (year + month) of each message, derived from 'date'
data['year_month'] = data['date'].dt.to_period('M')

# Score total for every (sender, month) pair, as a flat frame with columns
# 'from', 'year_month' and 'score'
monthly_scores = data.groupby(['from', 'year_month'], as_index=False)['score'].sum()

# Show the per-employee, per-month totals
print("\nMonthly Employee Sentiment Scores:")
monthly_scores

In [None]:
# Keep the last row of each sender's block in monthly_scores, i.e. that sender's
# final listed month, then renumber the rows from 0.
last_row_per_employee = monthly_scores.groupby('from').tail(1)
latest_monthly_scores = last_row_per_employee.reset_index(drop=True)

print("\nLatest Monthly Employee Sentiment Scores:")
print(latest_monthly_scores)

Task 4

In [None]:
data.head()

In [None]:
def _top_employees_by_month(scores, n=3, most_negative=False):
  """Return the n best- (or worst-) scoring employees for every month.

  Args:
    scores: Frame with columns 'from', 'year_month' and 'score'
      (one row per employee per month).
    n: Number of employees to keep per month.
    most_negative: If False, rank by highest score; if True, by lowest score.

  Returns:
    A new frame with the columns of ``scores`` plus 'month' (a copy of
    'year_month'), ordered by month, then by rank, with a fresh 0..N-1 index.
  """
  # Sort BEFORE cutting to n rows, with the employee name as secondary key.
  # Sorting by score alone and only alphabetising the rows that survived the
  # cut would leave ties at the cutoff to be decided by sort internals.
  ranked = scores.sort_values(
    by=['year_month', 'score', 'from'],
    ascending=[True, most_negative, True],
  )
  # head(n) per group keeps the first n rows of each month in the sorted order.
  top = ranked.groupby('year_month', sort=False).head(n).copy()
  top['month'] = top['year_month']
  return top.reset_index(drop=True)


# Top three employees per month by highest and by lowest monthly score
top_positive_employees = _top_employees_by_month(monthly_scores, n=3)
top_negative_employees = _top_employees_by_month(monthly_scores, n=3, most_negative=True)

# Display the results in tables
print("\nTop Three Positive Employees per Month:")
print(top_positive_employees[['month', 'from', 'score']])

print("\nTop Three Negative Employees per Month:")
print(top_negative_employees[['month', 'from', 'score']])


In [None]:
data.head()

Task 5

In [None]:
# Ensure 'date' is sorted
data = data.sort_values(by='date').reset_index(drop=True)


def _has_negative_burst(dates, window_days=30, min_mails=4):
  """Tell whether at least ``min_mails`` dates fall inside one rolling window.

  Args:
    dates: Series of timestamps (one per negative mail of a single sender).
    window_days: Length of the rolling window in days; both ends inclusive.
    min_mails: Number of mails that must fall inside one window.

  Returns:
    True if some mail has at least ``min_mails`` mails (itself included) in
    the ``window_days`` days up to and including its own timestamp.
  """
  ordered = dates.dropna().sort_values().reset_index(drop=True)
  if len(ordered) < min_mails:
    return False
  # In sorted order, a window ending at mail i holds >= min_mails mails exactly
  # when the mail (min_mails - 1) positions earlier is no older than the window.
  # This replaces re-scanning the sender's whole history for every mail.
  span = ordered - ordered.shift(min_mails - 1)
  return bool((span <= pd.Timedelta(days=window_days)).any())


# Filter for negative sentiments only
negative_mails = data[data['sentiment'] == 'Negative'].copy()

# Group by sender
grouped_negative_mails = negative_mails.groupby('from')

# A sender is a flight risk if 4 or more of their negative mails fall within
# any 30-day window.
flight_risk_employees = {
  sender
  for sender, mails in grouped_negative_mails
  if _has_negative_burst(mails['date'], window_days=30, min_mails=4)
}

# Sorted so the printed list is identical from run to run (set iteration order
# is not).
flight_risk_list = sorted(flight_risk_employees)

print("\nEmployees identified as Flight Risks:")
print(flight_risk_list)

In [None]:
data.head()

Task 6

In [None]:
%pip install statsmodels
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm

# Derived text features used as regressors.
# NOTE(review): 'Subject' is capitalised while the other columns used in this
# notebook ('body', 'date', 'from') are lowercase; confirm the CSV header.
if 'subject_length' not in data.columns:
    data['subject_length'] = [len(str(subject)) for subject in data['Subject']]
data['question_count'] = [str(body).count('?') for body in data['body']]

features = ['body_length', 'subject_length', 'question_count']
X, y = data[features], data['score']

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit ordinary least squares on the training rows and predict the held-out rows
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Held-out error and explained variance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"\nMean Squared Error (MSE): {mse:.4f}")
print(f"R-squared (R2): {r2:.4f}")

# Refit with statsmodels to get the detailed coefficient table; statsmodels
# does not add an intercept on its own, hence the explicit constant column.
X_train_sm = sm.add_constant(X_train)
model_sm = sm.OLS(y_train, X_train_sm).fit()

print("\nLinear Regression Model Summary:")
print(model_sm.summary())

print("\nInterpretation of Results:")
print("MSE is high and R2 is low meaning prediction accuracy is low.\n")