In [33]:
import random

import pandas as pd
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import Dataset, DataLoader
from transformers import pipeline, AutoTokenizer

In [34]:
# Location of the training CSV that load_dataset() reads.
# NOTE(review): absolute path (presumably a Colab runtime directory) — adjust when running elsewhere.
file_path = '/content/training.csv'

In [35]:
def load_dataset(file_path):
    """Read a CSV file into a DataFrame and print a short preview.

    Args:
        file_path: Path (or any source accepted by ``pd.read_csv``) of the CSV.

    Returns:
        pandas.DataFrame: The parsed file, unmodified.
    """
    frame = pd.read_csv(file_path)
    preview = frame.head()

    # Plain-text preview so the reader can sanity-check the columns.
    print("Dataset loaded. First few rows:")
    print(preview)

    return frame

In [59]:
def preprocess_dataset(dataset):
    """Extract the ``text`` and ``label`` columns as plain Python lists.

    No re-encoding is performed: labels are returned exactly as stored in
    the ``label`` column.

    Args:
        dataset: DataFrame with at least ``text`` and ``label`` columns.

    Returns:
        tuple[list, list]: ``(texts, labels)`` in row order.
    """
    texts, labels = (dataset[column].tolist() for column in ('text', 'label'))

    # Show the first two entries of each list as a quick structure check.
    print(f"Sample texts: {texts[:2]}")
    print(f"Sample labels: {labels[:2]}")

    return texts, labels

In [60]:
# Fine-grained emotion label -> coarse polarity ('positive' / 'neutral' / 'negative').
emotion_mapping = {
    **dict.fromkeys(('joy', 'surprise', 'trust', 'love'), 'positive'),
    'neutral': 'neutral',
    **dict.fromkeys(('anger', 'fear', 'sadness', 'disgust'), 'negative'),
}


def map_emotion_to_polarity(emotion):
    """Translate an emotion label into its polarity string.

    Args:
        emotion: Emotion label to look up in ``emotion_mapping``.

    Returns:
        str: The mapped polarity, or ``'neutral'`` when the label is not
        present in ``emotion_mapping``.
    """
    if emotion in emotion_mapping:
        return emotion_mapping[emotion]
    return 'neutral'

In [61]:
def load_emotion_model(model_name="j-hartmann/emotion-english-distilroberta-base"):
    """Build the text-classification pipeline used for emotion detection.

    The pipeline loads the tokenizer that belongs to ``model_name`` on its
    own, so no separate tokenizer is instantiated here.

    Args:
        model_name: Hugging Face model identifier. Defaults to the
            emotion model this notebook was written for.

    Returns:
        A ``transformers`` text-classification pipeline configured to return
        a score for every label of each input text.
    """
    # top_k=None is the current spelling of the deprecated return_all_scores=True:
    # every label's score is returned instead of only the best one.
    return pipeline('text-classification', model=model_name, top_k=None)

In [62]:
def analyze_emotions_polarity(emotion_model, tweets):
    """Classify each tweet and reduce the result to a polarity string.

    Args:
        emotion_model: Callable that, given ``tweets``, yields one collection
            of ``{'label': ..., 'score': ...}`` dicts per tweet.
        tweets: Texts to classify.

    Returns:
        list: One polarity per tweet, obtained by passing the highest-scoring
        emotion label through ``map_emotion_to_polarity``.
    """
    def top_label(scored_emotions):
        # The winning emotion is the entry with the largest score.
        best = max(scored_emotions, key=lambda entry: entry['score'])
        return best['label']

    per_tweet_scores = emotion_model(tweets)
    return [map_emotion_to_polarity(top_label(scores)) for scores in per_tweet_scores]

In [63]:
def evaluate_model(true_labels, predicted_labels):
    """Print and return accuracy and weighted precision/recall/F1.

    Pairs whose true label is ``-1`` are excluded before scoring.

    Args:
        true_labels: Ground-truth labels.
        predicted_labels: Predicted labels, aligned with ``true_labels``.

    Returns:
        tuple: ``(accuracy, precision, recall, f1)``.

    Raises:
        ValueError: If the two inputs differ in length, or if no pair is left
            after removing the ``-1`` labels.
    """
    true_labels = list(true_labels)
    predicted_labels = list(predicted_labels)

    # zip() would silently drop the tail of the longer input and score misaligned data.
    if len(true_labels) != len(predicted_labels):
        raise ValueError(
            f"Length mismatch: {len(true_labels)} true labels vs "
            f"{len(predicted_labels)} predicted labels."
        )

    # Filter out -1 labels for evaluation
    kept_pairs = [(true, pred) for true, pred in zip(true_labels, predicted_labels) if true != -1]

    # Unpacking zip(*[]) fails with an opaque "not enough values to unpack" error.
    if not kept_pairs:
        raise ValueError("No samples left to evaluate after filtering out -1 labels.")

    true_labels_filtered, predicted_labels_filtered = zip(*kept_pairs)

    accuracy = accuracy_score(true_labels_filtered, predicted_labels_filtered)
    # 'weighted' averages the per-class scores by class support (multi-class friendly).
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels_filtered, predicted_labels_filtered, average='weighted'
    )

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    return accuracy, precision, recall, f1

In [64]:
# Load the training CSV into `df`; load_dataset() also prints the first rows.
# NOTE(review): the printed preview shows extra "Unnamed: N" columns that are
# mostly NaN, which suggests stray delimiters in the CSV. Only 'text' and
# 'label' are read downstream — confirm the file is being parsed as intended.
df = load_dataset(file_path)

Dataset loaded. First few rows:
                                                text  label  Unnamed: 2  \
0                            i didnt feel humiliated     10         NaN   
1  i can go from feeling so hopeless to so damned...     10         NaN   
2   im grabbing a minute to post i feel greedy wrong     20         NaN   
3  i am ever feeling nostalgic about the fireplac...     10         NaN   
4                               i am feeling grouchy     20         NaN   

   Unnamed: 3  Unnamed: 4  Unnamed: 5   Unnamed: 6 Unnamed: 7  
0         NaN         NaN         NaN          NaN        NaN  
1         NaN         NaN         NaN          NaN        NaN  
2         NaN         NaN         NaN           10         10  
3         NaN         NaN         NaN  10,10,10,10     20,20,  
4         NaN         NaN         NaN          NaN        NaN  


In [65]:
# Instantiate the emotion classification pipeline once and reuse it below.
emotion_model = load_emotion_model()



In [66]:
# Split the frame into parallel lists: `texts` (inputs) and `label` (ground truth).
texts, label = preprocess_dataset(df)

Sample texts: ['i didnt feel humiliated', 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake']
Sample labels: [10, 10]


In [67]:
def get_random_samples(texts, labels, sample_size=1000, seed=None):
    """Draw a random subset of (text, label) pairs without replacement.

    Args:
        texts: Input texts.
        labels: Labels aligned with ``texts``.
        sample_size: Number of pairs to draw. Values larger than the number
            of available pairs are clamped to that number.
        seed: Optional seed for a private random generator, making the draw
            reproducible. When ``None`` the module-level ``random`` state is
            used, as before.

    Returns:
        tuple[list, list]: ``(sampled_texts, sampled_labels)`` with pairing
        preserved. Both lists are empty when there is nothing to sample.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length, or if
            ``sample_size`` is negative.
    """
    texts = list(texts)
    labels = list(labels)

    # zip() would otherwise drop the unmatched tail without any warning.
    if len(texts) != len(labels):
        raise ValueError(
            f"Length mismatch: {len(texts)} texts vs {len(labels)} labels."
        )

    # Zip the texts and labels together for consistent pairing
    paired = list(zip(texts, labels))

    # Nothing to draw: avoid unpacking an empty zip(*...) further down.
    if not paired or sample_size == 0:
        return [], []

    rng = random if seed is None else random.Random(seed)

    # random.sample raises when asked for more items than exist, so clamp.
    chosen = rng.sample(paired, min(sample_size, len(paired)))

    # Unzip the sampled texts and labels back into separate lists
    sampled_texts, sampled_labels = zip(*chosen)

    return list(sampled_texts), list(sampled_labels)

In [68]:
import random

# Seed the module-level generator so the same rows are drawn on every
# Restart & Run All; without it the reported metrics change from run to run.
random.seed(42)
sampled_texts, sampled_labels = get_random_samples(texts, label, sample_size=1000)


In [69]:
# Run the emotion model over the sampled texts; `pred` holds one polarity
# string per text (as returned by map_emotion_to_polarity).
pred = analyze_emotions_polarity(emotion_model, sampled_texts)

In [70]:
# Convert polarity strings to the dataset's numeric codes: 'positive' -> 10,
# everything else (including 'neutral') -> 20.
# Values that are already 10/20 are passed through unchanged so that re-running
# this cell does not collapse every prediction to 20.
pred = [
    polarity if polarity in (10, 20) else (10 if polarity == 'positive' else 20)
    for polarity in pred
]

In [71]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def evaluate_predictions(pred, actual):
    """Print accuracy, weighted precision/recall/F1 and a per-class report.

    Args:
        pred: Predicted labels.
        actual: Ground-truth labels, expected to be as long as ``pred``.

    Returns:
        None. When the lengths differ, an error line is printed and no
        metric is computed.
    """
    # Guard clause: the metrics are only meaningful for aligned sequences.
    if len(actual) != len(pred):
        print("Error: Lengths of predicted and actual labels do not match.")
        return

    averaging = 'weighted'  # per-class scores weighted by support (class imbalance)

    accuracy = accuracy_score(actual, pred)
    precision = precision_score(actual, pred, average=averaging)
    recall = recall_score(actual, pred, average=averaging)
    f1 = f1_score(actual, pred, average=averaging)
    report = classification_report(actual, pred)

    # Emit the summary in a fixed order: scalar metrics first, then the report.
    summary_lines = (
        f"Accuracy: {accuracy:.4f}",
        f"Precision: {precision:.4f}",
        f"Recall: {recall:.4f}",
        f"F1 Score: {f1:.4f}",
        "\nClassification Report:",
        report,
    )
    for line in summary_lines:
        print(line)

In [74]:
# Score the mapped predictions against the ground-truth labels of the same sample.
evaluate_predictions(pred, sampled_labels)

Accuracy: 0.6570
Precision: 0.8403
Recall: 0.6570
F1 Score: 0.6795

Classification Report:
              precision    recall  f1-score   support

          10       0.98      0.56      0.71       759
          20       0.41      0.96      0.57       241

    accuracy                           0.66      1000
   macro avg       0.69      0.76      0.64      1000
weighted avg       0.84      0.66      0.68      1000

