In [1]:
import pandas as pd
from transformers import RobertaForSequenceClassification, RobertaTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
import torch
from tqdm import tqdm


  torch.utils._pytree._register_pytree_node(


In [2]:

# Load your dataset
#df1 = pd.read_csv("train.tsv",sep='\t', header = None, names = ['text', 'labelnum','id'])
#df2 = pd.read_csv("test.tsv",sep='\t', header = None, names = ['text', 'labelnum','id'])
df = pd.read_csv("GoEmo.csv", sep = ',', header = None, names = ['text', 'labelnum'])
#df = pd.concat([df1, df2])
#del df['id']
df.head()

Unnamed: 0,text,labelnum
0,"Wow, this is absolutely breathtaking! You're i...",0
1,I can't get over how amazing your skills are. ...,0
2,This is the kind of content that makes me love...,0
3,I'm in awe of your creativity. How do you come...,0
4,You've outdone yourself once again. I have so ...,0


In [3]:

#mask = df['labelnum'].str.contains(',', na = False)
#df = df[~mask]
#mask2 = df['labelnum'].str.contains('27', na = False)
#df = df[~mask2]
df['labelnum'].unique()


array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27], dtype=int64)

In [4]:
#You can take less than 50 records of each emotion by changing number in this function
#def subset_data(group):
#    return group.sample(min(50, len(group)))

In [5]:
# Apply the function to each group
#subset_df = df.groupby('labelnum', group_keys=False).apply(subset_data)
# Display the resulting subset dataframe
print(df)

                                                   text  labelnum
0     Wow, this is absolutely breathtaking! You're i...         0
1     I can't get over how amazing your skills are. ...         0
2     This is the kind of content that makes me love...         0
3     I'm in awe of your creativity. How do you come...         0
4     You've outdone yourself once again. I have so ...         0
...                                                 ...       ...
1395  Thumbs up if your emotional barometer is right...        27
1396  Watching this feels like sailing on the sea of...        27
1397  Thumbs up if you're leaving this video with a ...        27
1398  This content is like a neutral background nois...        27
1399  Thumbs up if your emotional state is cruising ...        27

[1400 rows x 2 columns]


In [6]:

# Set your hyperparameters
lr = 2e-5
adam_epsilon = 1e-7
epochs = 3
num_warmup_steps = 100


In [7]:
from torch.nn import BCEWithLogitsLoss

# Define your dataset class
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length, num_labels):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.num_labels = num_labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = torch.zeros(self.num_labels)  # Initialize with zeros for all labels
        label_indices = [int(idx) for idx in str(self.labels[idx]).split(',')]
        label[label_indices] = 1  # Set the corresponding label indices to 1

        encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': label
        }


In [8]:

# Load pre-trained model and tokenizer
model_name = "SamLowe/roberta-base-go_emotions"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=28)



In [9]:
# Prepare your data
train_texts = df['text'].tolist()
train_labels = df['labelnum'].tolist()
train_dataset = CustomDataset(train_texts, train_labels, tokenizer, max_length=128, num_labels=28)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)


In [10]:

# Set up optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=lr, eps=adam_epsilon, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=len(train_dataloader) * epochs)




In [11]:

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, hamming_loss, roc_auc_score, average_precision_score

loss_fn = BCEWithLogitsLoss()

#Re-run below code for different metrics, loss functions and evaluations as mentioned in csv
for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{epochs}'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        loss = loss_fn(logits, labels)
        #loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

    average_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{epochs} - Average Training Loss: {average_loss}')


# Save the fine-tuned model
model.save_pretrained("fine_tuned_roberta_go_emotions_new")
tokenizer.save_pretrained("fine_tuned_roberta_go_emotions_new")

Epoch 1/3: 100%|███████████████████████████████████████████████████████████████████████| 88/88 [07:25<00:00,  5.06s/it]


Epoch 1/3 - Average Training Loss: 0.045116644649004396


Epoch 2/3: 100%|███████████████████████████████████████████████████████████████████████| 88/88 [07:31<00:00,  5.13s/it]


Epoch 2/3 - Average Training Loss: 0.022130377156744627


Epoch 3/3: 100%|███████████████████████████████████████████████████████████████████████| 88/88 [07:32<00:00,  5.14s/it]


Epoch 3/3 - Average Training Loss: 0.016966043325903065


('fine_tuned_roberta_go_emotions_new\\tokenizer_config.json',
 'fine_tuned_roberta_go_emotions_new\\special_tokens_map.json',
 'fine_tuned_roberta_go_emotions_new\\vocab.json',
 'fine_tuned_roberta_go_emotions_new\\merges.txt',
 'fine_tuned_roberta_go_emotions_new\\added_tokens.json')

In [27]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer

model_path = "fine_tuned_roberta_go_emotions_new"
model = RobertaForSequenceClassification.from_pretrained(model_path)
tokenizer = RobertaTokenizer.from_pretrained(model_path)


In [28]:
test_text = "Watching this video was exhilirating. The ultimate adrenaline rush."
encoding = tokenizer(test_text, truncation=True, padding=True, max_length=128, return_tensors="pt")


In [29]:
model.eval()
with torch.no_grad():
    logits = model(**encoding).logits


In [30]:
threshold = 0.5
predicted_labels = (torch.sigmoid(logits) > threshold).int()


In [31]:
print("Predicted Labels:", predicted_labels)


Predicted Labels: tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]], dtype=torch.int32)


In [32]:
# Given ordered list of emotions
emotion_labels = [
    "admiration", "amusement", "anger", "annoyance", "approval", "caring", "confusion", 
    "curiosity", "desire", "disappointment", "disapproval", "disgust", "embarrassment", 
    "excitement", "fear", "gratitude", "grief", "joy", "love", "nervousness", "optimism", 
    "pride", "realization", "relief", "remorse", "sadness", "surprise", "neutral"
]


In [33]:
# Convert the predicted labels tensor to a list of emotions
predicted_labels_list = [emotion_labels[i] for i, value in enumerate(predicted_labels[0]) if value == 1]

# Print the encoded emotion labels
print("Encoded Emotion Label:", predicted_labels_list)


Encoded Emotion Label: ['excitement']


In [34]:
# Convert the predicted labels tensor to a string of emotions
predicted_labels_str = ", ".join(emotion_labels[i] for i, value in enumerate(predicted_labels[0]) if value == 1)

# Print the encoded emotion labels
print("Encoded Emotion Labels:", predicted_labels_str)


Encoded Emotion Labels: excitement


In [35]:
import pandas as pd
from transformers import RobertaForSequenceClassification, RobertaTokenizer
import torch, os
import googleapiclient.discovery


In [36]:
# Load the fine-tuned model and tokenizer
model_path = "fine_tuned_roberta_go_emotions_new"
model = RobertaForSequenceClassification.from_pretrained(model_path)
tokenizer = RobertaTokenizer.from_pretrained(model_path)

# Define the emotion labels
emotion_labels = [
    "admiration", "amusement", "anger", "annoyance", "approval", "caring", "confusion", 
    "curiosity", "desire", "disappointment", "disapproval", "disgust", "embarrassment", 
    "excitement", "fear", "gratitude", "grief", "joy", "love", "nervousness", "optimism", 
    "pride", "realization", "relief", "remorse", "sadness", "surprise", "neutral"
]


In [37]:


# Define a function for emotion prediction
def predict_emotion(comment, threshold=0.5):
    encoding = tokenizer(comment, truncation=True, padding=True, max_length=128, return_tensors="pt")
    
    model.eval()
    with torch.no_grad():
        logits = model(**encoding).logits
    
    # Convert logits to binary predictions based on the threshold
    predicted_labels = (torch.sigmoid(logits) > threshold).int()
    
    # Convert the predicted labels tensor to a string of emotions
    predicted_labels_str = ", ".join(emotion_labels[i] for i, value in enumerate(predicted_labels[0]) if value == 1)
    
    return predicted_labels_str


In [38]:

def get_youtube_comments(video_url, api_key):
    # Extract video ID from the URL
    video_id = video_url.split("v=")[1]

    # Create a YouTube API client
    youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=api_key)

    # Get video details
    video_response = youtube.videos().list(
        part="snippet",
        id=video_id
    ).execute()

    video_title = video_response["items"][0]["snippet"]["title"]

    # Get comments
    comments = []
    nextPageToken = None

    while True:
        comment_response = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=500,  # Adjust as needed
            pageToken=nextPageToken
        ).execute()

        for item in comment_response["items"]:
            comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
            comments.append(comment)

        nextPageToken = comment_response.get("nextPageToken")

        if not nextPageToken:
            break

    return video_title, comments


In [39]:

if __name__ == "__main__":
    # Replace 'YOUR_API_KEY' with your actual YouTube API key
    api_key = 'AIzaSyAlV9EbT68jAJuo_Fk5XklaeKzN1Uxj0B0'

    # Replace 'YOUR_VIDEO_URL' with the YouTube video URL
    video_url = 'https://www.youtube.com/watch?v=OyddY7DlV58'

    video_title, comments = get_youtube_comments(video_url, api_key)

    print(f"Video Title: {video_title}")
    print(f"Total Comments: {len(comments)}")
    print("\nComments:")
    for i, comment in enumerate(comments, start=1):
        print(f"{i}. {comment}")


Video Title: Probability Part 1: Rules and Patterns: Crash Course Statistics #13
Total Comments: 143

Comments:
1. Wow
2. I stg I&#39;ve learned more from CrashCourse than I have from my community college professors. I&#39;ve gotta become a patreon, you guys have saved my butt so many times.
3. <a href="https://www.youtube.com/watch?v=OyddY7DlV58&amp;t=4">0:04</a> ADRIENE!!!! My favorite teacher!!!!
4. Omg ❤️
5. <a href="https://www.youtube.com/watch?v=OyddY7DlV58&amp;t=342">5:42</a> &quot;you like like cole sprouse&quot; what does that mean?
6. demnnn probablity was made this quick this was amazing
7. I didn&#39;t start seeing faces until this person brought it up. Now I can&#39;t stop.
8. items on the table keeps getting changed as per the topic 😁
9. 🤯 whats the probability I will drop my 6 week summer stats class? Id rather take Anatomy again😩
10. Too fast with explanations...rather too many words for few simple explanation.
11. Cole shouldn&#39;t  go to the IHOP I worked at...would

In [40]:

# Create an empty DataFrame
dfin = pd.DataFrame(columns=['Label', 'Comment'])

# Process comments and store results in the DataFrame
for i, comment in enumerate(comments, start=1):
    predicted_emotions = predict_emotion(comment)
    
    # Append results to the DataFrame
    dfin = pd.concat([dfin, pd.DataFrame({ 'Label': [predicted_emotions], 'Comment': [comment]})])

# Sort the DataFrame by 'Label'
df_sorted = dfin.sort_values(by='Label')


In [41]:

# Print or save the sorted DataFrame
print(df_sorted)
#df_sorted.to_csv("predicted_comments_new.csv", index=False)  # Save to a CSV file


       Label                                            Comment
0                                                          Cool
0                                                   Good stuff.
0              stats exam is tomorrow im currently binging this
0             I don&#39;t like all these pop culture referen...
0             Adriene looks more confident since the economi...
..       ...                                                ...
0    sadness  Here is when CrashCourse turns into CryingCourse.
0   surprise  Wow I&#39;ve never thought that I would probab...
0   surprise                              Woah double upload XD
0   surprise  After 42 seconds I figured this video will inc...
0   surprise                                                Wow

[143 rows x 2 columns]
