In [76]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [77]:
stress1 = pd.read_csv(
    "MindSync_Data/stress_dataset_twitter1.csv", 
    sep=';',            # semicolon separator
    usecols=[0,1],      # only first 2 columns (Text, Label)
    encoding='utf-8'
)

In [78]:
stress2 = pd.read_csv(
    "MindSync_Data/stress_dataset_twitter2.csv", 
    sep=';', 
    usecols=[0,1], 
    encoding='utf-8'
)

In [79]:
goemotions_cols = pd.read_csv("MindSync_Data/go_emotions_dataset.csv", nrows=5)
print(goemotions_cols.columns)

Index(['id', 'text', 'example_very_unclear', 'admiration', 'amusement',
       'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity',
       'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment',
       'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love',
       'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse',
       'sadness', 'surprise', 'neutral'],
      dtype='object')


In [80]:
goemotions = pd.read_csv("MindSync_Data/go_emotions_dataset.csv", usecols=['text'] + [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
    'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
    'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
    'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
    'relief', 'remorse', 'sadness', 'surprise', 'neutral'
], encoding='utf-8')

In [81]:
stress_df = pd.concat([stress1, stress2], ignore_index=True)

In [82]:
print(stress_df.head())
print(stress_df.columns)

                                                text  label hashtags
0   speak-no-evil monkey Can I Be Honest With You...    1.0      NaN
1  Frau Goebbels early signs of psychosis psychot...    1.0      NaN
2  A lot of work and unfulfilled tasks plunge you...    1.0      NaN
3  Private health insurance delivers value for yo...    1.0      NaN
4  XpertOnline offers you the convenience of view...    1.0      NaN
Index(['text', 'label', 'hashtags'], dtype='object')


In [83]:
# Encode 'label' column (convert text labels to numbers)
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
stress_df['label'] = le.fit_transform(stress_df['label'])

print("Encoded labels:", le.classes_)
print(stress_df.head())


Encoded labels: [ 0.  1. nan]
                                                text  label hashtags
0   speak-no-evil monkey Can I Be Honest With You...      1      NaN
1  Frau Goebbels early signs of psychosis psychot...      1      NaN
2  A lot of work and unfulfilled tasks plunge you...      1      NaN
3  Private health insurance delivers value for yo...      1      NaN
4  XpertOnline offers you the convenience of view...      1      NaN


In [84]:
print(stress_df.columns)


Index(['text', 'label', 'hashtags'], dtype='object')


In [85]:
# Agar column name 'text' hai
stress_df['text'] = stress_df['text'].str.lower()
stress_df['text'] = stress_df['text'].str.replace(r'http\S+|www\S+','', regex=True)
stress_df['text'] = stress_df['text'].str.replace(r'[^\w\s]', '', regex=True)
stress_df['text'] = stress_df['text'].str.strip()


In [86]:
# Lowercase
stress_df['text'] = stress_df['text'].str.lower()

# Remove URLs
stress_df['text'] = stress_df['text'].str.replace(r'http\S+|www\S+','', regex=True)

# Remove punctuation
stress_df['text'] = stress_df['text'].str.replace(r'[^\w\s]', '', regex=True)

# Strip extra spaces
stress_df['text'] = stress_df['text'].str.strip()

# Check
print(stress_df.head())


                                                text  label hashtags
0  speaknoevil monkey can i be honest with you gl...      1      NaN
1  frau goebbels early signs of psychosis psychot...      1      NaN
2  a lot of work and unfulfilled tasks plunge you...      1      NaN
3  private health insurance delivers value for yo...      1      NaN
4  xpertonline offers you the convenience of view...      1      NaN


In [87]:
# Lowercase
goemotions['text'] = goemotions['text'].str.lower()

# Remove URLs
goemotions['text'] = goemotions['text'].str.replace(r'http\S+|www\S+','', regex=True)

# Remove punctuation
goemotions['text'] = goemotions['text'].str.replace(r'[^\w\s]', '', regex=True)

# Strip extra spaces
goemotions['text'] = goemotions['text'].str.strip()

# Check
print(goemotions.head())


                                                text  admiration  amusement  \
0                                     that game hurt           0          0   
1  sexuality shouldnt be a grouping category it m...           0          0   
2         you do right if you dont care then fuck em           0          0   
3                                  man i love reddit           0          0   
4    name was nowhere near them he was by the falcon           0          0   

   anger  annoyance  approval  caring  confusion  curiosity  desire  ...  \
0      0          0         0       0          0          0       0  ...   
1      0          0         0       0          0          0       0  ...   
2      0          0         0       0          0          0       0  ...   
3      0          0         0       0          0          0       0  ...   
4      0          0         0       0          0          0       0  ...   

   love  nervousness  optimism  pride  realization  relief  remorse 

In [88]:
from sklearn.model_selection import train_test_split

# Stress dataset (binary classification)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    stress_df['text'], stress_df['label'], test_size=0.2, random_state=42
)

# GoEmotions dataset (multi-label classification)
X_train_g, X_test_g, y_train_g, y_test_g = train_test_split(
    goemotions['text'], goemotions.drop(columns=['text']), test_size=0.2, random_state=42
)


In [89]:
!pip install transformers torch -q
from transformers import AutoTokenizer

# BERT tokenizer load karo
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [90]:
sample = X_train_s.iloc[0]
encoded = tokenizer(
    sample,
    padding='max_length',
    truncation=True,
    max_length=128,
    return_tensors='pt'
)

print(sample)
print(encoded)


9 surprising symptoms that indicate youre more stressed out than you think health stress
{'input_ids': tensor([[  101,  1023, 11341,  8030,  2008,  5769,  2115,  2063,  2062, 13233,
          2041,  2084,  2017,  2228,  2740,  6911,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0

In [91]:
import torch
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, texts, labels=None, tokenizer=None, max_len=128):
        self.texts = texts.tolist()
        self.labels = labels.values if labels is not None else None
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        if self.labels is not None:
            label = torch.tensor(self.labels[idx], dtype=torch.float if len(self.labels.shape) > 1 else torch.long)
            item['labels'] = label
        return item


In [92]:
# Stress dataset
train_dataset_s = TextDataset(X_train_s, y_train_s, tokenizer)
test_dataset_s = TextDataset(X_test_s, y_test_s, tokenizer)

train_loader_s = DataLoader(train_dataset_s, batch_size=16, shuffle=True)
test_loader_s = DataLoader(test_dataset_s, batch_size=16)

# GoEmotions dataset
train_dataset_g = TextDataset(X_train_g, y_train_g, tokenizer)
test_dataset_g = TextDataset(X_test_g, y_test_g, tokenizer)

train_loader_g = DataLoader(train_dataset_g, batch_size=16, shuffle=True)
test_loader_g = DataLoader(test_dataset_g, batch_size=16)


In [93]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Check pandas version
print("Pandas version:", pd.__version__)


Pandas version: 1.5.3


In [94]:
# Load stress dataset 1
stress1 = pd.read_csv("MindSync_Data/stress_dataset_twitter1.csv", sep=';', usecols=[0,1], encoding='utf-8')

# Load stress dataset 2
stress2 = pd.read_csv("MindSync_Data/stress_dataset_twitter2.csv", sep=';', usecols=[0,1], encoding='utf-8')

# Combine both stress datasets
stress_df = pd.concat([stress1, stress2], ignore_index=True)

# Check first few rows
print(stress_df.head())
print("Stress dataset shape:", stress_df.shape)


                                                text  label hashtags
0   speak-no-evil monkey Can I Be Honest With You...    1.0      NaN
1  Frau Goebbels early signs of psychosis psychot...    1.0      NaN
2  A lot of work and unfulfilled tasks plunge you...    1.0      NaN
3  Private health insurance delivers value for yo...    1.0      NaN
4  XpertOnline offers you the convenience of view...    1.0      NaN
Stress dataset shape: (10951, 3)


In [95]:
goemotions = pd.read_csv(
    "MindSync_Data/go_emotions_dataset.csv",
    usecols=['text', 'admiration', 'amusement', 'anger', 'annoyance', 'approval',
             'caring', 'confusion', 'curiosity', 'desire', 'disappointment',
             'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
             'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism',
             'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'],
    encoding='utf-8'
)

print(goemotions.head())
print("GoEmotions shape:", goemotions.shape)


                                                text  admiration  amusement  \
0                                    That game hurt.           0          0   
1   >sexuality shouldn’t be a grouping category I...           0          0   
2     You do right, if you don't care then fuck 'em!           0          0   
3                                 Man I love reddit.           0          0   
4  [NAME] was nowhere near them, he was by the Fa...           0          0   

   anger  annoyance  approval  caring  confusion  curiosity  desire  ...  \
0      0          0         0       0          0          0       0  ...   
1      0          0         0       0          0          0       0  ...   
2      0          0         0       0          0          0       0  ...   
3      0          0         0       0          0          0       0  ...   
4      0          0         0       0          0          0       0  ...   

   love  nervousness  optimism  pride  realization  relief  remorse 

In [96]:
import re

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+', '', text)  # remove URLs
    text = re.sub(r'[^\w\s]', '', text)        # remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()   # remove extra spaces
    return text

# Apply cleaning
stress_df.columns = ['text', 'label', 'hashtags']
stress_df['text'] = stress_df['text'].apply(clean_text)
goemotions['text'] = goemotions['text'].apply(clean_text)


print("✅ Text cleaning done!")


✅ Text cleaning done!


In [97]:
print(stress_df.columns)
print(stress_df.head())


Index(['text', 'label', 'hashtags'], dtype='object')
                                                text  label hashtags
0  speaknoevil monkey can i be honest with you gl...    1.0      NaN
1  frau goebbels early signs of psychosis psychot...    1.0      NaN
2  a lot of work and unfulfilled tasks plunge you...    1.0      NaN
3  private health insurance delivers value for yo...    1.0      NaN
4  xpertonline offers you the convenience of view...    1.0      NaN


In [98]:
stress_df.to_csv("MindSync_Data/stress_clean.csv", index=False)
goemotions.to_csv("MindSync_Data/goemotions_clean.csv", index=False)
print("Cleaned files saved successfully ✅")


Cleaned files saved successfully ✅


In [99]:
print(stress_df.columns)
print(goemotions.columns)


Index(['text', 'label', 'hashtags'], dtype='object')
Index(['text', 'admiration', 'amusement', 'anger', 'annoyance', 'approval',
       'caring', 'confusion', 'curiosity', 'desire', 'disappointment',
       'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
       'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride',
       'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'],
      dtype='object')


In [100]:
# Convert GoEmotions one-hot to single label
goemotions['label'] = goemotions.drop('text', axis=1).idxmax(axis=1)
goemotions = goemotions[['text', 'label']]  # keep only necessary columns


In [101]:
combined_df = pd.concat([stress_df[['text', 'label']], goemotions], ignore_index=True)


In [102]:
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Show result
print("Combined dataset shape:", combined_df.shape)
combined_df.head()


Combined dataset shape: (222176, 2)


Unnamed: 0,text,label
0,bad news dude youre almost 100 years past that,disappointment
1,no it makes it better,disapproval
2,except that teenagers do give half a damn abou...,neutral
3,whats not true exactly that population differe...,neutral
4,just report his comments let the mods take car...,approval


In [103]:
combined_df['label'].value_counts()


neutral           55298
admiration        20542
approval          15530
annoyance         11929
disapproval        8917
amusement          8862
gratitude          8437
anger              7956
curiosity          7707
disappointment     6769
confusion          6600
love               5310
caring             5147
realization        5125
joy                5120
optimism           4994
excitement         4375
sadness            3863
surprise           3472
disgust            3420
desire             3002
fear               2514
embarrassment      1720
remorse            1648
1.0                1268
nervousness         946
relief              814
0.0                 783
pride               714
grief               494
Name: label, dtype: int64

In [104]:
# List of valid emotion labels from your dataset
valid_labels = [
    'neutral', 'admiration', 'approval', 'annoyance', 'disapproval',
    'amusement', 'gratitude', 'anger', 'curiosity', 'disappointment',
    'confusion', 'love', 'caring', 'realization', 'joy', 'optimism',
    'excitement', 'sadness', 'surprise', 'disgust', 'desire',
    'fear', 'embarrassment', 'remorse', 'relief', 'pride', 'grief'
]

# Keep only these labels
combined_df = combined_df[combined_df['label'].isin(valid_labels)]


In [105]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    combined_df,
    test_size=0.2,
    random_state=42,
    stratify=combined_df['label']  # safe now
)

print("Training set size:", len(train_df))
print("Test set size:", len(test_df))


Training set size: 168223
Test set size: 42056


In [106]:
pip install transformers datasets torch


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.


In [107]:
from transformers import DistilBertTokenizerFast
import torch
from torch.utils.data import Dataset, DataLoader


In [108]:
# Load pretrained tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Example tokenization
example = tokenizer("I am feeling happy today!", padding='max_length', truncation=True, max_length=64)
print(example)


{'input_ids': [101, 1045, 2572, 3110, 3407, 2651, 999, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [109]:
class EmotionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=64):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [110]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train_labels = le.fit_transform(train_df['label'])
test_labels = le.transform(test_df['label'])


In [111]:
train_dataset = EmotionDataset(
    texts=train_df['text'].tolist(),
    labels=train_labels,
    tokenizer=tokenizer,
    max_len=64
)

test_dataset = EmotionDataset(
    texts=test_df['text'].tolist(),
    labels=test_labels,
    tokenizer=tokenizer,
    max_len=64
)


In [112]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)


In [113]:
from transformers import DistilBertForSequenceClassification, AdamW
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F
import torch


In [114]:
# Number of emotion classes
num_labels = len(le.classes_)

model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.w

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [115]:
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = CrossEntropyLoss()
epochs = 3


In [None]:
import os

# --- Resume logic ---
start_epoch = 0
if os.path.exists("model_epoch_2.pt"):  # agar checkpoint file exist karti hai
    model.load_state_dict(torch.load("model_epoch_2.pt"))  # model weights load
    optimizer.load_state_dict(torch.load("optimizer_epoch_2.pt"))  # optimizer state load
    start_epoch = 2  # next epoch se resume

for epoch in range(start_epoch, epochs):
    model.train()
    total_loss = 0
    
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_loss:.4f}")

    # --- Save checkpoint at end of each epoch ---
    torch.save(model.state_dict(), f"model_epoch_{epoch+1}.pt")
    torch.save(optimizer.state_dict(), f"optimizer_epoch_{epoch+1}.pt")
    print(f"Checkpoint saved for epoch {epoch+1}")


In [None]:
from sklearn.metrics import classification_report

model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Print metrics
print(classification_report(true_labels, predictions, target_names=le.classes_))


NameError: name 'model' is not defined