In [None]:
!pip install huggingface_hub



In [1]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; files under it are read later
# via paths rooted at this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import os
import warnings
from getpass import getpass

import numpy as np
import pandas as pd
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel

warnings.filterwarnings('ignore')


In [3]:
# Read the full augmented dataset from Drive and show its last rows.
df = pd.read_csv('/content/drive/MyDrive/MINI PROJECT/DATASET/augmented_dataset1.csv')
print(df.tail())

# Draw 300 rows per class with a fixed seed. The label order (1, 0, 2) fixes the
# row order of the concatenated frame, which the later seeded split depends on.
per_class_samples = [
    df[df['Label'] == label].sample(n=300, random_state=42)
    for label in (1, 0, 2)
]

# Stack the class samples and renumber rows 0..N-1 so positional and label-based
# indexing agree downstream.
df = pd.concat(per_class_samples).reset_index(drop=True)
print(df.head())


         PID                                          Text data  Label
18052  18052  ov trudging remembe dehydrated pregame adays d...      2
18053  18053  exacerbate floated prescribes wonky combo fibr...      2
18054  18054  voice heavy terrible awake help need night its...      2
18055  18055  wonder doesn smile face hate what right can be...      2
18056  18056  redacted overshare detail center colleges room...      2
    PID                                          Text data  Label
0  2868  my to have about being in not any way and im i...      1
1  5924  for to the know but that in few of broke up wa...      1
2  3764  new being same dont this shit will so me think...      1
3  4144  for my to have once the new year there anymore...      1
4  2780  for my to have the new year anymore about but ...      1


In [4]:
# SECURITY: an access token must never be written into the notebook. Any token
# that was previously stored in this cell should be treated as leaked and revoked
# in the Hugging Face account settings.
# The token is taken from the HF_TOKEN environment variable when set; otherwise
# it is requested interactively without echoing it to the cell output.
MODEL_NAME = "mental/mental-bert-base-uncased"
token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

# NOTE(review): recent transformers releases deprecate `use_auth_token` in favour
# of `token`; switch the keyword once the installed version is confirmed.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_auth_token=token)
model = AutoModel.from_pretrained(MODEL_NAME, use_auth_token=token)


tokenizer_config.json:   0%|          | 0.00/321 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/639 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def get_bert_features(text_batch, model, tokenizer, max_length=128):
    """Embed a batch of texts using the encoder's hidden state at position 0.

    Each text is tokenized with special tokens added, truncated/padded to
    ``max_length`` tokens, and passed through ``model`` without gradient
    tracking. The feature for a text is the first output of the model taken at
    sequence position 0 (the [CLS] slot for BERT-style tokenizers); the
    model's pooler output is not used.

    Args:
        text_batch: List of strings to embed.
        model: Encoder whose first output is a (batch, sequence, hidden) tensor.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors.
        max_length: Number of tokens every sequence is padded/truncated to.
            Defaults to 128, the value previously hard-coded.

    Returns:
        A numpy array with one row per input text and one column per hidden unit.
    """
    encoded_inputs = tokenizer(
        text_batch,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    # Run on whatever device the model lives on; without this the call fails
    # with a device mismatch as soon as the model has been moved to a GPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')
    input_ids = encoded_inputs['input_ids'].to(device)
    attention_mask = encoded_inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # outputs[0] is the per-token hidden state; keep position 0 of every sequence.
    # .cpu() is a no-op on CPU and required before .numpy() for GPU tensors.
    features = outputs[0][:, 0, :].cpu().numpy()
    return features


In [6]:
# Embed the texts in chunks of `batch_size` rows so only one chunk is run
# through the encoder at a time.
batch_size = 100
text_column = df['Text data']
features_list = [
    get_bert_features(text_column.iloc[start:start + batch_size].tolist(), model, tokenizer)
    for start in range(0, df.shape[0], batch_size)
]

# Stack the per-chunk arrays row-wise into a single feature matrix.
features = np.concatenate(features_list, axis=0)

# Target vector, in the same row order as `features`.
labels = df['Label'].values


In [7]:
# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability.
# NOTE(review): the split is not stratified, so per-class counts in the test set
# can differ even though the sampled dataset is balanced; consider `stratify=labels`.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)


In [8]:
# Fit a logistic-regression classifier on the extracted features; the iteration
# cap is raised to 1000 via max_iter.
lr_clf = LogisticRegression(max_iter=1000).fit(x_train, y_train)

# Score the held-out rows.
pred = lr_clf.predict(x_test)
accuracy = accuracy_score(y_test, pred)

# Per-class precision/recall/F1 first, then overall accuracy to four decimals.
print(classification_report(y_test, pred))
print(f'Accuracy: {accuracy:.4f}')


              precision    recall  f1-score   support

           0       0.77      0.72      0.74        67
           1       0.73      0.75      0.74        60
           2       0.77      0.81      0.79        53

    accuracy                           0.76       180
   macro avg       0.76      0.76      0.76       180
weighted avg       0.76      0.76      0.76       180

Accuracy: 0.7556
