In [8]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Read the training table from the working directory and preview its first rows.
DATA_PATH = 'data_training_roberta.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,index,roberta_neg_post_body,roberta_neu_post_body,roberta_pos_post_body,roberta_compound_post_body,roberta_neg_post_title,roberta_neu_post_title,roberta_pos_post_title,roberta_compound_post_title,subreddit,post_id,post_title,post_score,post_url,post_comms_num,post_body,post_timestamp
0,1,0.171448,0.768685,0.059867,-0.111581,0.013251,0.928738,0.058012,0.044761,television,1gmlvja,watch recommend week november 08 2024,51,https://www.reddit.com/r/television/comments/1...,406,comment sort new default feel free describe sh...,2024-11-08 16:00:12
1,2,0.101025,0.739353,0.159622,0.058597,0.268124,0.694113,0.037763,-0.230361,television,1gpuzer,kiernan shipka say know go mad man despite eff...,1411,https://people.com/kiernan-shipka-says-she-kne...,74,""" I know brain body decide remember like audit...",2024-11-12 20:45:23
2,3,0.031195,0.229534,0.73927,0.708075,0.00814,0.296943,0.694917,0.686777,television,1gpkns9,"jim gaffigan end time tim walz ' snl ' "" I fee...",943,https://www.hollywoodreporter.com/tv/tv-news/j...,97,""" it expectation maya going continue entire ti...",2024-11-12 13:25:57
3,4,0.122717,0.428424,0.448859,0.326142,0.01359,0.429056,0.557354,0.543763,television,1gq3tw7,hbo series oz unforgettable character tv,62,https://www.reddit.com/r/television/comments/1...,36,ryan oreilly vern schillinger simon adebisi to...,2024-11-13 03:35:16
4,5,0.065585,0.878166,0.056249,-0.009336,0.073045,0.879932,0.047023,-0.026022,television,1gq3oj3,st denis medical series premiere discussion,21,https://www.reddit.com/r/television/comments/1...,6,st denis medical premise staff oregon understa...,2024-11-13 03:27:24


In [9]:
import torch
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split

In [10]:
# Run configuration: checkpoint name is shared by the tokenizer and the encoder.
MODEL_NAME = "distilbert-base-uncased"
batch_size = 8  # Adjust this to fit within GPU memory limits

# Prefer CUDA when torch reports it as available; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the pretrained tokenizer and encoder, then move the encoder to `device`.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(device)

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)


In [13]:
def get_hidden_states_in_batches(df, batch_size, tokenizer, model, device, text_column="post_body"):
    """Encode a text column batch by batch and return one first-token vector per row.

    Each batch of texts is tokenized (padded and truncated), moved to ``device``,
    run through ``model`` with gradients disabled, and the hidden state at
    sequence position 0 of ``last_hidden_state`` is collected.

    Args:
        df: DataFrame holding the texts to encode.
        batch_size: Number of rows sent through the model per forward pass.
        tokenizer: Callable accepting a list of strings plus ``padding``,
            ``truncation`` and ``return_tensors`` and returning a mapping of tensors.
        model: Callable accepting the tokenized tensors as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.
        device: Torch device the tokenized tensors are moved to before the call.
        text_column: Name of the column to encode. Defaults to ``"post_body"``.

    Returns:
        A 2-D NumPy array with exactly ``len(df)`` rows, in the same order as
        ``df``, so it stays aligned with labels taken from the same frame.
    """
    # Keep one entry per row. Non-string values (e.g. NaN for a missing body) are
    # encoded as the empty string instead of being dropped: dropping them would
    # silently shift every following feature row relative to its label.
    texts = [text if isinstance(text, str) else "" for text in df[text_column].values]

    if not texts:
        # Nothing to encode: return an empty matrix rather than letting
        # np.concatenate fail on an empty list.
        # NOTE(review): assumes the model config exposes `hidden_size`; falls back to 0 columns.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    cls_outputs = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]

        # Tokenize the batch and move every tensor to the target device.
        tokenized_batch = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt")
        tokenized_batch = {k: v.to(device) for k, v in tokenized_batch.items()}

        # Inference only, so no autograd graph is needed.
        with torch.no_grad():
            outputs = model(**tokenized_batch)

        # Position 0 of the sequence axis is the [CLS] hidden state.
        cls_outputs.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())

    return np.concatenate(cls_outputs, axis=0)

In [14]:
# Features: one encoder vector per row, computed the same way for both splits.
x_train, x_val = (
    get_hidden_states_in_batches(frame, batch_size, tokenizer, model, device)
    for frame in (df_train, df_val)
)

# Labels: the compound body score column, taken from the same split frames.
LABEL_COLUMN = "roberta_compound_post_body"
y_train = df_train[LABEL_COLUMN].values
y_val = df_val[LABEL_COLUMN].values

In [15]:
# Sanity check: features and labels of each split should have the same number of rows.
train_shapes = f"x_train shape: {x_train.shape}, y_train shape: {y_train.shape}"
val_shapes = f"x_val shape: {x_val.shape}, y_val shape: {y_val.shape}"
print(train_shapes, val_shapes, sep="\n")

x_train shape: (11504, 768), y_train shape: (11504,)
x_val shape: (2877, 768), y_val shape: (2877,)


In [16]:
# Fit an ordinary least-squares regressor from the encoder features to the compound score.
reg_model = LinearRegression().fit(x_train, y_train)

# Evaluate on the held-out split: MSE from explicit predictions, R^2 from the estimator.
y_pred = reg_model.predict(x_val)
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
r2 = reg_model.score(x_val, y_val)

print(f"Mean Squared Error: {mse}")
print(f"R^2 score: {r2}")

Mean Squared Error: 0.0767115509444193
R^2 score: 0.6717766448003887


In [17]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, mean_squared_error

def sentiment_to_class(score, neg_threshold=-0.1, pos_threshold=0.1):
    """Bucket a continuous sentiment score into a three-way class label.

    Args:
        score: Numeric sentiment score to classify.
        neg_threshold: Scores less than or equal to this value are negative.
            Defaults to -0.1.
        pos_threshold: Scores greater than or equal to this value are positive.
            Defaults to 0.1.

    Returns:
        -1 for negative, 1 for positive, 0 for anything in between. A NaN score
        compares false against both thresholds and therefore returns 0.
    """
    if score <= neg_threshold:
        return -1
    if score >= pos_threshold:
        return 1
    return 0


In [18]:
# Map the continuous scores (true train/val labels and validation predictions)
# to class labels with the same thresholding function.
y_train_classes, y_val_classes, y_pred_classes = (
    np.array([sentiment_to_class(value) for value in scores])
    for scores in (y_train, y_val, y_pred)
)

In [19]:
# Compare thresholded predictions against thresholded validation labels.
accuracy = accuracy_score(y_val_classes, y_pred_classes)

# Precision and F1 are averaged across classes, weighted by class support.
precision, f1 = (
    metric(y_val_classes, y_pred_classes, average='weighted')
    for metric in (precision_score, f1_score)
)

print(f"Accuracy: {accuracy}", f"Precision: {precision}", f"F1 Score: {f1}", sep="\n")

Accuracy: 0.7021202641640598
Precision: 0.7058560338840711
F1 Score: 0.7031876493096432
