In [1]:
import pandas as pd

In [2]:
# Load the raw oil-news headlines and preview the first rows.
news_csv = "../Data/raw/df_oilnews.csv"
df = pd.read_csv(filepath_or_buffer=news_csv)
df.head(n=5)

Unnamed: 0,title,Date,excerpt
0,April Price Crash Dragged Saudi Arabia’s Oil R...,2025-06-25,Saudi Arabia’s revenues from oil exports crash...
1,Giant Leviathan Gas Field Offshore Israel Resu...,2025-06-25,The massive Leviathan gas field offshore Israe...
2,China and India Cut Imports of Lower-Quality C...,2025-06-25,The world’s biggest and second-biggest coal im...
3,Iran-Israel War Prompts China to Reconsider Ru...,2025-06-25,The war between Israel and Iran has spark worr...
4,EU Set to Change Subsidy Rules for Energy Costs,2025-06-25,National governments in the EU would soon be a...


In [3]:
import torch
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer

config_path = '../Data/Models/crude_bert_config.json'
model_path = "../Data/Models/crude_bert_model.bin"

# Build the architecture from the saved config; the weights are filled in
# from the checkpoint below.
config = AutoConfig.from_pretrained(config_path)
model = AutoModelForSequenceClassification.from_config(config)

# map_location="cpu" lets a checkpoint that was saved from a GPU load on a
# CPU-only machine.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a
# trusted source (consider weights_only=True if the installed torch supports it).
state_dict = torch.load(model_path, map_location="cpu")
# Drop this key if present so it is not reported as unexpected when loading.
state_dict.pop("bert.embeddings.position_ids", None)

# strict=False tolerates key mismatches, so surface them explicitly: silently
# skipped weights would leave parts of the model at their initial values.
load_result = model.load_state_dict(state_dict, strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print(
        "WARNING: checkpoint/model key mismatch -- "
        f"missing: {list(load_result.missing_keys)}, "
        f"unexpected: {list(load_result.unexpected_keys)}"
    )

model.eval()

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")



  from .autonotebook import tqdm as notebook_tqdm
  from scipy.sparse import csr_matrix, issparse


In [10]:
def predict(df_text, batch_size=16):
    """Classify every headline in ``df_text['title']`` and store the label.

    Adds (or overwrites) a ``sentiment`` column on ``df_text`` in place and
    returns ``None``. Relies on the module-level ``model`` and ``tokenizer``.

    Parameters
    ----------
    df_text : pandas.DataFrame
        Frame with a ``title`` column. Any index is supported; results are
        assigned by row position, not by index label.
    batch_size : int, default 16
        Number of titles tokenized and scored per forward pass.
    """
    model.eval()
    texts = df_text['title'].tolist()

    # labels[i] is the name reported when logit i is the arg-max.
    # NOTE(review): order kept from the original cell -- confirm it matches
    # the class order of the loaded checkpoint.
    labels = ['positive', 'negative', 'neutral']

    predictions = []
    for start_idx in range(0, len(texts), batch_size):
        batch_texts = texts[start_idx:start_idx + batch_size]
        inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, max_length=64, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()
        predictions.extend(labels[idx] for idx in probs.argmax(axis=-1))

    # Assign the whole column at once, by position. Writing with
    # ``df.at[start_idx + j, ...]`` would treat positions as index labels and
    # append/mislabel rows whenever the index is not 0..n-1.
    df_text['sentiment'] = predictions

# Label every headline in place, then preview the frame with its new column.
predict(df_text=df)
df.head(n=5)


Unnamed: 0,title,Date,excerpt,sentiment
0,April Price Crash Dragged Saudi Arabia’s Oil R...,2025-06-25,Saudi Arabia’s revenues from oil exports crash...,positive
1,Giant Leviathan Gas Field Offshore Israel Resu...,2025-06-25,The massive Leviathan gas field offshore Israe...,negative
2,China and India Cut Imports of Lower-Quality C...,2025-06-25,The world’s biggest and second-biggest coal im...,negative
3,Iran-Israel War Prompts China to Reconsider Ru...,2025-06-25,The war between Israel and Iran has spark worr...,positive
4,EU Set to Change Subsidy Rules for Energy Costs,2025-06-25,National governments in the EU would soon be a...,negative


In [None]:
def predict(df_text, batch_size=16):
    """Classify every headline in ``df_text['title']`` with a confidence score.

    Adds (or overwrites) two columns on ``df_text`` in place and returns
    ``None``:

    * ``sentiment`` -- name of the arg-max class for each title.
    * ``score`` -- softmax probability of that arg-max class.

    Relies on the module-level ``model`` and ``tokenizer``.

    Parameters
    ----------
    df_text : pandas.DataFrame
        Frame with a ``title`` column. Any index is supported; results are
        assigned by row position, not by index label.
    batch_size : int, default 16
        Number of titles tokenized and scored per forward pass.
    """
    model.eval()
    texts = df_text['title'].tolist()

    # labels[i] is the name reported when logit i is the arg-max.
    # NOTE(review): order kept from the original cell -- confirm it matches
    # the class order of the loaded checkpoint.
    labels = ['positive', 'negative', 'neutral']

    sentiments = []
    scores = []
    for start_idx in range(0, len(texts), batch_size):
        batch_texts = texts[start_idx:start_idx + batch_size]
        inputs = tokenizer(
            batch_texts,
            return_tensors="pt",
            truncation=True,
            max_length=64,
            padding=True
        )
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()
        sentiments.extend(labels[idx] for idx in probs.argmax(axis=-1))
        # The max probability of each row is the probability of its arg-max class.
        scores.extend(probs.max(axis=-1).tolist())

    # Assign whole columns at once, by position. Writing with
    # ``df.at[start_idx + j, ...]`` would treat positions as index labels and
    # append/mislabel rows whenever the index is not 0..n-1.
    df_text['sentiment'] = sentiments
    df_text['score'] = scores

# Re-label the headlines with the score-aware predict, then preview the result.
# Ending the cell on the expression (instead of print) uses the notebook's
# rich table display.
predict(df)
df.head()


                                               title        Date  \
0  April Price Crash Dragged Saudi Arabia’s Oil R...  2025-06-25   
1  Giant Leviathan Gas Field Offshore Israel Resu...  2025-06-25   
2  China and India Cut Imports of Lower-Quality C...  2025-06-25   
3  Iran-Israel War Prompts China to Reconsider Ru...  2025-06-25   
4    EU Set to Change Subsidy Rules for Energy Costs  2025-06-25   

                                             excerpt sentiment     score  
0  Saudi Arabia’s revenues from oil exports crash...  positive  0.974808  
1  The massive Leviathan gas field offshore Israe...  negative  0.995840  
2  The world’s biggest and second-biggest coal im...  negative  0.990727  
3  The war between Israel and Iran has spark worr...  positive  0.813780  
4  National governments in the EU would soon be a...  negative  0.895602  


In [14]:
# How many headlines fall into each predicted sentiment class.
sentiment_labels = df['sentiment']
counts = sentiment_labels.value_counts()
counts

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,11816
negative,11297
neutral,307


In [11]:
# Persist the labelled headlines to the location the aggregation step reads
# from ('../Data/raw/df_sentiment.csv'). Writing to the bare filename put the
# file in the notebook's working directory, so a fresh Restart & Run All would
# read a stale or missing file.
df.to_csv('../Data/raw/df_sentiment.csv', index=False)

In [None]:
# Numeric encoding of the sentiment labels used for averaging.
mapping = dict(positive=1, neutral=0, negative=-1)

# Reload the labelled headlines with `Date` parsed as datetimes and attach the
# numeric score; labels missing from `mapping` become NaN.
sent = pd.read_csv('../Data/raw/df_sentiment.csv', parse_dates=['Date'])
sent = sent.assign(score_num=sent['sentiment'].map(mapping))

# Mean numeric sentiment per calendar date, as a two-column frame
# (`Date`, `avg_score`).
score_by_date = sent.groupby('Date')['score_num'].mean()
daily_avg = score_by_date.reset_index(name='avg_score')

daily_avg.head()
