In [None]:
import json
import os

import numpy as np
import pandas as pd
import requests
from openai import AzureOpenAI
from tqdm import tqdm

In [None]:
# Load the two pre-processed NVDA inputs: news articles first, then the
# matching daily price series.
news_df, stock_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data/processed/7-30-2024_to_7-21-2025_nvda_news.csv",
        "../../data/processed/7-30-2024_to_7-21-2025_nvda_stock.csv",
    )
)

In [None]:
# Renumber the rows 0..n-1, discarding whatever index the frame had, and
# display the result.
news_df = news_df.reset_index(drop=True)
news_df

Unnamed: 0,datetime,headline,summary,assigned_date
0,2024-07-30 14:00:00,Down Between 17% and 35% From Their 52-Week Hi...,There are plenty of different ways to invest i...,2024-07-30
1,2024-07-30 14:15:00,1 Top Artificial Intelligence (AI) Stock Billi...,Some hedge funds have been selling Nvidia and ...,2024-07-30
2,2024-07-30 14:16:15,"Stock market news today: Nasdaq sinks, Nvidia ...",A packed day of earnings and the start of the ...,2024-07-30
3,2024-07-30 14:26:00,More Big Tech Earnings Are Coming. What’s Next...,Microsoft will kick off this week’s Big Tech e...,2024-07-30
4,2024-07-30 14:54:00,"Sensata (ST) Q2 Earnings Meet Estimates, Reven...",Sensata (ST) second-quarter revenues are drive...,2024-07-30
...,...,...,...,...
11055,2025-07-21 13:05:27,Digi Power X to Raise US$15 Million in Direct ...,"Digi Power X (Nasdaq: DGXX and TSXV: DGX), an ...",2025-07-21
11056,2025-07-21 13:22:00,WeRide Teams Up With Lenovo to Launch 100% Aut...,"GUANGZHOU, China, July 21, 2025 (GLOBE NEWSWIR...",2025-07-21
11057,2025-07-21 13:39:19,Amazon Investors Search for Signs of AI Lift W...,(Bloomberg) -- Aggressive spending on artifici...,2025-07-21
11058,2025-07-21 01:30:00,Nvidia and Broadcom: Here's How These Top AI S...,Nvidia and Broadcom both have reported soaring...,2025-07-21


In [None]:
# Display the price table (one row per date) for a quick visual check.
stock_df

Unnamed: 0,date,price
0,2024-07-30,103.697243
1,2024-07-31,116.983040
2,2024-08-01,109.175514
3,2024-08-02,107.236115
4,2024-08-05,100.418274
...,...,...
239,2025-07-15,170.699997
240,2025-07-16,171.369995
241,2025-07-17,173.000000
242,2025-07-18,172.410004


### Limit each day to at most 8 news articles


In [None]:
# Keep only the first 8 rows of every `assigned_date` group (rows stay in
# their existing order), renumber them from zero, and report the new shape.
articles_by_day = news_df.groupby("assigned_date", group_keys=False)
news_df = articles_by_day.head(8).reset_index(drop=True).copy()
news_df.shape

(2232, 4)

In [None]:
# Name under which the model deployment is created and later queried.
model_deployment_name = "gpt41-mini-news-sentiment"

# Credentials and resource identifiers are read from the environment; each
# value is None when its variable is not set.
token = os.environ.get("TOKEN")
subscription = os.environ.get("SUBSCRIPTION_ID")
resource_group = os.environ.get("RESOURCE_GROUP")
resource_name = os.environ.get("RESOURCE_NAME")

In [None]:
# Headers for the deployment request: bearer-token auth with a JSON body.
deploy_headers = {
    "Authorization": f"Bearer {token}",
    "Content-Type": "application/json",
}

# Query string pinning the API version used for the deployment call.
deploy_params = {"api-version": "2025-04-01-preview"}

In [None]:
# Model to deploy, identified by format, name and version.
model_spec = {
    "format": "OpenAI",
    "name": "gpt-4.1-mini-2025-04-14.ft-9e2dec8fc72c499da71bc5668f52a1be-001",
    "version": "1",
}

# Full request body: SKU/capacity plus the model specification.
deploy_payload = {
    "sku": {"name": "developertier", "capacity": 120},
    "properties": {"model": model_spec},
}

# The request sends the body as a JSON string, so serialize it here.
deploy_data = json.dumps(deploy_payload)

In [None]:
# Management endpoint for the deployment, assembled from the subscription,
# resource group, account and deployment name configured earlier.
request_url = (
    "https://management.azure.com"
    f"/subscriptions/{subscription}"
    f"/resourceGroups/{resource_group}"
    "/providers/Microsoft.CognitiveServices"
    f"/accounts/{resource_name}"
    f"/deployments/{model_deployment_name}"
)

print("Creating a new deployment...")

# An explicit timeout keeps a stalled connection from hanging the notebook
# forever (requests waits indefinitely when no timeout is given).
r = requests.put(
    request_url,
    params=deploy_params,
    headers=deploy_headers,
    data=deploy_data,
    timeout=60,
)

# NOTE: the response body echoes the subscription ID and the creating user's
# e-mail address; clear this cell's output before sharing the notebook.
print(r)
print(r.reason)
print(r.json())

# Stop here on a 4xx/5xx answer instead of continuing into the long
# classification run against a deployment that was never created. The
# diagnostics above are printed first so the failure reason stays visible.
r.raise_for_status()

Creating a new deployment...
<Response [201]>
Created
{'id': '/subscriptions/b5a4f554-5572-486f-8de8-7624c85141b4/resourceGroups/SentimentLLM-UTD-Sweden/providers/Microsoft.CognitiveServices/accounts/SentimentLLM-UTD-Sweden/deployments/gpt41-mini-news-sentiment', 'type': 'Microsoft.CognitiveServices/accounts/deployments', 'name': 'gpt41-mini-news-sentiment', 'sku': {'name': 'developertier', 'capacity': 120}, 'properties': {'model': {'format': 'OpenAI', 'name': 'gpt-4.1-mini-2025-04-14.ft-9e2dec8fc72c499da71bc5668f52a1be-001', 'version': '1'}, 'versionUpgradeOption': 'NoAutoUpgrade', 'currentCapacity': 120, 'capabilities': {'chatCompletion': 'true', 'area': 'EUR', 'responses': 'true', 'assistants': 'true'}, 'provisioningState': 'Creating', 'rateLimits': [{'key': 'request', 'renewalPeriod': 60, 'count': 120}, {'key': 'token', 'renewalPeriod': 60, 'count': 120000}]}, 'systemData': {'createdBy': 'Priyanath_Maji@bcbstx.com', 'createdByType': 'User', 'createdAt': '2025-07-23T18:20:10.3293973

In [None]:
# System message placed ahead of every article sent to the classifier.
system_prompt = dict(
    role="system",
    content="You are a financial sentiment classifier. Respond with one word: neutral, positive, or negative.",
)

# Numeric score assigned to each label the prompt asks for.
sentiment_to_score = dict(positive=1.0, neutral=0.0, negative=-1.0)

# Accumulators for the per-article results.
sentiment_scores = []
sentiment_labels = []

In [None]:
# Client used for the chat-completion calls; endpoint and key are taken from
# the environment so no secret is stored in the notebook.
client = AzureOpenAI(
    api_version="2024-12-01-preview",
    api_key=os.environ.get("AZURE_API_KEY"),
    azure_endpoint=os.environ.get("AZURE_ENDPOINT"),
)

In [None]:
# Start from empty result lists every time this cell runs. Appending to lists
# left over from a previous (possibly interrupted) run would leave more labels
# than rows in `news_df` and break the column assignment that follows.
sentiment_labels, sentiment_scores = [], []

# Classify each article summary with one chat-completion call.
for summary in tqdm(news_df["summary"].tolist(), desc="Classifying sentiment"):
    response = client.chat.completions.create(
        model=model_deployment_name,
        messages=[
            system_prompt,
            {
                "role": "user",
                "content": f"Classify the sentiment of this sentence: {summary}",
            },
        ],
        # The answer is requested as a single word, so one token is allowed.
        max_completion_tokens=1,
    )

    # `content` is optional in the API response; treat a missing value as an
    # empty label instead of crashing part-way through a long run.
    raw_label = response.choices[0].message.content or ""
    sentiment = raw_label.strip().lower()

    sentiment_labels.append(sentiment)
    # Labels outside the expected three (including empty) score as neutral.
    sentiment_scores.append(sentiment_to_score.get(sentiment, 0.0))

Classifying sentiment: 100%|██████████| 2232/2232 [41:17<00:00,  1.11s/it]    


In [None]:
# Persist the raw labels, one per line, so the classification does not have
# to be repeated.
labels_path = "../../models/gpt-4.1/gpt-4.1-mini-ft-news-sentiment.txt"

# Create the target directory if needed; otherwise a missing folder would make
# the write fail after the expensive classification run has already finished.
os.makedirs(os.path.dirname(labels_path), exist_ok=True)

# Explicit UTF-8 keeps the file independent of the platform default encoding
# and able to hold whatever text the model returned.
with open(labels_path, "w", encoding="utf-8") as f:
    f.writelines(label + "\n" for label in sentiment_labels)

In [None]:
# Attach the per-article label and its numeric score as two new columns.
news_df = news_df.assign(
    sentiment_label=sentiment_labels,
    sentiment_score=sentiment_scores,
)

In [None]:
# Spot-check the first rows, including the new sentiment label and score columns.
news_df.head()

Unnamed: 0,datetime,headline,summary,assigned_date,sentiment_label,sentiment_score
0,2024-07-30 14:00:00,Down Between 17% and 35% From Their 52-Week Hi...,There are plenty of different ways to invest i...,2024-07-30,neutral,0.0
1,2024-07-30 14:15:00,1 Top Artificial Intelligence (AI) Stock Billi...,Some hedge funds have been selling Nvidia and ...,2024-07-30,neutral,0.0
2,2024-07-30 14:16:15,"Stock market news today: Nasdaq sinks, Nvidia ...",A packed day of earnings and the start of the ...,2024-07-30,neutral,0.0
3,2024-07-30 14:26:00,More Big Tech Earnings Are Coming. What’s Next...,Microsoft will kick off this week’s Big Tech e...,2024-07-30,neutral,0.0
4,2024-07-30 14:54:00,"Sensata (ST) Q2 Earnings Meet Estimates, Reven...",Sensata (ST) second-quarter revenues are drive...,2024-07-30,positive,1.0


In [None]:
# Average the article scores within each assigned date.
mean_score_by_day = news_df.groupby("assigned_date")["sentiment_score"].mean()

# Back to a two-column frame named for the later join with prices.
daily_sentiment = mean_score_by_day.reset_index().rename(
    columns={"assigned_date": "date", "sentiment_score": "avg_sentiment"}
)

In [None]:
# Preview the first daily averages.
daily_sentiment.head()

Unnamed: 0,date,avg_sentiment
0,2024-07-30,0.0
1,2024-07-31,0.25
2,2024-08-01,0.0
3,2024-08-02,-0.375
4,2024-08-05,-1.0


In [None]:
# Parse the dates and strip any time-of-day component.
daily_sentiment["date"] = pd.to_datetime(daily_sentiment["date"]).dt.normalize()

# Every calendar day between the first and last day that has news.
first_day, last_day = daily_sentiment["date"].min(), daily_sentiment["date"].max()
full_dates = pd.date_range(start=first_day, end=last_day)

# Calendar days for which no article was recorded.
missing_dates = full_dates.difference(daily_sentiment["date"])

print(f"Missing {len(missing_dates)} days with no news:")
print(missing_dates)

Missing 66 days with no news:
DatetimeIndex(['2024-08-03', '2024-08-04', '2024-08-10', '2024-08-11',
               '2024-08-17', '2024-08-18', '2024-08-24', '2024-08-25',
               '2024-08-26', '2024-08-27', '2024-08-31', '2024-09-01',
               '2024-09-02', '2024-09-07', '2024-09-08', '2024-09-09',
               '2024-09-14', '2024-09-15', '2024-09-16', '2024-09-21',
               '2024-09-22', '2024-09-28', '2024-09-29', '2024-10-05',
               '2024-10-06', '2024-10-07', '2024-10-11', '2024-10-12',
               '2024-10-13', '2024-10-14', '2024-10-19', '2024-10-20',
               '2024-10-26', '2024-10-27', '2024-11-16', '2024-11-17',
               '2024-11-18', '2024-11-19', '2025-01-04', '2025-01-05',
               '2025-03-15', '2025-03-22', '2025-03-23', '2025-03-24',
               '2025-03-29', '2025-03-30', '2025-04-05', '2025-04-06',
               '2025-04-12', '2025-04-13', '2025-04-14', '2025-05-10',
               '2025-05-17', '2025-05-18', '202

In [None]:
# Expand to one row per calendar day; days without news come back as NaN.
daily_sentiment = daily_sentiment.set_index("date").reindex(full_dates)

# Days with no news are treated as neutral (0.0).
daily_sentiment["avg_sentiment"] = daily_sentiment["avg_sentiment"].fillna(0.0)

# Restore `date` as an ordinary column.
daily_sentiment = daily_sentiment.rename_axis("date").reset_index()

In [None]:
# Show the first six rows, which now include the neutral-filled days without news.
daily_sentiment.head(6)

Unnamed: 0,date,avg_sentiment
0,2024-07-30,0.0
1,2024-07-31,0.25
2,2024-08-01,0.0
3,2024-08-02,-0.375
4,2024-08-03,0.0
5,2024-08-04,0.0


In [None]:
# Parse the price dates and drop any time component so they line up with the
# sentiment dates when joined.
stock_df = stock_df.assign(
    date=lambda prices: pd.to_datetime(prices["date"]).dt.normalize()
)

In [None]:
# Left-join prices onto the full calendar of sentiment rows, so every calendar
# day is kept even when no price exists for it.
combined_df = daily_sentiment.merge(stock_df, how="left", on="date")

# Non-trading days (e.g. weekends, holidays) carry the last known price forward.
combined_df = combined_df.assign(price=lambda merged: merged["price"].ffill())

In [None]:
# Inspect the merged frame: one row per calendar day with sentiment and price.
combined_df

Unnamed: 0,date,avg_sentiment,price
0,2024-07-30,0.000,103.697243
1,2024-07-31,0.250,116.983040
2,2024-08-01,0.000,109.175514
3,2024-08-02,-0.375,107.236115
4,2024-08-03,0.000,107.236115
...,...,...,...
352,2025-07-17,0.375,173.000000
353,2025-07-18,0.125,172.410004
354,2025-07-19,0.125,172.410004
355,2025-07-20,0.500,172.410004


In [None]:
# Write the merged daily table to disk without the positional index column.
output_csv = "../../data/processed/prices_and_sentiments.csv"
combined_df.to_csv(output_csv, index=False)