In [26]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
import re
import json 
import pandas as pd
# Fetch the NLTK resources used by this notebook; packages that are already
# present are simply reported as up-to-date.
for resource in ('vader_lexicon', 'stopwords'):
    nltk.download(resource)

# VADER analyzer instance shared by the sentiment-scoring code
analyzer = SentimentIntensityAnalyzer()

# Function to clean text and remove stop words
_ENGLISH_STOP_WORDS = None  # filled on first use so the NLTK list is read only once


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def clean_text(text, stop_words=None):
    """Normalise a piece of text for sentiment scoring.

    Every character that is not an ASCII letter or whitespace is removed, the
    text is lower-cased, stop words are dropped, and the remaining words are
    joined with single spaces.

    Parameters
    ----------
    text : object
        The raw text. Anything that is not a ``str`` (``None``, NaN, numbers)
        yields an empty string.
    stop_words : collection of str, optional
        Lower-case words to drop. Defaults to NLTK's English stop-word list,
        which is loaded once and reused across calls.

    Returns
    -------
    str
        The cleaned text, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        # Handle None or other non-string values with an empty placeholder
        return ""

    if stop_words is None:
        stop_words = _english_stop_words()

    # The fourth positional argument of re.sub is `count`, not `flags`. Passing
    # re.I|re.A there capped the removal at 258 characters and never applied the
    # flags. No flags are needed: the class already lists both letter cases.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)

    kept_words = [word for word in letters_only.lower().split() if word not in stop_words]
    return " ".join(kept_words)


json_file = "msft_stock_news.json"

# Parse the news file; a malformed document is reported and replaced by an
# empty list instead of stopping the cell at this point.
try:
    with open(json_file, "r", encoding="utf-8") as file:
        data = json.load(file)
except json.JSONDecodeError as e:
    print(f"Error loading JSON: {e}")
    data = []

# Build the working DataFrame from the parsed JSON
df = pd.DataFrame(data)


# Derive a cleaned copy of each article's 'text' for sentiment scoring
df['cleaned_text'] = df['text'].apply(clean_text)

# Map a sentiment score onto a categorical label
def assign_sentiment_label(score):
    """Return "positive", "negative" or "neutral" for a sentiment score.

    Scores of 0.05 or more are positive, scores of -0.05 or less are negative,
    and anything strictly in between is neutral.
    """
    if score >= 0.05:
        return "positive"
    if score <= -0.05:
        return "negative"
    return "neutral"


# Compound sentiment score for a piece of text
def get_sentiment_score(text):
    """Return the 'compound' entry of the analyzer's polarity scores for ``text``."""
    return analyzer.polarity_scores(text)['compound']

# Score every cleaned article ('sentiment_score'), then bucket that score
# into a label ('sentiment_label')
df['sentiment_score'] = df['cleaned_text'].apply(get_sentiment_score)
df['sentiment_label'] = df['sentiment_score'].apply(assign_sentiment_label)

# Parse the publication timestamps so they can drive time-based windows
df['publishedDate'] = pd.to_datetime(df['publishedDate'])

# Order rows by symbol and time, then use the timestamp as the index
df = df.sort_values(['symbol', 'publishedDate']).set_index('publishedDate')

df.head(2)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\shekh\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shekh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0_level_0,symbol,title,image,site,text,url,cleaned_text,sentiment_score,sentiment_label
publishedDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-04-16 17:52:00,MSFT,Microsoft launches $250 Xbox with no disc drive,https://images.financialmodelingprep.com/news/...,cnbc.com,Microsoft launches $250 Xbox console with no d...,https://www.cnbc.com/2019/04/16/microsoft-250-...,microsoft launches xbox console disc drive,0.0,neutral
2019-04-17 14:43:50,MSFT,Why Microsoft Remains A Better Buy Over Amazon,https://images.financialmodelingprep.com/news/...,seekingalpha.com,Microsoft's valuation is reasonable given its ...,https://seekingalpha.com/article/4255018-micro...,microsofts valuation reasonable given historic...,-0.0516,negative


In [None]:
# Rolling 24-hour news features per symbol.
#
# The time-based windows below need 'publishedDate' as the index. This cell
# finishes by moving it back into a column, so restore the index first when the
# cell is re-run; otherwise rolling('24h') would fail on the second execution.
if df.index.name != 'publishedDate':
    df = df.set_index('publishedDate')

by_symbol = df.groupby('symbol')
sentiment_24h = by_symbol['sentiment_score'].rolling('24h')

# Compute every feature before assigning any, so the grouped frame is not
# modified while it is still being used.
rolling_features = {
    # Number of articles (non-null titles) in the trailing 24 hours
    'news_count_last_24h': by_symbol['title'].rolling('24h').count(),
    # Average sentiment score over the trailing 24 hours
    'average_sentiment_last_24h': sentiment_24h.mean(),
    # Sentiment volatility (standard deviation) over the trailing 24 hours;
    # NaN when the window holds a single article
    'news_volatility_impact': sentiment_24h.std(),
}

for column, values in rolling_features.items():
    # Drop the 'symbol' level added by the groupby so values align on publishedDate
    df[column] = values.reset_index(level=0, drop=True)


df = df.reset_index()

Unnamed: 0,publishedDate,symbol,title,image,site,text,url,cleaned_text,sentiment_score,sentiment_label,news_count_last_24h,average_sentiment_last_24h,news_volatility_impact
0,2019-04-16 17:52:00,MSFT,Microsoft launches $250 Xbox with no disc drive,https://images.financialmodelingprep.com/news/...,cnbc.com,Microsoft launches $250 Xbox console with no d...,https://www.cnbc.com/2019/04/16/microsoft-250-...,microsoft launches xbox console disc drive,0.0000,neutral,1.0,0.000000,
1,2019-04-17 14:43:50,MSFT,Why Microsoft Remains A Better Buy Over Amazon,https://images.financialmodelingprep.com/news/...,seekingalpha.com,Microsoft's valuation is reasonable given its ...,https://seekingalpha.com/article/4255018-micro...,microsofts valuation reasonable given historic...,-0.0516,negative,2.0,-0.025800,0.036487
2,2019-04-18 12:51:00,MSFT,Microsoft acquires Express Logic to help its p...,https://images.financialmodelingprep.com/news/...,cnbc.com,Express Logic says it has 6.2 billion deployme...,https://www.cnbc.com/2019/04/18/microsoft-acqu...,express logic says billion deployments threadx...,0.0000,neutral,2.0,-0.025800,0.036487
3,2019-04-19 10:10:09,MSFT,Can Microsoft (MSFT) Keep the Earnings Surpris...,https://images.financialmodelingprep.com/news/...,zacks.com,Microsoft (MSFT) has an impressive earnings su...,https://www.zacks.com/stock/news/389007/can-mi...,microsoft msft impressive earnings surprise hi...,0.6597,positive,2.0,0.329850,0.466478
4,2019-04-21 12:20:00,MSFT,Microsoft Makes an IoT Push With a New Acquisi...,https://images.financialmodelingprep.com/news/...,fool.com,"What is Express Logic, and why did Microsoft j...",https://www.fool.com/investing/2019/04/21/micr...,express logic microsoft buy,0.0000,neutral,1.0,0.000000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1435,2025-04-04 22:08:09,MSFT,Microsoft AI CEO's remarks interrupted by pro-...,https://images.financialmodelingprep.com/news/...,reuters.com,Microsoft AI CEO Mustafa Suleyman's remarks we...,https://www.reuters.com/world/us/microsoft-ai-...,microsoft ai ceo mustafa suleymans remarks int...,-0.6124,negative,16.0,0.268288,0.517354
1436,2025-04-05 08:05:00,MSFT,6 Dividend Growth Stocks I'm Buying As Tariffs...,https://images.financialmodelingprep.com/news/...,seekingalpha.com,President Trump's tariffs are not negotiating ...,https://seekingalpha.com/article/4772932-6-div...,president trumps tariffs negotiating tactics p...,-0.5267,negative,17.0,0.221524,0.536753
1437,2025-04-05 08:56:45,MSFT,Microsoft: A Defensive Buy In An Uncertain Tar...,https://images.financialmodelingprep.com/news/...,seekingalpha.com,Microsoft remains a defensive buy due to its d...,https://seekingalpha.com/article/4773187-micro...,microsoft remains defensive buy due diversifie...,0.8646,positive,17.0,0.264853,0.558040
1438,2025-04-06 07:00:00,MSFT,3 Beaten-Down Tech Stocks to Buy That Have Rai...,https://images.financialmodelingprep.com/news/...,fool.com,Technology has become the single most importan...,https://www.fool.com/investing/2025/04/06/3-be...,technology become single important sector us s...,0.2023,positive,3.0,0.180067,0.695916
