# Enhanced Sentiment Analysis: Outperforming NLTK, FinBERT, and TextBlob with Hybrid Word Embeddings and VADER Integration

In [199]:
import pandas as pd

# Load the financial news dataset and discard rows whose 'Headlines' value is
# missing; every later step reads that column.
df = pd.read_csv('cnbc_headlines.csv').dropna(subset=['Headlines'])

# Display the first few rows of the dataset
df.head()

Unnamed: 0,Headlines,Time,Description
0,Jim Cramer: A better way to invest in the Covi...,"7:51 PM ET Fri, 17 July 2020","""Mad Money"" host Jim Cramer recommended buying..."
1,Cramer's lightning round: I would own Teradyne,"7:33 PM ET Fri, 17 July 2020","""Mad Money"" host Jim Cramer rings the lightnin..."
3,"Cramer's week ahead: Big week for earnings, ev...","7:25 PM ET Fri, 17 July 2020","""We'll pay more for the earnings of the non-Co..."
4,IQ Capital CEO Keith Bliss says tech and healt...,"4:24 PM ET Fri, 17 July 2020","Keith Bliss, IQ Capital CEO, joins ""Closing Be..."
5,Wall Street delivered the 'kind of pullback I'...,"7:36 PM ET Thu, 16 July 2020","""Look for the stocks of high-quality companies..."


In [200]:
import re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus (nltk only reports "already up-to-date" when
# it is present locally) and keep the English entries as a set, which is what
# the membership test in preprocess_text relies on.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

# Text preprocessing function
def preprocess_text(text):
    """Return a lower-cased, stop-word-free version of ``text``.

    Steps, in order:
      1. delete anything that starts with ``http`` up to the next whitespace;
      2. delete every character that is not an ASCII letter or whitespace
         (digits, punctuation and apostrophes disappear without leaving a
         space, so "Cramer's" becomes "cramers");
      3. lower-case the result;
      4. drop tokens found in the module-level ``stop_words`` set and join the
         remaining tokens with single spaces.
    """
    without_urls = re.sub(r"http\S+", "", text)
    letters_only = re.sub(r"[^a-zA-Z\s]", "", without_urls)

    kept_tokens = []
    for token in letters_only.lower().split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Clean every entry of the 'Headlines' column and keep the result alongside it
df['Cleaned_Content'] = df['Headlines'].map(preprocess_text)
df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/simranjeetsingh1497/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Headlines,Time,Description,Cleaned_Content
0,Jim Cramer: A better way to invest in the Covi...,"7:51 PM ET Fri, 17 July 2020","""Mad Money"" host Jim Cramer recommended buying...",jim cramer better way invest covid vaccine gol...
1,Cramer's lightning round: I would own Teradyne,"7:33 PM ET Fri, 17 July 2020","""Mad Money"" host Jim Cramer rings the lightnin...",cramers lightning round would teradyne
3,"Cramer's week ahead: Big week for earnings, ev...","7:25 PM ET Fri, 17 July 2020","""We'll pay more for the earnings of the non-Co...",cramers week ahead big week earnings even bigg...
4,IQ Capital CEO Keith Bliss says tech and healt...,"4:24 PM ET Fri, 17 July 2020","Keith Bliss, IQ Capital CEO, joins ""Closing Be...",iq capital ceo keith bliss says tech healthcar...
5,Wall Street delivered the 'kind of pullback I'...,"7:36 PM ET Thu, 16 July 2020","""Look for the stocks of high-quality companies...",wall street delivered kind pullback ive waitin...


In [212]:
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk import pos_tag
from nltk.tokenize import word_tokenize



# Domain-specific sentiment lexicon: word -> signed weight (positive values
# count towards "Positive", negative values towards "Negative", 0.0 is neutral).
#
# Keys must be lower-case: enhanced_sentiment_analysis lower-cases each token
# before looking it up, and the adjective/verb lists come from lower-cased text,
# so a capitalised key (formerly 'Conagra') could never be matched.
#
# NOTE(review): lookups are done one token at a time, so the multi-word keys
# ('stay long', 'turning point', 'contact tracing') and keys containing an
# apostrophe ("can't") are presumably never hit on cleaned text -- confirm and
# either add phrase matching or drop them.
custom_lexicon = {
    'good': 1.0,
    'great': 1.5,
    'excellent': 1.5,
    'bad': -1.0,
    'poor': -1.5,
    'big': 0.5,
    'week': 0.0,
    'earnings': 0.0,
    'stay long': 1.0,
    'turn': 0.5,
    'turning point': 1.0,
    'concern': -0.5,
    'uncertainty': -1.0,
    'buy': 1.0,
    'lands': 1.0,
    'invested': 1.0,
    'bullish': 1.0,
    'unusual': -0.5,
    'authenticated': 0.5,
    'conagra': 0.1,
    'revamping': 0.1,
    'money': 0.5,
    'unprecedented': -0.5,
    'using': 0.5,
    'technology': 0.5,
    'effective': 1.0,
    'successful': 1.5,
    'beneficial': 1.5,
    'coronavirus': 0.0,  # Neutral in this context
    'contact tracing': 1.0,
    'surged': 1.0,
    'restrictions': -0.5,
    'recovery': 0.5,
    'thumbs': 0.4,
    'mirror': 0.5,
    'acceleration': 0.5,
    'highs': 0.1,
    'higher': 0.4,
    'wait': 0.3,
    'bull': 0.1,
    'golden': 0.1,
    'normal': 0.1,
    'reacts': 0.1,
    'despite': -0.1,
    'overbought': 0.1,
    'cautions': 0.5,
    "can't": -0.9,
    "without": -0.1,
    'helped': 0.1,
    'record': 0.1,
    'surpassed': 0.3,
    'diverse': 0.1,
    'behind': -0.1,
    'fits': 0.3,
    'rotating': -0.3,
    'leading': 0.8,
    'rebound': 0.3,
    'bailed': -0.1,
    'out': -0.3,
    'rough': -0.5,
    'deal': 0.2,
    'minimal': 0.1,
    'most': 0.09,
    'legit': 0.1,
    'lasting': 0.1,
    'prefer': 0.01,
    'latest': 0.1,
    'lead': 0.1,
    'powering': 0.1,
    'upside': 0.1,
    'more': 0.01,
}

# Function to extract adjectives and verbs
def extract_adj_verbs(text):
    """Split the POS-tagged tokens of ``text`` into adjectives and verbs.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``.
    Tokens tagged exactly 'JJ', 'JJR' or 'JJS' are collected as adjectives;
    tokens whose tag starts with 'VB' are collected as verbs.

    Returns:
        A ``(adjectives, verbs)`` tuple of two lists, each in token order.
    """
    adjective_tags = ('JJ', 'JJR', 'JJS')
    adjectives = []
    verbs = []

    for token, tag in pos_tag(word_tokenize(text)):
        if tag in adjective_tags:
            adjectives.append(token)
        if tag.startswith('VB'):
            verbs.append(token)

    return adjectives, verbs
    
# Run the extractor on every cleaned headline and split the resulting
# (adjectives, verbs) pairs into two parallel columns
adjective_lists, verb_lists = zip(*map(extract_adj_verbs, df['Cleaned_Content']))
df['Adjectives'] = adjective_lists
df['Verbs'] = verb_lists

# Display the headlines next to the extracted adjectives and verbs
df[['Headlines', 'Cleaned_Content', 'Adjectives', 'Verbs']]

Unnamed: 0,Headlines,Cleaned_Content,Adjectives,Verbs
0,Jim Cramer: A better way to invest in the Covi...,jim cramer better way invest covid vaccine gol...,[invest],[]
1,Cramer's lightning round: I would own Teradyne,cramers lightning round would teradyne,[],"[lightning, teradyne]"
3,"Cramer's week ahead: Big week for earnings, ev...",cramers week ahead big week earnings even bigg...,"[big, bigger]",[]
4,IQ Capital CEO Keith Bliss says tech and healt...,iq capital ceo keith bliss says tech healthcar...,[iq],[says]
5,Wall Street delivered the 'kind of pullback I'...,wall street delivered kind pullback ive waitin...,"[wall, ive]","[delivered, waiting, says]"
...,...,...,...,...
3075,Markets lack Christmas cheer,markets lack christmas cheer,[],[lack]
3076,Cramer Remix: The biggest mistake you can make...,cramer remix biggest mistake make taxes stock ...,[biggest],[]
3077,Cramer says owning too many stocks and too lit...,cramer says owning many stocks little cash set...,"[many, little]","[says, owning, set]"
3078,Cramer: I helped investors through the 2010 fl...,cramer helped investors flash crash following ...,"[flash, key]","[helped, following]"


In [213]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Checkpoint shared by the FinBERT tokenizer and the classification model
FINBERT_CHECKPOINT = "yiyanghkust/finbert-tone"

tokenizer = AutoTokenizer.from_pretrained(FINBERT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT)

# Sentiment-analysis pipeline built from the two objects above. No `device`
# argument is passed, so (as the warning in the cell output states) the model
# runs on CPU.
nlp = pipeline(task="sentiment-analysis", model=model, tokenizer=tokenizer)

def enhanced_sentiment_analysis(text, adjectives, verbs, finbert_sentiment):
    """Label a headline from lexicon scores, falling back to FinBERT's label.

    Every token of ``text`` is lower-cased and looked up in the module-level
    ``custom_lexicon``. The words in ``adjectives`` and ``verbs`` are then
    looked up as well, so a lexicon word that is also listed as an adjective or
    verb contributes twice. Positive and negative magnitudes are accumulated
    separately and compared at the end.

    Args:
        text: Headline text; tokenized with ``word_tokenize``.
        adjectives: Iterable of adjective strings extracted from the headline.
        verbs: Iterable of verb strings extracted from the headline.
        finbert_sentiment: Label returned unchanged when the lexicon scores tie
            (which includes the case where no lexicon word is found).

    Returns:
        "Positive" if the positive total is larger, "Negative" if the negative
        total is larger, otherwise ``finbert_sentiment``.

    Notes:
        * A token equal to "not" flips the sign of the score of the single token
          that follows it. This only applies to the token pass; the
          adjective/verb pass is never negated.
        * Adjective/verb lookups are case-insensitive, like the token lookups.
        * NOTE(review): preprocess_text removes NLTK stop words; if that list
          contains "not", the negation branch cannot fire on cleaned text --
          confirm.
    """
    positive_score = 0
    negative_score = 0
    negate_next = False

    # Pass 1: every token of the headline, with single-token "not" negation.
    for token in word_tokenize(text):
        token = token.lower()
        if token == "not":
            negate_next = True
            continue

        score = custom_lexicon.get(token, 0)
        if negate_next:
            score = -score
            negate_next = False

        if score > 0:
            positive_score += score
        elif score < 0:
            negative_score += -score

    # Pass 2: adjectives first, then verbs (same accumulation order as before),
    # giving those parts of speech extra weight.
    for word in list(adjectives) + list(verbs):
        score = custom_lexicon.get(word.lower(), 0)
        if score > 0:
            positive_score += score
        elif score < 0:
            negative_score += -score

    # The lexicon overrides FinBERT only when it has a strict majority.
    if positive_score > negative_score:
        return "Positive"
    if negative_score > positive_score:
        return "Negative"
    return finbert_sentiment

# Extract adjectives and verbs (same computation as the earlier extraction cell)
adjective_lists, verb_lists = zip(*map(extract_adj_verbs, df['Cleaned_Content']))
df['Adjectives'] = adjective_lists
df['Verbs'] = verb_lists

# FinBERT label per cleaned headline: the pipeline is called once per headline
# and the 'label' of its first prediction is kept
df['FinBERT_Sentiment'] = [nlp(headline)[0]['label'] for headline in df['Cleaned_Content']]

# Combine the lexicon scores with the FinBERT label, one row at a time
df['Enhanced_NLTK_Sentiment'] = [
    enhanced_sentiment_analysis(cleaned, adjectives, verbs, finbert_label)
    for cleaned, adjectives, verbs, finbert_label in zip(
        df['Cleaned_Content'], df['Adjectives'], df['Verbs'], df['FinBERT_Sentiment']
    )
]

# Display the DataFrame with the new sentiment columns
df[['Headlines', 'Cleaned_Content', 'FinBERT_Sentiment', 'Enhanced_NLTK_Sentiment']]


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Unnamed: 0,Headlines,Cleaned_Content,FinBERT_Sentiment,Enhanced_NLTK_Sentiment
0,Jim Cramer: A better way to invest in the Covi...,jim cramer better way invest covid vaccine gol...,Positive,Positive
1,Cramer's lightning round: I would own Teradyne,cramers lightning round would teradyne,Neutral,Neutral
3,"Cramer's week ahead: Big week for earnings, ev...",cramers week ahead big week earnings even bigg...,Neutral,Positive
4,IQ Capital CEO Keith Bliss says tech and healt...,iq capital ceo keith bliss says tech healthcar...,Neutral,Neutral
5,Wall Street delivered the 'kind of pullback I'...,wall street delivered kind pullback ive waitin...,Neutral,Neutral
...,...,...,...,...
3075,Markets lack Christmas cheer,markets lack christmas cheer,Negative,Negative
3076,Cramer Remix: The biggest mistake you can make...,cramer remix biggest mistake make taxes stock ...,Negative,Negative
3077,Cramer says owning too many stocks and too lit...,cramer says owning many stocks little cash set...,Negative,Negative
3078,Cramer: I helped investors through the 2010 fl...,cramer helped investors flash crash following ...,Positive,Positive


In [214]:
# Tally how many headlines carry each label in 'Enhanced_NLTK_Sentiment'
df['Enhanced_NLTK_Sentiment'].value_counts()

Enhanced_NLTK_Sentiment
Neutral     1224
Positive    1133
Negative     443
Name: count, dtype: int64

In [224]:
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Make sure the VADER lexicon is available, then create the analyzer that
# exposes it
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# Load pre-trained GloVe vectors from the text file 'glove.6B.100d.txt'
# (relative path; download from the Stanford GloVe site if needed). The file is
# read as word2vec text format without a header line.
word_vectors = KeyedVectors.load_word2vec_format('glove.6B.100d.txt', binary=False, no_header=True)

# Keep only the VADER lexicon entries that also have an embedding
vader_lexicon = sia.lexicon
vader_words = dict(
    (term, word_vectors[term]) for term in vader_lexicon if term in word_vectors
)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/simranjeetsingh1497/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
# VADER lexicon terms that also exist in the embedding model, in lexicon order
vader_lexicon = sia.lexicon
vader_word_list = [term for term in vader_lexicon if term in word_vectors]
vader_words = {term: word_vectors[term] for term in vader_word_list}

# Embeddings as one array so similarities against all VADER terms can be
# computed in a single call; row i belongs to vader_word_list[i]
vader_matrix = np.array([vader_words[term] for term in vader_word_list])

# Memo of word -> score filled in by get_closest_vader_score
oov_cache = dict()

# Function to find the closest VADER word using cosine similarity
def get_closest_vader_score(word):
    """Return the VADER score of the lexicon term most similar to ``word``.

    Results are memoised in the module-level ``oov_cache``. For a word that has
    an embedding in ``word_vectors``, the cosine similarity against every row of
    ``vader_matrix`` is computed and the ``vader_lexicon`` score of the
    best-matching term in ``vader_word_list`` is returned. There is no minimum
    similarity: the best match is used however weak it is. A word without an
    embedding scores 0.
    """
    if word not in oov_cache:
        if word not in word_vectors:
            # Neutral score for words without embeddings
            nearest_score = 0
        else:
            query = word_vectors[word].reshape(1, -1)
            # One similarity per VADER term, computed in a single call
            sims = cosine_similarity(query, vader_matrix).flatten()
            best_term = vader_word_list[np.argmax(sims)]
            nearest_score = vader_lexicon[best_term]

        # Remember the result so the word is never scored twice
        oov_cache[word] = nearest_score

    return oov_cache[word]


# Second pass over the rows that are still 'Neutral': average the nearest-VADER
# score of every word in the cleaned headline and relabel the row when that
# average clears a threshold.
# NOTE(review): +/-0.05 looks like the cut-off commonly used for VADER's
# compound score; here it is applied to a mean of per-word lexicon scores --
# confirm that is intended.
neutral_rows = df[df['Enhanced_NLTK_Sentiment'] == 'Neutral'].index

for index in neutral_rows:
    words = df.loc[index, 'Cleaned_Content'].split()
    if not words:
        # Nothing to score (cleaned headline is empty). Skipping keeps the row
        # 'Neutral' and avoids np.mean([]) returning nan with a RuntimeWarning.
        continue

    scores = [get_closest_vader_score(word) for word in words]
    avg_score = np.mean(scores)  # Average score of words

    # Classify based on score; anything in between keeps its 'Neutral' label
    if avg_score >= 0.05:
        df.loc[index, 'Enhanced_NLTK_Sentiment'] = 'Positive'
    elif avg_score <= -0.05:
        df.loc[index, 'Enhanced_NLTK_Sentiment'] = 'Negative'

In [242]:
# Headline text of the fifth row (position 4) currently labelled 'Negative'
df.loc[df['Enhanced_NLTK_Sentiment'] == 'Negative', 'Headlines'].iloc[4]

'Charts suggest the S&P 500 climb will stall out at the end of July, Jim Cramer warns'

In [230]:
# Tally the labels in 'Enhanced_NLTK_Sentiment' again; in notebook order this
# cell comes after the embedding-based relabelling of 'Neutral' rows
df['Enhanced_NLTK_Sentiment'].value_counts()

Enhanced_NLTK_Sentiment
Positive    2124
Negative     616
Neutral       60
Name: count, dtype: int64