In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import re
from datetime import datetime, timedelta
import math
import json

from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sqlalchemy import create_engine
from urllib.parse import quote_plus
import importlib
import pathlib

from sutime import SUTime
from typing import *

import warnings
warnings.filterwarnings('ignore')

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from dotenv import load_dotenv
import os

[nltk_data] Downloading package stopwords to /home/juval/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
# Prerequisite: Maven must be available on the PATH, e.g.
#sudo apt -q install maven
# Locate the pom.xml that sits next to the installed `sutime` module file.
# NOTE(review): only `import importlib` is executed above; `importlib.util` being
# reachable as an attribute relies on something else having imported it -- confirm.
pom_path = pathlib.Path(importlib.util.find_spec("sutime").origin).parent / "pom.xml"
# Copy the dependencies declared in that pom.xml into ./jars (relative to the notebook's working directory).
!mvn dependency:copy-dependencies -DoutputDirectory=./jars -f {pom_path}

[[1;34mINFO[m] Scanning for projects...
Downloading from central: https://repo.maven.apache.org/maven2/org/apache/maven/plugins/maven-compiler-plugin/3.8.1/maven-compiler-plugin-3.8.1.pom
Downloaded from central: https://repo.maven.apache.org/maven2/org/apache/maven/plugins/maven-compiler-plugin/3.8.1/maven-compiler-plugin-3.8.1.pom (12 kB at 16 kB/s)
Downloading from central: https://repo.maven.apache.org/maven2/org/apache/maven/plugins/maven-plugins/33/maven-plugins-33.pom
Downloaded from central: https://repo.maven.apache.org/maven2/org/apache/maven/plugins/maven-plugins/33/maven-plugins-33.pom (11 kB at 73 kB/s)
Downloading from central: https://repo.maven.apache.org/maven2/org/apache/maven/maven-parent/33/maven-parent-33.pom
Downloaded from central: https://repo.maven.apache.org/maven2/org/apache/maven/maven-parent/33/maven-parent-33.pom (44 kB at 225 kB/s)
Downloading from central: https://repo.maven.apache.org/maven2/org/apache/apache/21/apache-21.pom
Downloaded from central: 

In [12]:
# Pull the database credentials from the .env file one directory above the notebook.
# Both values are None when the corresponding variable is not set.
path_to_env = '../.env'
load_dotenv(dotenv_path=path_to_env)
DB_PASSWORD = os.getenv('DB_PASSWORD')
DB_DOMAIN = os.getenv('DB_DOMAIN')

## Time Tagging Functions:

In [13]:
# Module-level SUTime tagger used by extract_temp_info below.
sutime = SUTime(mark_time_ranges=True, include_range=True)
# Reference "now": collect_sentences_with_temp_exp only keeps datetimes later than this value.
current_dt = datetime.now()
# Alternative: pin the reference date instead of using the wall clock.
#current_dt = datetime.strptime('04-29-2023', "%m-%d-%Y")

def discard(txt1, txt2=""): # for DEBUGGING
    """Debugging hook for dropped items; always evaluates to None.

    Args:
        txt1: the item that is being dropped.
        txt2: optional short reason for dropping it.

    Returns:
        None, so callers can write ``return discard(...)``.
    """
    # Enable the two prints below to trace what gets dropped and why.
    #print("<<", txt1, txt2, ">>")
    #print()
    return

def extract_temp_info(sentence: str, timestamp: str):
    """Run SUTime over one sentence and return its temporal annotations.

    Args:
        sentence: the text to tag.
        timestamp: publication date of the parent article, passed to
            ``sutime.parse`` as its second argument.

    Returns:
        The parsed annotations after a JSON round trip, or None (via
        ``discard``) when nothing was found.
    """
    raw_annotations = sutime.parse(sentence, timestamp)
    # Round-trip through JSON so the result consists of plain JSON-compatible objects.
    annotations = json.loads(json.dumps(raw_annotations))
    if len(annotations) > 0:
        return annotations
    return discard(annotations, "no time expressions found")

def convert_timex_to_datetime(timex: str):
    """Convert a TIMEX value string into a ``datetime``.

    The value is normalised first:
      * ``T`` is replaced by a space (none of the formats below contain a
        time-of-day, so such values end up discarded),
      * every ``X`` placeholder becomes ``5`` (e.g. ``202X`` -> ``2025``),
      * season codes (FA/SU/SP/WI) become approximate month-day pairs.

    Week values are interpreted as ISO-8601 week dates. A week without a
    weekday (``2024-W05``) resolves to the Monday of that week; previously
    ``strptime`` ignored the week number and returned January 1st.

    Args:
        timex: the TIMEX value, e.g. ``2024-03-15``, ``2024-03``, ``2024-W05``,
            ``2024-SU``, ``202X``.

    Returns:
        The parsed ``datetime``, or None (via ``discard``) when no supported
        format matches.
    """
    timex = timex.replace('T', ' ')
    timex = timex.replace('X', '5') # for a decade, take the middle of it -> year ...5

    # replace season codes with approximate dates
    season_dates = (('FA', '11-06'), ('SU', '08-06'), ('SP', '05-05'), ('WI', '02-04'))
    for season_code, approx_date in season_dates:
        timex = timex.replace(season_code, approx_date)

    # strptime only honours a week number when a weekday is supplied as well;
    # anchor a bare week to its Monday so the week is not silently dropped.
    if re.fullmatch(r'\d{4}-W\d{1,2}', timex):
        timex += '-1'

    # try converting using each defined date format
    date_formats = [
        '%Y-%m-%d',
        '%Y-%m',
        '%G-W%V-%u',  # ISO-8601 week date (weekday 1 = Monday ... 7 = Sunday)
        '%Y-W%W-%w',  # legacy fallback, keeps weekday 0 (= Sunday) values parseable
        '%Y',
        '%Y-%j',
    ]
    for date_format in date_formats:
        try:
            return datetime.strptime(timex, date_format)
        except ValueError:
            continue

    # if conversion failed for all formats
    return discard(timex, "could not convert to datetime")


def extract_datetime(temporal_info: Dict):
    """Turn one temporal annotation into a ``datetime``, or drop it.

    Args:
        temporal_info: annotation dict; the keys ``text``, ``type`` and
            ``timex-value`` are read (each defaults to an empty string).

    Returns:
        The ``datetime`` for a ``DATE`` annotation whose timex value could be
        converted; otherwise None (via ``discard``).
    """
    weekday_names = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]

    # references to weekdays are discarded bc. SUTime can't handle them well
    expression_text = temporal_info.get('text', "")
    if expression_text.lower() in weekday_names:
        return discard(temporal_info, "reference to weekday")

    temporal_type = temporal_info.get('type', "")
    timex_value = temporal_info.get('timex-value', "")

    if not timex_value:
        return discard(temporal_info, "timex detected but not extracted")

    if temporal_type != "DATE":
        return discard(temporal_info, "timex-type not supported")

    parsed_datetime = convert_timex_to_datetime(timex_value)
    if parsed_datetime:
        return parsed_datetime
    return discard(temporal_info, "timex-parsing failed")

def collect_sentences_with_temp_exp(sentence_timestamp_pairs: List[Tuple[str, str]]):
    """Map each sentence to the latest future datetime it refers to.

    Args:
        sentence_timestamp_pairs: ``(sentence, timestamp)`` tuples; both are
            handed to ``extract_temp_info``.

    Returns:
        Dict ``sentence -> datetime``. A sentence is only included when its
        latest extracted datetime lies after ``current_dt`` and is at most
        200 calendar years ahead of it.
    """
    tagged_sentences = {}
    for sentence, timestamp in sentence_timestamp_pairs:
        # a sentence can contain multiple TIMEXes
        timexes = extract_temp_info(sentence, timestamp)
        if not timexes:
            continue

        # keep only the annotations that could be converted
        candidates = [dt for dt in map(extract_datetime, timexes) if dt]
        latest = max(candidates, default=None)
        if latest is None:
            continue

        lies_in_future = latest > current_dt
        within_horizon = latest.year - current_dt.year <= 200
        if lies_in_future and within_horizon:
            print("SUCCESS", latest)
            tagged_sentences[sentence] = latest

    return tagged_sentences

[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Registering annotator sutime with class edu.stanford.nlp.time.TimeAnnotator
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator pos
[main] INFO edu.stanford.nlp.tagger.maxent.MaxentTagger - Loading POS tagger from edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger ... done [2.1 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
[main] INFO edu.stanford.nlp.sequences.SeqClassifierFlags - sutime.includeRange=true
[main] INFO edu.stanford.nlp.sequences.SeqClassifierFlags - Unknown property: |sutime.includeRange|
[main] INFO edu.stanford.nlp.sequences.SeqClassifierFlags - sutime.language=english
[main] INFO edu.stanford.nlp.sequ

## Set Up Database Connections:

In [14]:
def conn_to_db(db):
  """Create a SQLAlchemy engine for database `db`.

  Connects as user root on port 2306 of DB_DOMAIN via the mysqldb driver;
  DB_PASSWORD is URL-quoted before it is placed in the connection URL.
  """
  quoted_password = quote_plus(DB_PASSWORD)
  return create_engine(f"mysql+mysqldb://root:{quoted_password}@{DB_DOMAIN}:2306/{db}")

def download_df(db, table):
  """Read the whole `table` of database `db` into a DataFrame."""
  engine = conn_to_db(db)
  return pd.read_sql_table(table, engine)

def upload_df(db, table, df):
  """Write `df` to `table` in database `db`, replacing the table if it exists."""
  engine = conn_to_db(db)
  df.to_sql(table, engine, if_exists='replace')

def append_to_db(db, table, df):
  """Append the rows of `df` to `table` in database `db`."""
  engine = conn_to_db(db)
  df.to_sql(table, engine, if_exists='append')

## Set Constants For Postprocessing:

In [15]:
# Maximum number of topic keywords kept per cluster; they form the cluster
# heading and are the words counted by the outlier check.
EXTRACTED_KEYWORD_NUM = 6
# Cosine similarity at or above which two sentences of a cluster count as duplicates.
DUPLICATE_SIMILARITY_THRESHOLD = 0.7

## Load Sentences:

In [16]:
# Search term; it selects the source table "<query>_positives" here and names
# the result table "topics_<query>" in the upload cell.
query = "tesla"
# Load the sentences for this query from the "backend" database.
preds = download_df("backend", query + "_positives")
preds = preds.loc[:, ["sentence", "timestamp", "link", "num_duplicates"]] # only keep those columns

## Cluster With BERTopic:

In [17]:
def init_bertopic():
    """Build the BERTopic model together with its sentence-embedding model.

    Returns:
        ``(bertopic_model, sent_model)``: the configured BERTopic instance and
        the 'all-MiniLM-L6-v2' SentenceTransformer it embeds with.
    """
    sent_model = SentenceTransformer('all-MiniLM-L6-v2')
    vectorizer_model = CountVectorizer(stop_words="english")
    bertopic_settings = dict(
        embedding_model=sent_model,
        vectorizer_model=vectorizer_model,
        top_n_words=10,
        n_gram_range=(1,1),
        min_topic_size=7,
        #nr_topics=100
    )
    return BERTopic(**bertopic_settings), sent_model

def fit_bertopic(topic_model, sentence_model, sentences):
    """Embed `sentences` and fit the topic model on them.

    Args:
        topic_model: object exposing ``fit_transform(documents, embeddings)``.
        sentence_model: object exposing ``encode(documents)``.
        sentences: the documents to cluster.

    Returns:
        ``(topics, embeddings)``: the first element returned by
        ``fit_transform`` and the embeddings produced by ``encode``.
    """
    embeddings = sentence_model.encode(sentences)
    # the second element (probabilities) is not needed downstream
    topics, _probabilities = topic_model.fit_transform(sentences, embeddings)
    return topics, embeddings

def show_topic_info(model):
    """Return the model's topic overview, i.e. ``model.get_topic_info()``."""
    return model.get_topic_info()

def output_topic_contents(model, sentences, topics):
    """Print every cluster with its topic name and the sentences assigned to it.

    Args:
        model: fitted topic model exposing ``get_topic_freq()`` and
            ``get_topic_info()`` (frames with a 'Topic' column; the latter also
            with a 'Name' column).
        sentences: the clustered documents. A list/tuple is converted to a
            NumPy object array so it can be boolean-masked; anything else is
            indexed as given.
        topics: one topic id per sentence, in the same order.

    Side effects:
        Widens the global pandas display options and prints to stdout.
    """
    pd.set_option('display.max_columns', None) # show all columns
    pd.set_option('display.width', None) # set the width of the display to be unlimited
    pd.set_option('display.max_colwidth', None) # show full column width

    # plain Python sequences do not support boolean-mask indexing
    if isinstance(sentences, (list, tuple)):
        corpus = np.asarray(sentences, dtype=object)
    else:
        corpus = sentences
    topics_arr = np.array(topics)

    # Look names up by topic id, not by row position: the topic table can start
    # with the outlier topic -1, which shifts every positional lookup.
    topic_info = model.get_topic_info()
    topic_names = dict(zip(topic_info['Topic'], topic_info['Name']))

    separator = "---------------------------------------------------------------"
    for c in model.get_topic_freq()['Topic']:
        print(separator)
        print("Cluster:", c)
        print("Words:", topic_names.get(c))
        print(corpus[topics_arr == c])
        print(separator)
        print()
        print()

In [18]:
# Build the models, then embed and cluster all loaded sentences.
# `topics` holds the first value returned by fit_transform and `embeddings` the encoder output (see fit_bertopic).
bertopic_model, sentence_model = init_bertopic()
topics, embeddings = fit_bertopic(bertopic_model, sentence_model, preds["sentence"].tolist())

Downloading .gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading 1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [19]:
# Attach the clustering results plus the placeholder columns that the
# post-processing functions fill in later.
preds = preds.assign(cluster_id=topics, 
                     mentions=0, 
                     embedding=embeddings.tolist(), 
                     datetime='', 
                     links=''
                    )
# Last expression of the cell: display the topic overview table.
show_topic_info(bertopic_model)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,233,-1_tesla_2024_earnings_growth,"[tesla, 2024, earnings, growth, new, vehicle, ...","[""In 2024, our vehicle volume growth rate may ..."
1,0,86,0_ev_evs_growth_market,"[ev, evs, growth, market, 2025, new, affordabl...",[The new 2024 Chevrolet Blazer EV is one of th...
2,1,69,1_recalling_recall_vehicles_software,"[recalling, recall, vehicles, software, 200, s...",[New York (CNN) — Tesla is recalling 2.2 milli...
3,2,65,2_package_billion_judge_compensation,"[package, billion, judge, compensation, ruled,...",[WILMINGTON - A Delaware judge on Tuesday rul...
4,3,57,3_texas_vote_incorporation_delaware,"[texas, vote, incorporation, delaware, shareho...","[""Tesla will move immediately to hold a shareh..."
5,4,35,4_german_berlin_factory_sea,"[german, berlin, factory, sea, red, attacks, t...",[BERLIN (AP) - Tesla says it's temporarily hal...
6,5,35,5_voting_control_greater_seeking,"[voting, control, greater, seeking, musk, robo...",[NEW YORK — (NEW YORK) -- Tesla CEO Elon Musk ...
7,6,33,6_model_highland_sedan_refreshed,"[model, highland, sedan, refreshed, america, n...",[The refreshed 2024 Tesla Model 3—codenamed 'H...
8,7,30,7_redwood_codenamed_2025_mass,"[redwood, codenamed, 2025, mass, mid, supplier...",[Tesla has told suppliers it wants to start pr...
9,8,29,8_india_indian_import_manufacturing,"[india, indian, import, manufacturing, policy,...",[India hasn't made up its mind on cutting impo...


## Postprocessing:

In [20]:
def extract_keywords(bertopic_model, topic_id=None):
    """Build the heading and keyword list for one topic.

    Args:
        bertopic_model: fitted model exposing ``get_topic(topic_id)``, which
            yields ``(word, score)`` tuples.
        topic_id: id of the topic to describe. When omitted, the module-level
            variable ``topic_id`` is used, so existing calls of the form
            ``extract_keywords(model)`` inside a ``for topic_id in ...`` loop
            keep working.

    Returns:
        ``(topic_heading, keywords)``: at most EXTRACTED_KEYWORD_NUM
        non-stopword keywords, and the same keywords joined with '-'.

    Raises:
        NameError: if no ``topic_id`` is passed and none exists at module level.
    """
    if topic_id is None:
        # backward-compatible fallback for callers that rely on the global loop variable
        try:
            topic_id = globals()['topic_id']
        except KeyError:
            raise NameError("name 'topic_id' is not defined") from None

    # a falsy result (no representation for this topic) is treated as "no keywords"
    keyword_tfidf_tuples: List[Tuple[str, float]] = bertopic_model.get_topic(topic_id) or []

    # load the stopword list once instead of once per candidate word
    english_stopwords = set(stopwords.words('english'))
    keywords = [entry[0] for entry in keyword_tfidf_tuples if entry[0] not in english_stopwords]
    keywords = keywords[:EXTRACTED_KEYWORD_NUM]
    topic_heading = '-'.join(keywords)
    return topic_heading, keywords

def identify_outlier_sentences(group, keywords, topic_heading):
    """Find the rows whose sentence mentions fewer than three topic keywords.

    Args:
        group: DataFrame of one cluster with a 'sentence' column. For every
            row that is kept, the number of matched keywords is written to
            the 'keyword_num' column (in place).
        keywords: topic keywords; only the first EXTRACTED_KEYWORD_NUM count.
        topic_heading: accepted for interface compatibility; not used.

    Returns:
        List of index labels of the rows to drop.
    """
    considered_keywords = keywords[:EXTRACTED_KEYWORD_NUM]
    outlier_row_ids = []
    for row_id, sentence in group['sentence'].items():
        lowered = sentence.lower()
        # plain substring match against the lower-cased sentence
        hit_count = sum(1 for keyword in considered_keywords if keyword in lowered)

        if hit_count >= 3:
            group.at[row_id, 'keyword_num'] = hit_count
        else:
            outlier_row_ids.append(row_id)
    return outlier_row_ids

def time_tag_sentences(group):
    """Write the referenced future datetime of each sentence into 'datetime'.

    Every row is tagged on its own through
    ``collect_sentences_with_temp_exp``; rows for which nothing is returned
    keep their current 'datetime' value. `group` is modified in place.
    """
    for row_id, row in group.iterrows():
        single_pair = [(row["sentence"], row["timestamp"])]
        tagged = collect_sentences_with_temp_exp(single_pair)
        if tagged:
            # only one sentence was passed in, so take the first value
            group.at[row_id, 'datetime'] = next(iter(tagged.values()))

def detect_redundant_sentences(group):
    """Count near-duplicate mentions in a cluster and pick the rows to drop.

    Two sentences are near-duplicates when the cosine similarity of their
    embeddings is at least DUPLICATE_SIMILARITY_THRESHOLD.

    Side effects on `group` (in place):
      * 'mentions' = 'num_duplicates' + number of similar rows (a row counts
        as similar to itself),
      * the index is reset to 0..n-1,
      * for every near-duplicate pair, each row gets the other row's 'link'
        appended to its 'links' string (every appended link is preceded by a comma).

    Args:
        group: DataFrame of one cluster with the columns 'embedding',
            'num_duplicates', 'sentence', 'datetime', 'link' and 'links'.

    Returns:
        List of positions (equal to the index labels after the reset) to drop.
        Within a pair the shorter sentence is dropped; on equal length the
        earlier row is kept. A pair in which exactly one sentence has a
        datetime is left untouched.
    """
    if len(group) == 0:
        return []

    group_embeddings = group["embedding"].tolist()
    similarity_matrix = cosine_similarity(np.array(group_embeddings))
    # binarise: 1 = near-duplicate, 0 = distinct
    similarity_matrix[similarity_matrix < DUPLICATE_SIMILARITY_THRESHOLD] = 0
    similarity_matrix[similarity_matrix >= DUPLICATE_SIMILARITY_THRESHOLD] = 1
    num_similars = np.sum(similarity_matrix, axis=1)
    group['mentions'] = group['num_duplicates'] + num_similars

    # The matrix is symmetric, so visit every unordered pair exactly once.
    # Visiting both (i, j) and (j, i) used to remove BOTH rows whenever the
    # two sentences had the same length (e.g. exact duplicates).
    row_idxs, col_idxs = np.where(similarity_matrix == 1)
    paired_indices = [(i, j) for i, j in zip(row_idxs, col_idxs) if i < j]

    group.reset_index(drop=True, inplace=True)
    indices_to_remove = []
    for i, j in paired_indices:
        sent_i = group.loc[i, 'sentence']
        sent_j = group.loc[j, 'sentence']

        dt_i = group.loc[i, 'datetime']
        dt_j = group.loc[j, 'datetime']

        # keep both when only one of them carries a datetime
        if bool(dt_i) == bool(dt_j):
            if len(sent_i) >= len(sent_j):
                indices_to_remove.append(j)
            else:
                indices_to_remove.append(i)

        # cross-reference the two source links
        group.at[i, 'links'] = group.loc[i, 'links'] + ',' + group.loc[j, 'link']
        group.at[j, 'links'] = group.loc[j, 'links'] + ',' + group.loc[i, 'link']

    return indices_to_remove

In [21]:
# Post-process every cluster separately and collect the surviving rows.
preds_grp = preds.groupby('cluster_id') # make new dataframe (group) for every topic (cluster)
groups = []
topic_ids = list(set(topics))

# NOTE: the loop variable must stay named `topic_id`: extract_keywords(bertopic_model)
# is called without a topic id and resolves `topic_id` from the notebook's globals.
# NOTE(review): topic_ids comes from set(topics), so the outlier topic (-1 in the
# topic table shown earlier) is processed like any other cluster -- confirm this is intended.
for topic_id in topic_ids:
    # Work on an independent copy: the steps below add columns and drop rows in
    # place, which should not happen on a slice handed out by the groupby.
    group = preds_grp.get_group(topic_id).copy()

    topic_heading, keywords = extract_keywords(bertopic_model)
    if len(keywords) < 3: # cluster-heading lacks descriptive information
        continue
    group["keywords"] = topic_heading

    # drop the sentences that mention too few of the topic keywords
    outlier_indices = identify_outlier_sentences(group, keywords, topic_heading)
    group.drop(outlier_indices, inplace=True)
    if len(group) < 2:
        continue

    time_tag_sentences(group)

    # detect_redundant_sentences resets the index of `group`, so the returned
    # positions can be used as labels for the drop
    redundant_indices = detect_redundant_sentences(group)
    group.drop(redundant_indices, inplace=True)

    groups.append(group)

preds_out = pd.concat(groups, ignore_index=True)

SUCCESS 2025-01-01 00:00:00
SUCCESS 2026-01-01 00:00:00
SUCCESS 2025-01-01 00:00:00
SUCCESS 2024-02-08 00:00:00
SUCCESS 2024-02-08 00:00:00
SUCCESS 2025-06-01 00:00:00
SUCCESS 2025-01-01 00:00:00
SUCCESS 2025-06-01 00:00:00
SUCCESS 2025-06-01 00:00:00
SUCCESS 2025-01-01 00:00:00
SUCCESS 2025-01-01 00:00:00
SUCCESS 2025-01-01 00:00:00
SUCCESS 2025-01-01 00:00:00
SUCCESS 2025-01-01 00:00:00
SUCCESS 2025-01-01 00:00:00
SUCCESS 2025-01-01 00:00:00
SUCCESS 2025-01-01 00:00:00
SUCCESS 2025-06-01 00:00:00
SUCCESS 2025-01-01 00:00:00
SUCCESS 2025-01-01 00:00:00
SUCCESS 2025-01-01 00:00:00
SUCCESS 2025-01-01 00:00:00
SUCCESS 2025-01-01 00:00:00


[main] WARN edu.stanford.nlp.ie.NumberNormalizer - java.lang.NumberFormatException: Bad number put into wordToNumber.  Word is: "", originally part of "-", piece # 0
  edu.stanford.nlp.ie.NumberNormalizer.wordToNumber(NumberNormalizer.java:393)
  edu.stanford.nlp.ie.NumberNormalizer.findNumbers(NumberNormalizer.java:721)
  edu.stanford.nlp.ie.NumberNormalizer.findAndMergeNumbers(NumberNormalizer.java:810)
  edu.stanford.nlp.time.TimeExpressionExtractorImpl.extractTimeExpressions(TimeExpressionExtractorImpl.java:190)
  edu.stanford.nlp.time.TimeExpressionExtractorImpl.extractTimeExpressions(TimeExpressionExtractorImpl.java:184)
  edu.stanford.nlp.time.TimeExpressionExtractorImpl.extractTimeExpressionCoreMaps(TimeExpressionExtractorImpl.java:115)
  edu.stanford.nlp.time.TimeAnnotator.annotate(TimeAnnotator.java:218)
  edu.stanford.nlp.pipeline.AnnotationPipeline.annotate(AnnotationPipeline.java:76)
  edu.stanford.nlp.pipeline.StanfordCoreNLP.annotate(StanfordCoreNLP.java:640)
  edu.stanf

SUCCESS 2030-01-01 00:00:00
SUCCESS 2030-01-01 00:00:00
SUCCESS 2030-01-01 00:00:00
SUCCESS 2030-01-01 00:00:00
SUCCESS 2025-01-01 00:00:00
SUCCESS 2025-01-01 00:00:00
SUCCESS 2025-01-01 00:00:00
SUCCESS 2025-01-01 00:00:00
SUCCESS 2025-01-01 00:00:00


## Upload Results to DB:

In [22]:
# Remove the helper columns that are only needed during post-processing.
# NOTE: this cell reassigns preds_out, so running it a second time without
# re-running the previous cell fails because the columns are already gone.
preds_out = preds_out.drop(labels=['embedding', 'keyword_num', 'num_duplicates'], axis=1)
# Order by cluster, most-mentioned sentences first within each cluster.
preds_out = preds_out.sort_values(['cluster_id', 'mentions'], ascending=[True, False])

print(preds_out)
# Replaces the table "topics_<query>" in the "frontend" database (see upload_df).
upload_df("frontend", "topics_" + query, preds_out)
print("all done")

                                              sentence            timestamp  \
105  Tesla has lowered driving-range estimates acro...  2024-01-09 22:07:20   
108  Tesla stock plunges as EV maker warns producti...  2024-01-25 21:19:24   
109  "In 2024, our vehicle volume growth rate may b...  2024-01-24 22:28:30   
98   Tesla is developing a new platform for smaller...  2024-01-24 17:15:00   
112  Lucid is gunning for electric vehicle giant Te...  2024-01-29 03:21:36   
..                                                 ...                  ...   
93   In a series of posts on X Monday night, Musk s...  2024-01-16 14:00:26   
94   X, the social media platform formerly known as...  2024-01-10 07:10:55   
96   Elon Musk said that building Tesla's next-gene...  2024-01-26 23:30:00   
97   But 2024 is starting on a different note, with...  2024-01-13 13:30:00   

                                                  link  cluster_id  mentions  \
105  https://ca.finance.yahoo.com/news/1-tesla-low