In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import re
from datetime import datetime, timedelta
import math
import json

from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sqlalchemy import create_engine
from urllib.parse import quote_plus
import importlib
import pathlib

from sutime import SUTime
from typing import *

import warnings
warnings.filterwarnings('ignore')

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from dotenv import load_dotenv
import os

In [None]:
# One-time setup for SUTime's Java dependencies.
# The apt-get line is left disabled; enable it if Maven is not installed on the machine.
#!apt-get -q install maven
# Locate the pom.xml that sits in the directory of the installed `sutime` package.
pom_path = pathlib.Path(importlib.util.find_spec("sutime").origin).parent / "pom.xml"
# Have Maven copy the dependencies declared in that pom.xml into ./jars.
!mvn dependency:copy-dependencies -DoutputDirectory=./jars -f {pom_path}

In [None]:
# Load variables from the .env file at the relative path '../.env' into the process environment.
path_to_env = '../.env'
load_dotenv(dotenv_path=path_to_env)
# Database credentials and host used by conn_to_db.
# os.environ.get returns None when a variable is not set.
DB_PASSWORD = os.environ.get('DB_PASSWORD')
DB_DOMAIN = os.environ.get('DB_DOMAIN')

## Time Tagging Functions:

In [None]:
# Shared SUTime tagger instance; extract_temp_info calls sutime.parse on it.
sutime = SUTime(mark_time_ranges=True, include_range=True)
# Reference "now", captured once when this cell runs.
# collect_sentences_with_temp_exp only keeps dates that lie after this moment.
current_dt = datetime.now()
# Alternative: pin the reference date to a fixed day instead of the current time.
#current_dt = datetime.strptime('04-29-2023', "%m-%d-%Y")

def discard(txt1, txt2=""): # for DEBUGGING
    """Debugging hook for rejected items.

    Takes the rejected item (`txt1`) and a reason (`txt2`); both are ignored
    unless the print below is re-enabled. Always returns None so callers can
    write `return discard(...)`.
    """
    # print("<<", txt1, txt2, ">>")
    # print()
    return

def extract_temp_info(sentence: str, timestamp: str):
    """Run SUTime on `sentence`, using `timestamp` as the reference date.

    Returns the parsed annotations after a JSON round trip, or None (via
    `discard`) when the result is empty.
    """
    raw_annotations = sutime.parse(sentence, timestamp)
    # the JSON round trip normalises the parser output to plain JSON-compatible objects
    annotations = json.loads(json.dumps(raw_annotations))
    if len(annotations) == 0:
        return discard(annotations, "no time expressions found")
    return annotations

def convert_timex_to_datetime(timex: str):
    """Convert a TIMEX date value (as produced by SUTime) into a ``datetime``.

    Supported shapes after normalisation:
      * ``YYYY-MM-DD``, ``YYYY-MM``, ``YYYY``
      * ``YYYY-Www`` and ``YYYY-Www-D`` (ISO-8601 week date, D = 1..7 with 1 = Monday)
      * ``YYYY-DDD`` (day of year)
      * season codes ``FA``/``SU``/``SP``/``WI`` and underspecified digits ``X``

    Args:
        timex: the raw TIMEX value string.

    Returns:
        The parsed ``datetime`` (time part 00:00), or None (via ``discard``)
        when no supported format matches.
    """
    timex = timex.replace('T', ' ')
    timex = timex.replace('X', '5') # for a decade, take the middle of it -> year ...5

    # replace season codes with approximate dates
    season_dates = (('FA', '11-06'), ('SU', '08-06'), ('SP', '05-05'), ('WI', '02-04'))
    for season_code, approx_date in season_dates:
        timex = timex.replace(season_code, approx_date)

    # (value, format) pairs tried in order.
    # Week dates use the ISO directives %G/%V/%u: with '%Y-W%W' strptime silently
    # ignores the week number when no weekday is given and returns January 1st.
    # A week without a weekday is therefore resolved to its Monday by appending '-1'.
    candidates = [
        (timex, '%Y-%m-%d'),
        (timex, '%Y-%m'),
        (timex, '%G-W%V-%u'),
        (timex + '-1', '%G-W%V-%u'),
        (timex, '%Y'),
        (timex, '%Y-%j'),
    ]
    for value, date_format in candidates:
        try:
            return datetime.strptime(value, date_format)
        except ValueError:
            continue

    # if conversion failed for all formats
    return discard(timex, "could not convert to datetime")


def extract_datetime(temporal_info: Dict):
    """Turn one SUTime annotation into a datetime, or None if it is unusable.

    An annotation is rejected (through `discard`) when its text is a bare
    weekday name, when it carries no 'timex-value', when its 'type' is not
    "DATE", or when the value cannot be converted.
    """
    weekday_names = ("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday")
    mention_text = temporal_info.get('text', "")

    # references to weekdays are discarded bc. SUTime can't handle them well
    if mention_text.lower() in weekday_names:
        return discard(temporal_info, "reference to weekday")

    timex_kind = temporal_info.get('type', "")
    timex_value = temporal_info.get('timex-value', "")

    if not timex_value:
        return discard(temporal_info, "timex detected but not extracted")
    if timex_kind != "DATE":
        return discard(temporal_info, "timex-type not supported")

    resolved = convert_timex_to_datetime(timex_value)
    if resolved:
        return resolved
    return discard(temporal_info, "timex-parsing failed")

def collect_sentences_with_temp_exp(sentence_timestamp_pairs: List[Tuple[str, str]]):
    """Map each sentence to the latest future date it mentions.

    For every (sentence, timestamp) pair the sentence is time-tagged, all
    usable datetimes are extracted and the maximum is taken. A sentence is
    kept only when that maximum lies after `current_dt` and is at most 200
    calendar years ahead of it.

    Returns a dict {sentence: datetime}.
    """
    dated_sentences = {}
    for sentence, timestamp in sentence_timestamp_pairs:
        # a sentence can contain multiple TIMEXes
        timexes = extract_temp_info(sentence, timestamp)
        if not timexes:
            continue

        usable_datetimes = [dt for dt in map(extract_datetime, timexes) if dt]
        latest = max(usable_datetimes, default=None)

        # keep only dates strictly in the future and within a plausible horizon
        if latest and latest > current_dt and latest.year - current_dt.year <= 200:
            print("SUCCESS", latest)
            dated_sentences[sentence] = latest

    return dated_sentences

## Set Up Database Connections:

In [None]:
def conn_to_db(db):
    """Create a SQLAlchemy engine for database `db`.

    Connects as user root on port 2306 of DB_DOMAIN; DB_PASSWORD is
    URL-escaped before it is placed in the connection URL.
    """
    password = quote_plus(DB_PASSWORD)
    return create_engine(f"mysql+mysqldb://root:{password}@{DB_DOMAIN}:2306/{db}")

def download_df(db, table):
    """Read the whole `table` of database `db` into a DataFrame."""
    engine = conn_to_db(db)
    return pd.read_sql_table(table, engine)

def upload_df(db, table, df):
    """Write `df` to `table` in database `db`, replacing the table if it exists."""
    engine = conn_to_db(db)
    df.to_sql(table, engine, if_exists='replace')

def append_to_db(db, table, df):
    """Append the rows of `df` to `table` in database `db`."""
    engine = conn_to_db(db)
    df.to_sql(table, engine, if_exists='append')

## Set Constants For Postprocessing:

In [None]:
# Maximum number of topic keywords kept per cluster; extract_keywords truncates to this
# many and identify_outlier_sentences checks sentences against them.
EXTRACTED_KEYWORD_NUM = 6
# Cosine-similarity cutoff: detect_redundant_sentences treats two sentences of a cluster
# as near-duplicates when their similarity is at or above this value.
DUPLICATE_SIMILARITY_THRESHOLD = 0.7

## Load Sentences:

In [None]:
# Name of the query being processed; it is used to build the source table name below.
query = "elon_musk"
# Fetch the "<query>_positives" table from the "backend" database.
preds = download_df("backend", query + "_positives")
preds = preds.loc[:, ["sentence", "timestamp", "link", "num_duplicates"]] # only keep those columns

## Cluster With BERTopic:

In [None]:
def init_bertopic():
    """Build the BERTopic model and the sentence encoder it uses.

    Returns:
        (bertopic_model, sent_model): the configured BERTopic instance and the
        'all-MiniLM-L6-v2' SentenceTransformer passed to it as embedding model.
    """
    sent_model = SentenceTransformer('all-MiniLM-L6-v2')
    bertopic_settings = dict(
        embedding_model=sent_model,
        vectorizer_model=CountVectorizer(stop_words="english"),
        top_n_words=10,
        n_gram_range=(1, 1),
        min_topic_size=7,
        # nr_topics=100,
    )
    return BERTopic(**bertopic_settings), sent_model

def fit_bertopic(topic_model, sentence_model, sentences):
    """Embed `sentences` and fit the topic model on them.

    The embeddings come from `sentence_model.encode` and are handed to
    `topic_model.fit_transform` together with the sentences.

    Returns:
        (topics, embeddings): the first element returned by fit_transform and
        the embeddings that were computed.
    """
    embeddings = sentence_model.encode(sentences)
    # the second value returned by fit_transform (probabilities) is not used
    topics, _unused_probabilities = topic_model.fit_transform(sentences, embeddings)
    return topics, embeddings

def show_topic_info(model):
    """Return the model's topic overview, i.e. whatever `model.get_topic_info()` yields."""
    return model.get_topic_info()

def output_topic_contents(model, sentences, topics):
    """Print every cluster with its BERTopic name and its member sentences.

    Args:
        model: fitted topic model exposing `get_topic_freq()` and `get_topic_info()`.
        sentences: the clustered sentences (list, numpy array or pandas object),
            in the same order as `topics`.
        topics: one cluster id per sentence.

    Side effect: sets the pandas display options so that nothing is truncated
    (the options stay changed after the call, as before).
    """
    pd.set_option('display.max_columns', None) # show all columns
    pd.set_option('display.width', None) # set the width of the display to be unlimited
    pd.set_option('display.max_colwidth', None) # show full column width

    # A plain list cannot be indexed with a boolean mask, so wrap anything that is
    # not already a pandas/numpy container.
    if isinstance(sentences, (pd.Series, pd.DataFrame, np.ndarray)):
        corpus = sentences
    else:
        corpus = pd.Series(list(sentences), dtype=object)
    topics_arr = np.array(topics)

    # Look names up by topic id. A positional lookup (.iloc[c]) picks the wrong row
    # as soon as the outlier topic -1 is present in the overview.
    topic_info = model.get_topic_info()
    topic_names = dict(zip(topic_info['Topic'], topic_info['Name']))

    for c in model.get_topic_freq()['Topic']:
        print("---------------------------------------------------------------")
        print("Cluster:", c)
        print("Words:", topic_names.get(c, ""))
        print(corpus[topics_arr == c])
        print("---------------------------------------------------------------")
        print()
        print()

In [None]:
# Build the models, then embed and cluster all loaded sentences.
# `topics` and `embeddings` are attached to `preds` row by row in the next cell.
bertopic_model, sentence_model = init_bertopic()
topics, embeddings = fit_bertopic(bertopic_model, sentence_model, preds["sentence"].tolist())

In [None]:
# Attach the clustering results and placeholder columns to every sentence:
#   cluster_id - topic id assigned by BERTopic
#   mentions   - placeholder, overwritten by detect_redundant_sentences
#   embedding  - sentence embedding as a plain list
#   datetime   - placeholder, filled by time_tag_sentences when a future date is found
#   links      - placeholder, extended by detect_redundant_sentences
preds = preds.assign(cluster_id=topics, 
                     mentions=0, 
                     embedding=embeddings.tolist(), 
                     datetime='', 
                     links=''
                    )
# last expression of the cell: shows the topic overview as cell output
show_topic_info(bertopic_model)

## Postprocessing:

In [None]:
def extract_keywords(bertopic_model, topic_id=None):
    """Build the keyword list and heading for one topic.

    Args:
        bertopic_model: fitted model exposing `get_topic(topic_id)`, which
            yields (word, score) tuples.
        topic_id: id of the topic to describe. When omitted, the notebook-level
            variable `topic_id` is used, which is how this function has been
            called so far.

    Returns:
        (topic_heading, keywords): at most EXTRACTED_KEYWORD_NUM non-stopword
        keywords and the same keywords joined with '-'.

    Raises:
        NameError: if no `topic_id` is passed and none is defined globally.
    """
    if topic_id is None:
        # backward compatibility with callers that rely on the global loop variable
        try:
            topic_id = globals()['topic_id']
        except KeyError:
            raise NameError("name 'topic_id' is not defined") from None

    # get_topic may return a falsy value for an unknown topic; treat that as "no keywords"
    keyword_tfidf_tuples = bertopic_model.get_topic(topic_id) or []

    # load the stopword list once instead of once per keyword
    english_stopwords = set(stopwords.words('english'))

    # Skip empty-string placeholders (presumably BERTopic's padding for topics with
    # fewer than top_n_words terms): an empty keyword would match every sentence
    # in the later substring check.
    keywords = [
        t[0] for t in keyword_tfidf_tuples
        if t[0] and t[0] not in english_stopwords
    ]
    keywords = keywords[:EXTRACTED_KEYWORD_NUM]
    topic_heading = '-'.join(keywords)
    return topic_heading, keywords

def identify_outlier_sentences(group, keywords, topic_heading):
    """Find rows whose sentence contains fewer than three of the topic keywords.

    Only the first EXTRACTED_KEYWORD_NUM keywords are considered and matching
    is a case-insensitive substring test. Rows that pass get their hit count
    written to the 'keyword_num' column of `group` (in place).
    `topic_heading` is accepted but not used.

    Returns the index labels of the rows to remove.
    """
    outlier_ids = []
    for idx, record in group.iterrows():
        text = record['sentence']
        hits = [kw for kw in keywords[:EXTRACTED_KEYWORD_NUM] if kw in text.lower()]

        if len(hits) >= 3:
            group.at[idx, 'keyword_num'] = len(hits)
        else:
            outlier_ids.append(idx)
    return outlier_ids

def time_tag_sentences(group):
    """Fill the 'datetime' column of `group` (in place) for datable sentences.

    Each row's sentence is time-tagged against its own timestamp; when a
    datetime is found it is stored in that row, otherwise the row is left
    unchanged.
    """
    for idx, record in group.iterrows():
        tagged = collect_sentences_with_temp_exp([(record["sentence"], record["timestamp"])])
        if not tagged:
            continue
        # a single pair was passed in, so the dict holds at most one entry
        group.at[idx, 'datetime'] = next(iter(tagged.values()))

def detect_redundant_sentences(group):
    """Count near-duplicate mentions in a cluster and select redundant rows.

    Two sentences are near-duplicates when the cosine similarity of their
    embeddings is at least DUPLICATE_SIMILARITY_THRESHOLD.

    Mutates `group` in place:
      * 'mentions' = 'num_duplicates' + number of similar sentences (the
        sentence itself is included in that count, as the diagonal is 1);
      * the index is reset to 0..n-1;
      * 'links' of both members of every similar pair gets the other
        member's 'link' appended (comma separated).

    Returns:
        Index labels (valid for the reset index) of rows to drop. Of each
        similar pair the shorter sentence is dropped; on equal length the
        later row is dropped. Pairs where exactly one sentence has a
        datetime are left untouched.
    """
    group_embeddings = group["embedding"].tolist()
    similarity_matrix = cosine_similarity(np.array(group_embeddings))
    # binarise: 1 where two sentences count as near-duplicates, 0 otherwise
    similarity_matrix[similarity_matrix < DUPLICATE_SIMILARITY_THRESHOLD] = 0
    similarity_matrix[similarity_matrix >= DUPLICATE_SIMILARITY_THRESHOLD] = 1
    num_similars = np.sum(similarity_matrix, axis=1)
    group['mentions'] = group['num_duplicates'] + num_similars

    # The matrix is symmetric, so look at the upper triangle only and handle every
    # unordered pair exactly once. Visiting both (i, j) and (j, i) dropped BOTH rows
    # of an equal-length pair: the first visit removed j, the second removed i.
    row_idxs, col_idxs = np.where(np.triu(similarity_matrix, k=1) == 1)

    group.reset_index(drop=True, inplace=True)
    indices_to_remove = []
    for i, j in zip(row_idxs.tolist(), col_idxs.tolist()):
        sent_i = group.loc[i, 'sentence']
        sent_j = group.loc[j, 'sentence']

        dt_i = group.loc[i, 'datetime']
        dt_j = group.loc[j, 'datetime']

        # keep both rows when only one of them carries a date
        only_one_dated = bool(dt_i) != bool(dt_j)
        if not only_one_dated:
            indices_to_remove.append(j if len(sent_i) >= len(sent_j) else i)

        # cross-reference the two sources
        group.at[i, 'links'] = group.loc[i, 'links'] + ',' + group.loc[j, 'link']
        group.at[j, 'links'] = group.loc[j, 'links'] + ',' + group.loc[i, 'link']

    # a row can lose against several partners; report it once, in first-seen order
    return list(dict.fromkeys(indices_to_remove))

In [None]:
# update dataframe with results (topics)
preds_grp = preds.groupby('cluster_id') # make new dataframe (group) for every topic (cluster)
groups = []
topic_ids = list(set(topics))

for topic_id in topic_ids:
    group = preds_grp.get_group(topic_id)

    topic_heading, keywords = extract_keywords(bertopic_model)
    if len(keywords) < 3: # cluster-heading lacks descriptive information
        continue
    '''
    '''
    group["keywords"] = topic_heading

    outlier_indices = identify_outlier_sentences(group, keywords, topic_heading)
    group.drop(outlier_indices, inplace=True)
    if len(group) < 2:
        continue

    time_tag_sentences(group)

    redundant_indices = detect_redundant_sentences(group)
    group.drop(redundant_indices, inplace=True)

    groups.append(group)

preds_out = pd.concat(groups, ignore_index=True)

## Upload Results to DB:

In [None]:
# Remove helper columns that are not needed downstream. errors='ignore' keeps this
# cell re-runnable: `preds_out` is overwritten here, so on a second run the columns
# are already gone and a plain drop would raise KeyError.
preds_out = preds_out.drop(columns=['embedding', 'keyword_num', 'num_duplicates'], errors='ignore')
# order by cluster, most-mentioned sentences first within each cluster
preds_out = preds_out.sort_values(['cluster_id', 'mentions'], ascending=[True, False])

print(preds_out)
#upload_df("frontend", "topics_" + query, preds_out)
print("all done")