In [None]:
# load environment variables

from dotenv import load_dotenv
import os

# Load settings from the .env file one directory up, then read the three
# values the rest of the notebook needs. os.getenv returns None for any
# variable that is not set.
path_to_env = '../.env'
load_dotenv(dotenv_path=path_to_env)
DB_PASSWORD, DB_DOMAIN, NEWS_API_KEY = (
    os.getenv(variable_name)
    for variable_name in ('DB_PASSWORD', 'DB_DOMAIN', 'NEWS_API_KEY')
)

In [None]:
# db-service

import pandas as pd
from sqlalchemy import create_engine
from urllib.parse import quote_plus

def conn_to_db(db):
    """Create a SQLAlchemy engine for database ``db`` on the configured MySQL server.

    The connection uses the ``root`` user, the password from ``DB_PASSWORD``
    (URL-encoded so special characters survive), the host from ``DB_DOMAIN``
    and port 2306.

    Args:
        db: Name of the database (schema) to connect to.

    Returns:
        A new SQLAlchemy engine. Every call builds a fresh engine, so callers
        that are done with it should call ``engine.dispose()``.

    Raises:
        RuntimeError: If ``DB_PASSWORD`` or ``DB_DOMAIN`` is ``None``, i.e. the
            environment variables were not loaded.
    """
    # Fail fast with a readable message: a None password makes quote_plus raise
    # an opaque TypeError, and a None domain silently yields the host "None".
    missing = [
        name
        for name, value in (("DB_PASSWORD", DB_PASSWORD), ("DB_DOMAIN", DB_DOMAIN))
        if value is None
    ]
    if missing:
        raise RuntimeError(
            "Missing database configuration: " + ", ".join(missing)
            + " (check that the .env file was loaded)"
        )

    encoded = quote_plus(DB_PASSWORD)
    db_url = f"mysql+mysqldb://root:{encoded}@{DB_DOMAIN}:2306/{db}"
    return create_engine(db_url)

def download_df(db, table):
    """Read an entire table from database ``db`` into a DataFrame.

    Args:
        db: Name of the database to read from.
        table: Name of the table to load.

    Returns:
        pandas.DataFrame with the full contents of ``table``.
    """
    engine = conn_to_db(db)
    try:
        return pd.read_sql_table(table, engine)
    finally:
        # conn_to_db builds a new engine (and connection pool) per call;
        # dispose it so pooled connections do not pile up across calls.
        engine.dispose()

def upload_to_db(db, table, df):
    """Write ``df`` to ``table`` in database ``db``, replacing any existing table.

    Args:
        db: Name of the target database.
        table: Name of the table to (re)create.
        df: DataFrame to store. Its index is written as a column as well
            (the ``to_sql`` default).
    """
    engine = conn_to_db(db)
    try:
        df.to_sql(table, engine, if_exists='replace')
    finally:
        # conn_to_db builds a new engine (and connection pool) per call;
        # dispose it so pooled connections do not pile up across calls.
        engine.dispose()

def append_to_db(db, table, df, delete):
    """Append the rows of ``df`` to ``table`` in database ``db``.

    Args:
        db: Name of the target database.
        table: Name of the table to append to (created if it does not exist).
        df: DataFrame whose rows are appended. Its index is written as a
            column as well (the ``to_sql`` default).
        delete: Currently ignored; kept so existing call sites keep working.
            NOTE(review): the intended semantics are not visible here -- confirm
            before relying on it.
    """
    engine = conn_to_db(db)
    try:
        df.to_sql(table, engine, if_exists='append')
    finally:
        # conn_to_db builds a new engine (and connection pool) per call;
        # dispose it so pooled connections do not pile up across calls.
        engine.dispose()

In [None]:
# retrieval-service
import requests

def fetch_articles(query, from_date, to_date, num_articles, timeout=None):
    """Fetch English news articles matching ``query`` from the Newscatcher v2 search API.

    Pages of 100 results are requested (sorted by relevancy) until at least
    ``num_articles`` have been collected, the API returns a short page, or a
    request comes back with a non-200 status.

    Args:
        query: Search expression passed as the ``q`` parameter.
        from_date: Lower bound of the publication window (``from`` parameter).
        to_date: Upper bound of the publication window (``to`` parameter).
        num_articles: Maximum number of raw articles to collect.
        timeout: Optional timeout in seconds for each HTTP request. The default
            ``None`` waits indefinitely, as before.

    Returns:
        pandas.DataFrame with the columns 'title', 'excerpt', 'published_date',
        'topic' and 'link'. Rows whose excerpt is missing or blank are removed,
        so the frame can hold fewer than ``num_articles`` rows. When nothing
        could be fetched an empty frame with these columns is returned.
    """
    columns_to_keep = ['title', 'excerpt', 'published_date', 'topic', 'link']
    all_articles = []
    page = 1
    per_page = 100  # max number of articles per request for Newscatcher
    headers = {"x-api-key": NEWS_API_KEY}

    while len(all_articles) < num_articles:
        params = {
            "q": query,
            "lang": "en",
            "from": from_date,
            "to": to_date,
            "page": page,
            "page_size": per_page,
            "sort_by": "relevancy"
        }

        response = requests.get(
            "https://api.newscatcherapi.com/v2/search",
            params=params,
            headers=headers,
            timeout=timeout,
        )

        if response.status_code != 200:
            print(f"Failed to get data: {response.content}")
            break

        # `or []` also covers a payload that carries "articles": null.
        page_articles = response.json().get('articles') or []
        all_articles.extend(page_articles)

        if len(page_articles) < per_page:
            break  # No more articles available

        page += 1

    # The last page may overshoot the requested amount; honour the limit.
    all_articles = all_articles[:num_articles]

    if not all_articles:
        # A frame built from an empty list has no columns, so selecting
        # columns_to_keep would raise KeyError; return an empty, typed frame.
        return pd.DataFrame(columns=columns_to_keep)

    articles = pd.DataFrame(all_articles).loc[:, columns_to_keep]
    articles = articles.dropna(subset=['excerpt'])
    articles = articles[articles['excerpt'].str.strip() != '']

    return articles

In [None]:
# preprocessing-service

import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
# Download the Punkt sentence-tokenizer data used by sent_tokenize. Newer NLTK
# releases load the 'punkt_tab' resource instead of the pickled 'punkt' models,
# so both are fetched to keep the notebook working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

def split_into_sentences(articles):
    """Explode article excerpts into one row per sentence.

    Args:
        articles: DataFrame with the columns 'excerpt', 'published_date'
            and 'link'.

    Returns:
        DataFrame with the columns 'sentence', 'timestamp' (the article's
        'published_date') and 'link', one row per tokenized sentence.
    """
    article_fields = zip(
        articles['excerpt'], articles['published_date'], articles['link']
    )
    # Each sentence inherits the timestamp and link of the article it came from.
    rows = [
        (piece, published, url)
        for text, published, url in article_fields
        for piece in sent_tokenize(text)
    ]

    sentences_df = pd.DataFrame({
        'sentence': [piece for piece, _, _ in rows],
        'timestamp': [published for _, published, _ in rows],
        'link': [url for _, _, url in rows],
    })
    return sentences_df.dropna(subset=['sentence'])

def analyze_sentence_length(sentences, min_length=30):
    """Annotate sentences with their character length and drop the short ones.

    Args:
        sentences: DataFrame with a 'sentence' column of strings. It is not
            modified.
        min_length: Minimum number of characters a sentence needs to be kept
            (default 30).

    Returns:
        A new DataFrame containing only the rows whose sentence has at least
        ``min_length`` characters, with an added 'len' column.
    """
    with_len = sentences.assign(len=sentences['sentence'].apply(len))
    # .copy() hands back an independent frame so that later column assignments
    # on the result do not trigger pandas' SettingWithCopyWarning.
    return with_len[with_len['len'] >= min_length].copy()
    
def drop_duplicate_links(articles):
    """Remove, in place, every article whose 'link' already appeared earlier.

    Repeated articles from the same source add nothing, so only the first
    row per link survives. The frame is modified directly; nothing is returned.
    """
    # keep='first' is the pandas default, so it is not spelled out here.
    articles.drop_duplicates(subset=['link'], inplace=True)

def drop_duplicates(sentences):
    """Count how often each sentence occurs, then keep only its first occurrence.

    The count is stored in a new 'num_duplicates' column, which is added to the
    passed-in frame as well as to the returned one.

    Returns:
        DataFrame with one row per distinct sentence.
    """
    occurrences = sentences.groupby('sentence')['sentence']
    sentences['num_duplicates'] = occurrences.transform('count')
    first_occurrences = sentences.drop_duplicates(subset=['sentence'])
    return first_occurrences

# Disabled alternative to drop_duplicates, kept inside a string literal so it
# is never executed. Besides counting duplicates it collects the links of all
# occurrences of a sentence into a comma-separated 'links' column.
'''
def drop_duplicates(sentences):
    sentences['num_duplicates'] = sentences.groupby('sentence')['sentence'].transform('count')
    aggregated_sources = sentences.groupby('sentence')['link'].agg(list).reset_index()
    aggregated_sources.rename(columns={'link': 'links'}, inplace=True)
    sentences.drop_duplicates(subset='sentence', keep='first', inplace=True)
    sentences = pd.merge(sentences, aggregated_sources, on='sentence')
    sentences['links'] = sentences['links'].apply(lambda x: ','.join(x))
    return sentences
'''

def resolve_newline(sentences):
    """Replace every line break inside the 'sentence' column with a single space.

    The column is overwritten on the passed-in frame; nothing is returned.
    """
    def _flatten(text):
        # "\r\n" must be handled first so it collapses to one space, not two.
        for line_break in ("\r\n", "\n", "\r"):
            text = text.replace(line_break, " ")
        return text

    sentences['sentence'] = sentences['sentence'].apply(_flatten)

def preprocess_news(articles):
    """Turn a table of articles into a cleaned table of unique sentences.

    Steps, in order: drop articles with a repeated link (this modifies
    ``articles`` in place), split excerpts into sentences, filter by sentence
    length, de-duplicate sentences, and replace line breaks with spaces.

    Returns:
        DataFrame of preprocessed sentences.
    """
    drop_duplicate_links(articles)
    unique_sentences = drop_duplicates(
        analyze_sentence_length(split_into_sentences(articles))
    )
    resolve_newline(unique_sentences)
    return unique_sentences

def prepare_data_for_frontend(sentences):
    """Convert the clustered sentence table into the nested structure for the frontend.

    Args:
        sentences: DataFrame with the columns 'sentence', 'mentions', 'link',
            'links', 'timestamp', 'datetime', 'cluster_id' and 'keywords'.
            An empty string in 'datetime' marks a sentence without a date.

    Returns:
        dict with two keys:
          * 'stats': counts of sentences with and without a date and the
            number of distinct cluster ids.
          * 'data': one entry per cluster holding its id, the keywords of the
            cluster's first row, the average mentions per sentence, and the
            sentences grouped by 'datetime', split into 'contentWithDate' and
            'contentWithoutDate'.

    Note:
        Grouping uses pandas' default ``dropna=True``, so rows whose
        'cluster_id' or 'datetime' is NaN do not appear under 'data'.
    """
    # A sentence has a date when its 'datetime' is not the empty string
    # (the two counters used to be swapped).
    has_date = sentences['datetime'] != ''
    stats = {'numSentsWithDate': int(has_date.sum()),
             'numSentsWithoutDate': int((~has_date).sum()),
             'numClusters': len(sentences['cluster_id'].unique())}

    topics = []
    # One sub-frame per topic (cluster).
    for cluster_id, topic_group in sentences.groupby('cluster_id'):
        json_obj = {'clusterID': cluster_id, 'keywords': topic_group.iloc[0]['keywords']}
        avg_mentions = topic_group['mentions'].sum() / len(topic_group)
        json_obj['avgTotalMentions'] = float(avg_mentions)

        content_with_date = []
        content_without_date = []
        for date, date_group in topic_group.groupby('datetime'):
            sents = [
                {'sentence': row['sentence'],
                 'mentions': row['mentions'],
                 'link': row['link'],
                 'links': row['links'],
                 'timestamp': row['timestamp'],
                 }
                for _, row in date_group.iterrows()
            ]

            content = {'sumOfMentions': float(date_group['mentions'].sum())}

            if date:
                # Use the group key itself instead of a variable left over
                # from iterating the rows.
                content['datetime'] = str(date)
                content_with_date.append(content)
            else:
                content_without_date.append(content)

            content['sentences'] = sents

        json_obj['contentWithDate'] = content_with_date
        json_obj['contentWithoutDate'] = content_without_date
        topics.append(json_obj)

    return {'stats': stats, 'data': topics}

In [None]:
# classification-service

import requests

def init_classification(parsed_query, base_url="https://d9c8-34-124-155-12.ngrok.io/", timeout=None):
    """Trigger the remote classification service and print whether it succeeded.

    Args:
        parsed_query: Currently unused -- the request carries no query
            information. NOTE(review): presumably the service should be told
            which query to process; confirm against the service API.
        base_url: Root URL of the service. Defaults to the ngrok tunnel that
            was previously hard-coded; pass the current URL when it changes.
        timeout: Optional timeout in seconds for the HTTP request. The default
            ``None`` waits indefinitely, as before.
    """
    print("start classification")
    # Normalise the trailing slash so both ".../" and "..." work as base_url.
    endpoint = base_url.rstrip("/") + "/classify/"
    response = requests.get(endpoint, timeout=timeout)
    if response.status_code == 200:
        print("classification successful")
    else:
        print("classification UNsuccessful")

In [None]:
# clustering-service

import requests

def init_clustering(parsed_query, base_url="https://d9c8-34-124-155-12.ngrok.io/", timeout=None):
    """Trigger the remote clustering service and print whether it succeeded.

    Args:
        parsed_query: Currently unused -- the request carries no query
            information. NOTE(review): presumably the service should be told
            which query to process; confirm against the service API.
        base_url: Root URL of the service. Defaults to the ngrok tunnel that
            was previously hard-coded; pass the current URL when it changes.
        timeout: Optional timeout in seconds for the HTTP request. The default
            ``None`` waits indefinitely, as before.
    """
    print("start clustering")
    # Normalise the trailing slash so both ".../" and "..." work as base_url.
    endpoint = base_url.rstrip("/") + "/cluster/"
    response = requests.get(endpoint, timeout=timeout)
    if response.status_code == 200:
        print("clustering successful")
    else:
        print("clustering UNsuccessful")

In [None]:
# tagging-service

import requests

def init_tagging(parsed_query, base_url="https://91e9-34-125-70-82.ngrok.io/", timeout=None):
    """Trigger the remote tagging service and print whether it succeeded.

    Args:
        parsed_query: Currently unused -- the request carries no query
            information. NOTE(review): presumably the service should be told
            which query to process; confirm against the service API.
        base_url: Root URL of the service. Defaults to the ngrok tunnel that
            was previously hard-coded; pass the current URL when it changes.
        timeout: Optional timeout in seconds for the HTTP request. The default
            ``None`` waits indefinitely, as before.
    """
    print("start tagging")
    # Normalise the trailing slash so both ".../" and "..." work as base_url.
    endpoint = base_url.rstrip("/") + "/tag/"
    response = requests.get(endpoint, timeout=timeout)
    if response.status_code == 200:
        print("tagging successful")
    else:
        print("tagging UNsuccessful")

In [None]:
# django-views

import json
import requests


# Stage 1: retrieval, preprocessing and upload.
# The table prefix is the query with spaces replaced by underscores, lowercased.
query = "Elon Musk"
parsed_query = query.replace(" ", "_").lower()
# Fetch up to 4000 articles for the fixed window 2023-09-10 .. 2023-10-09.
articles = fetch_articles(query, '2023-09-10', '2023-10-09', 4000)
# upload_to_db replaces the table if it already exists.
upload_to_db("backend", parsed_query + "_articles", articles)
# Read the articles back from the database before preprocessing them.
articles = download_df("backend", parsed_query + "_articles")
sentences = preprocess_news(articles)
upload_to_db("backend", parsed_query + "_sentences", sentences)

# Stage 2: classification, clustering, postprocessing and tagging.
# Currently disabled: the calls sit inside a string literal and do not run.
'''
init_classification(parsed_query)
init_clustering(parsed_query)
init_tagging(parsed_query)
'''

# Stage 3: frontend preparation.
# Currently disabled as well (string literal); it would load the
# "topics_<parsed_query>" table from the "frontend" database and print the
# nested structure as JSON.
'''
sentences = download_df("frontend", "topics_" + parsed_query)
data = prepare_data_for_frontend(sentences)
print(json.dumps(data, indent=2))
'''