In [None]:
import pandas as pd
from bs4 import BeautifulSoup

## Pull in the news articles and topics

In [None]:
def parse_sgm_to_dataframe(file_path: str, encoding: str = 'utf-8', errors: str = 'strict') -> pd.DataFrame:
    """
    Parse one Reuters SGML (.sgm) file into a long-format DataFrame.

    Each <REUTERS> element contributes one row per topic:
      * one row per <D> entry found inside its <TOPICS> element,
      * a single row with Topic == '' when <TOPICS> exists but has no <D> entries,
      * a single row with Topic is None when there is no <TOPICS> element.

    Args:
        file_path: Path of the .sgm file to read.
        encoding: Text encoding used to decode the file (default 'utf-8').
        errors: Decode error policy passed to open() (default 'strict').
            NOTE(review): if a file in the collection contains bytes that are
            not valid UTF-8, pass errors='replace' or a different encoding
            instead of letting the read fail.

    Returns:
        DataFrame with columns ID (the NEWID attribute), Topic and Body
        (stripped text of <BODY>, '' when the element is missing). The three
        columns are always present, even when no article was found.
    """
    with open(file_path, 'r', encoding=encoding, errors=errors) as file:
        sgm_data = file.read()

    # The file is parsed with the lenient HTML parser; the lookups below use
    # lower-case tag and attribute names.
    soup = BeautifulSoup(sgm_data, 'html.parser')

    rows = []
    for reuters in soup.find_all('reuters'):
        article_id = reuters.get('newid')

        body = reuters.find('body')
        body_text = body.get_text().strip() if body else ''

        topics = reuters.find('topics')
        if topics:
            # An existing but empty <TOPICS> element is recorded as a single '' topic.
            topic_values = [d.get_text().strip() for d in topics.find_all('d')] or ['']
        else:
            # A missing <TOPICS> element is recorded as a single None topic.
            topic_values = [None]

        rows.extend(
            {'ID': article_id, 'Topic': topic, 'Body': body_text}
            for topic in topic_values
        )

    # Explicit columns keep the schema stable when `rows` is empty, so
    # downstream code such as df['Topic'] does not raise KeyError.
    return pd.DataFrame(rows, columns=['ID', 'Topic', 'Body'])

In [None]:
df = parse_sgm_to_dataframe('data/reuters21578/reut2-000.sgm')

In [None]:
df.head(15)

## Get a list of all possible topics

In [None]:
 def check_topics_in_file(topic_list, file_path):
    # Read topics from the file
    with open(file_path, 'r') as file:
        file_topics = set(file.read().splitlines())
    
    # Check if all topics in topic_list are in file_topics
    missing_topics = set(topic_list) - file_topics
    
    if not missing_topics:
        return True, []
    else:
        return False, list(missing_topics)

In [None]:
# The unique values include the placeholders parse_sgm_to_dataframe writes for
# articles without topics ('' for an empty <TOPICS> tag, None when the tag is absent).
topic_list = df['Topic'].unique()

In [None]:
check_topics_in_file(topic_list, 'data/reuters21578/all-topics-strings.lc.txt')

In [None]:
topic_file = 'data/reuters21578/all-topics-strings.lc.txt'
with open(topic_file, 'r') as file:
    file_topics = set(file.read().splitlines())

In [None]:
topics = list(file_topics)

In [None]:
topics = [s.strip() for s in topics]

In [None]:
topics.sort()

## Sample from the articles

In [None]:
import pandas as pd
import random

def sample_article_with_n_topics(df: pd.DataFrame, n: int, random_state=None) -> "dict | None":
    """
    Randomly pick one article that has exactly `n` topics.

    Rows of `df` are grouped by 'ID'. For each article the non-null,
    non-empty 'Topic' values are collected into a list and the first 'Body'
    value is kept. With n == 0 this selects articles that have no real topic.

    Args:
        df: Long-format frame with 'ID', 'Topic' and 'Body' columns.
        n: Exact number of topics the sampled article must have.
        random_state: Optional seed forwarded to DataFrame.sample so the
            choice can be reproduced. The default (None) keeps the draw
            unseeded.

    Returns:
        A dict with keys 'ID', 'Topics' (topics joined with ', ', or '' when
        there are none) and 'Body', or None when no article has exactly `n`
        topics.
    """
    grouped = df.groupby('ID').agg({
        'Topic': lambda x: [t for t in x if pd.notna(t) and t != ''],
        'Body': 'first'
    }).reset_index()

    # One filter covers every n, including 0 (empty topic list).
    topic_counts = grouped['Topic'].apply(len)
    candidates = grouped[topic_counts == n]

    if candidates.empty:
        return None

    chosen = candidates.sample(n=1, random_state=random_state).iloc[0]

    return {
        'ID': chosen['ID'],
        # Joining an empty list gives '', which is the n == 0 result.
        'Topics': ', '.join(chosen['Topic']),
        'Body': chosen['Body'],
    }

In [None]:
sample_article_with_n_topics(df, n=2)

## Prompt ideas

I would like you to create a Python program that will take in the text from a news article and identify any topics from a pre-defined list of possible topics. 

Here is the list of possible topics: {all_topics}

Here are a few examples:

Example 1:
Article: {article_1}
Topics: {topics_1}

Example 2:
Article: {article_2}
Topics: {topics_2}

Example 3:
Article: {article_3}
Topics: {topics_3}

The input to the program is the article text, and the return value should be a list of the topics (from the list above) that apply to the article.

------------------------------------------------------------------------------------------------------------

Create a Python program that takes the text of a news article as input and identifies relevant topics from a pre-defined list. The program should be efficient and able to process multiple articles quickly.

List of possible topics: {all_topics}

The program should:
1. Preprocess the input text (e.g., remove special characters, convert to lowercase)
2. Identify topics that are explicitly mentioned or strongly implied in the article
3. Handle partial matches and consider the context of words
4. Return a list of identified topics
5. Return an empty list if no topics are identified
6. Handle potential errors gracefully

Examples:

Example 1 (Multiple topics):
Article: {article_1}
Topics: {topics_1}

Example 2 (Single topic):
Article: {article_2}
Topics: {topics_2}

Example 3 (No topics):
Article: {article_3}
Topics: []

Example 4 (Long article, truncated for brevity):
Article: {article_4}
Topics: {topics_4}

Input: The full text of a news article as a string
Output: A list of strings representing the identified topics

Please provide the Python code for this program, including any necessary functions for preprocessing, topic identification, and main execution. Also, include brief comments explaining the logic of your approach.



In [None]:
print(f'topics: {topics}')

In [None]:
# Draw one example article per case for the few-shot prompt.
# NOTE(review): sample_article_with_n_topics returns None when no article has
# exactly the requested number of topics, and the following cells index the
# results directly (e.g. no_topics['Body']).
no_topics = sample_article_with_n_topics(df, n=0)
single_topic = sample_article_with_n_topics(df, n=1)
# No seed is set in the visible cells, so the chosen topic count (2, 3 or 4)
# and the sampled articles presumably differ between runs.
random_number = random.randint(2, 4)
multiple_topics = sample_article_with_n_topics(df, n=random_number)

In [None]:
no_topics['Body']

In [None]:
single_topic

In [None]:
multiple_topics

In [None]:
def generate_prompt(all_topics, no_topics, single_topic, multiple_topics):
    """
    Build the code-generation prompt with three in-context examples.

    Args:
        all_topics: Collection of allowed topics; it is interpolated with its
            default string form into the "List of possible topics" line.
        no_topics: Mapping with a 'Body' entry, used as the "No topics" example.
        single_topic: Mapping with 'Body' and 'Topics' entries, used as the
            "Single topic" example.
        multiple_topics: Mapping with 'Body' and 'Topics' entries, used as the
            "Multiple topics" example.

    Returns:
        The prompt text. The template is used verbatim, so the result starts
        with a newline followed by four spaces and ends with a newline
        followed by four spaces.
    """
    result = f"""
    Create a Python program that takes the text of a news article as input and identifies relevant topics from a pre-defined list. The program should be efficient and able to process multiple articles quickly.

List of possible topics: {all_topics}

The program should:
1. Preprocess the input text (e.g., remove special characters, convert to lowercase)
2. Identify topics that are explicitly mentioned or strongly implied in the article
3. Handle partial matches and consider the context of words
4. Return a list of identified topics
5. Return an empty list if no topics are identified
6. Handle potential errors gracefully

Examples:

Example 1 (Multiple topics):
Article: {multiple_topics['Body']}
Topics: [{multiple_topics['Topics']}]

Example 2 (Single topic):
Article: {single_topic['Body']}
Topics: [{single_topic['Topics']}]

Example 3 (No topics):
Article: {no_topics['Body']}
Topics: []

Input: The full text of a news article as a string
Output: A list of strings representing the identified topics

Please provide the Python code for this program, including any necessary functions for preprocessing, topic identification, and main execution. Also, include brief comments explaining the logic of your approach.
    """

    return result

In [None]:
prompt = generate_prompt(all_topics=topics,
                no_topics=no_topics, 
                single_topic=single_topic, 
                multiple_topics=multiple_topics
               )

In [None]:
print(prompt)

## Programs

### Claude

prompt:

Create a Python program that takes the text of a news article as input and identifies relevant topics from a pre-defined list. The program should be efficient and able to process multiple articles quickly.

List of possible topics: ['acq', 'alum', 'austdlr', 'austral', 'barley', 'bfr', 'bop', 'can', 'carcass', 'castor-meal', 'castor-oil', 'castorseed', 'citruspulp', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'corn-oil', 'cornglutenfeed', 'cotton', 'cotton-meal', 'cotton-oil', 'cottonseed', 'cpi', 'cpu', 'crude', 'cruzado', 'dfl', 'dkr', 'dlr', 'dmk', 'drachma', 'earn', 'escudo', 'f-cattle', 'ffr', 'fishmeal', 'flaxseed', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-meal', 'groundnut-oil', 'heat', 'hk', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'inventories', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-meal', 'lin-oil', 'linseed', 'lit', 'livestock', 'lumber', 'lupin', 'meal-feed', 'mexpeso', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-meal', 'palm-oil', 'palmkernel', 'peseta', 'pet-chem', 'platinum', 'plywood', 'pork-belly', 'potato', 'propane', 'rand', 'rape-meal', 'rape-oil', 'rapeseed', 'red-bean', 'reserves', 'retail', 'rice', 'ringgit', 'rubber', 'rupiah', 'rye', 'saudriyal', 'sfr', 'ship', 'silk', 'silver', 'singdlr', 'skr', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'stg', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tapioca', 'tea', 'tin', 'trade', 'tung', 'tung-oil', 'veg-oil', 'wheat', 'wool', 'wpi', 'yen', 'zinc']

The program should:
1. Preprocess the input text (e.g., remove special characters, convert to lowercase)
2. Identify topics that are explicitly mentioned or strongly implied in the article
3. Handle partial matches and consider the context of words
4. Return a list of identified topics
5. Return an empty list if no topics are identified
6. Handle potential errors gracefully

Examples:

Example 1 (Multiple topics):
Article: The Commodity Credit Corporation, CCC,
has accepted an export bonus offer to cover the sale of 37,000
long tons of wheat flour to North Yemen, the U.S. Agriculture
Department said.
    The wheat four is for shipment March-May and the bonus
awarded was 119.05 dlrs per tonnes and will be paid in the form
of commodities from the CCC inventory.
    The bonus was awarded to the Pillsbury Company.
    The wheat flour purchases complete the Export Enhancement
Program initiative announced in April, 1986, it said.
 Reuter

Topics: [wheat, grain]

Example 2 (Single topic):
Article: Union Carbide Corp is looking to
acquisitions and joint ventures to aid its chemicals and
plastics growth, according the H.W. Lichtenberger, president of
Chemicals and Plastics.
    Describing this as a major departure in the company's
approach to commercial development, he told the annual new
business forum of the Commercial Development Association "We
are looking to acquisitions and joint ventures when they look
like the fastest and most promising routes to the growth
markets we've identified."
    Not very long ago Union Carbide had the attitude "that if
we couldn't do it ourselves, it wasn't worth doing. Or, if it
was worth doing, we had to go it alone," Lichtenberger
explained.
    He said "there are times when exploiting a profitable
market is done best with a partner. Nor do we see any need to
plow resources into a technology we may not have if we can link
up profitably with someone who is already there."
    He said Carbide has extended its catalyst business that way
and is now extending its specialty chemicals business in the
same way.
 Reuter

Topics: [acq]

Example 3 (No topics):
Article: The total value of Indonesia's exports
fell in calendar 1986 to 15.995 billion dlrs from 18.762
billion in calendar 1985, according to the central bank's
latest report.
    The value of crude oil exports fell to 7.431 billion dlrs
in 1986 from 8.976 billion in 1985, while liquefied natural gas
exports dropped to 2.795 billion dlrs from 3.802 billion in
1985.
    The value of Indonesia's non-oil exports also fell in
calendar 1986, to 5.768 billion dlrs from 5.983 billion in
1985.
 REUTER

Topics: []

Input: The full text of a news article as a string
Output: A list of strings representing the identified topics

Please provide the Python code for this program, including any necessary functions for preprocessing, topic identification, and main execution. Also, include brief comments explaining the logic of your approach.

Program 1

In [None]:
import re
from typing import List

# List of predefined topics
TOPICS = ['acq', 'alum', 'austdlr', 'austral', 'barley', 'bfr', 'bop', 'can', 'carcass', 'castor-meal', 'castor-oil', 'castorseed', 'citruspulp', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'corn-oil', 'cornglutenfeed', 'cotton', 'cotton-meal', 'cotton-oil', 'cottonseed', 'cpi', 'cpu', 'crude', 'cruzado', 'dfl', 'dkr', 'dlr', 'dmk', 'drachma', 'earn', 'escudo', 'f-cattle', 'ffr', 'fishmeal', 'flaxseed', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-meal', 'groundnut-oil', 'heat', 'hk', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'inventories', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-meal', 'lin-oil', 'linseed', 'lit', 'livestock', 'lumber', 'lupin', 'meal-feed', 'mexpeso', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-meal', 'palm-oil', 'palmkernel', 'peseta', 'pet-chem', 'platinum', 'plywood', 'pork-belly', 'potato', 'propane', 'rand', 'rape-meal', 'rape-oil', 'rapeseed', 'red-bean', 'reserves', 'retail', 'rice', 'ringgit', 'rubber', 'rupiah', 'rye', 'saudriyal', 'sfr', 'ship', 'silk', 'silver', 'singdlr', 'skr', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'stg', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tapioca', 'tea', 'tin', 'trade', 'tung', 'tung-oil', 'veg-oil', 'wheat', 'wool', 'wpi', 'yen', 'zinc']

def preprocess_text(text: str) -> str:
    """
    Normalise article text: lower-case it, then delete every character that is
    not an ASCII letter, a digit or whitespace (hyphens are deleted too).
    """
    return re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())

def identify_topics(text: str) -> List[str]:
    """
    Return the TOPICS entries found in whitespace-separated, preprocessed text.

    A topic is reported when it is itself one of the tokens, or when every
    hyphen-separated part of it is a token somewhere in the text.
    """
    tokens = text.split()
    found = set()

    for candidate in TOPICS:
        # Whole-token hit first, otherwise all hyphen parts must be present.
        if candidate in tokens or all(piece in tokens for piece in candidate.split('-')):
            found.add(candidate)

    return list(found)

def process_article(article: str) -> List[str]:
    """
    Preprocess one article and return the topics identified in it.

    Any exception is reported on stdout and an empty list is returned instead.
    """
    try:
        return identify_topics(preprocess_text(article))
    except Exception as e:
        print(f"Error processing article: {str(e)}")
        return []

def main():
    """
    Interactive demo: read one article from standard input via input(),
    run process_article on it and print the identified topics, or a
    "No topics identified." message when the list is empty.
    """
    article = input("Enter the news article text: ")
    topics = process_article(article)
    
    if topics:
        print("Identified topics:", topics)
    else:
        print("No topics identified.")

# NOTE(review): notebook kernels run cells with __name__ == "__main__", so this
# guard presumably fires when the cell executes and main() waits on input(),
# which stalls "Restart & Run All" — confirm whether that is intended.
if __name__ == "__main__":
    main()

In [None]:
article = sample_article_with_n_topics(df, n=2)

In [None]:
article

In [None]:
process_article(article['Body'])

Program 2 (with fuzzy matching)

In [None]:
import re
from typing import List
from fuzzywuzzy import fuzz

# You might need to install the fuzzywuzzy library:
# pip install fuzzywuzzy python-Levenshtein

# List of predefined topics
TOPICS = ['acq', 'alum', 'austdlr', 'austral', 'barley', 'bfr', 'bop', 'can', 'carcass', 'castor-meal', 'castor-oil', 'castorseed', 'citruspulp', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'corn-oil', 'cornglutenfeed', 'cotton', 'cotton-meal', 'cotton-oil', 'cottonseed', 'cpi', 'cpu', 'crude', 'cruzado', 'dfl', 'dkr', 'dlr', 'dmk', 'drachma', 'earn', 'escudo', 'f-cattle', 'ffr', 'fishmeal', 'flaxseed', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-meal', 'groundnut-oil', 'heat', 'hk', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'inventories', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-meal', 'lin-oil', 'linseed', 'lit', 'livestock', 'lumber', 'lupin', 'meal-feed', 'mexpeso', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-meal', 'palm-oil', 'palmkernel', 'peseta', 'pet-chem', 'platinum', 'plywood', 'pork-belly', 'potato', 'propane', 'rand', 'rape-meal', 'rape-oil', 'rapeseed', 'red-bean', 'reserves', 'retail', 'rice', 'ringgit', 'rubber', 'rupiah', 'rye', 'saudriyal', 'sfr', 'ship', 'silk', 'silver', 'singdlr', 'skr', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'stg', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tapioca', 'tea', 'tin', 'trade', 'tung', 'tung-oil', 'veg-oil', 'wheat', 'wool', 'wpi', 'yen', 'zinc']

def preprocess_text(text: str) -> str:
    """
    Lower-case the text and drop everything except ASCII letters, digits and
    whitespace.
    """
    lowered = text.lower()
    return re.sub(r'[^a-zA-Z0-9\s]', '', lowered)

def fuzzy_match(word: str, topic: str, threshold: int = 80) -> bool:
    """
    Tell whether `word` is close enough to `topic`.

    The similarity score comes from fuzz.ratio; the result is True when that
    score is at least `threshold`.
    """
    score = fuzz.ratio(word, topic)
    return score >= threshold

def identify_topics(text: str) -> List[str]:
    """
    Return the TOPICS entries found in preprocessed text, allowing fuzzy hits.

    A topic is reported when it equals one of the whitespace-separated tokens,
    or when each of its hyphen-separated parts fuzzy-matches at least one token.
    """
    tokens = text.split()
    matched = set()

    for candidate in TOPICS:
        if candidate in tokens:
            matched.add(candidate)
            continue

        # No exact token: every part of the topic needs a fuzzy hit somewhere.
        pieces = candidate.split('-')
        if all(any(fuzzy_match(token, piece) for token in tokens) for piece in pieces):
            matched.add(candidate)

    return list(matched)

def process_article(article: str) -> List[str]:
    """
    Run preprocessing and topic identification on a single article.

    Failures are printed and turned into an empty result.
    """
    try:
        cleaned = preprocess_text(article)
        return identify_topics(cleaned)
    except Exception as e:
        print(f"Error processing article: {str(e)}")
        return []

def main():
    """
    Interactive demo: read one article from standard input via input(),
    run process_article on it and print the identified topics, or a
    "No topics identified." message when the list is empty.
    """
    article = input("Enter the news article text: ")
    topics = process_article(article)
    
    if topics:
        print("Identified topics:", topics)
    else:
        print("No topics identified.")

# NOTE(review): notebook kernels run cells with __name__ == "__main__", so this
# guard presumably fires when the cell executes and main() waits on input(),
# which stalls "Restart & Run All" — confirm whether that is intended.
if __name__ == "__main__":
    main()

In [None]:
article = sample_article_with_n_topics(df, n=0)

In [None]:
article

In [None]:
process_article(article['Body'])

### Llama 3.1 70B

In [None]:
import re
from typing import List

def preprocess_text(text: str) -> str:
    """
    Strip special characters from the text, then lower-case it.

    Args:
    text (str): The input text.

    Returns:
    str: The text with everything except ASCII letters, digits and
    whitespace removed, in lower case.
    """
    stripped = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return stripped.lower()

def identify_topics(text: str, topics: List[str]) -> List[str]:
    """
    Select the topics that occur in the text.

    The check is a plain substring test on the whole text, so a topic also
    counts when it appears inside a longer word.

    Args:
    text (str): The preprocessed text.
    topics (List[str]): The list of possible topics.

    Returns:
    List[str]: The matching topics, in the order they appear in `topics`.
    """
    return [topic for topic in topics if topic in text]

def main(text: str) -> List[str]:
    """
    Main function to identify topics in a news article.

    Args:
    text (str): The full text of a news article.

    Returns:
    List[str]: A list of strings representing the identified topics.
    """
    # Preprocess the input text
    preprocessed_text = preprocess_text(text)

    # Define the list of possible topics
    # This list is local to main() and rebuilt on every call; it does not read
    # any notebook-level topic variable.
    topics = ['acq', 'alum', 'austdlr', 'austral', 'barley', 'bfr', 'bop', 'can', 'carcass', 'castor-meal', 'castor-oil', 'castorseed', 'citruspulp', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'corn-oil', 'cornglutenfeed', 'cotton', 'cotton-meal', 'cotton-oil', 'cottonseed', 'cpi', 'cpu', 'crude', 'cruzado', 'dfl', 'dkr', 'dlr', 'dmk', 'drachma', 'earn', 'escudo', 'f-cattle', 'ffr', 'fishmeal', 'flaxseed', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-meal', 'groundnut-oil', 'heat', 'hk', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'inventories', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-meal', 'lin-oil', 'linseed', 'lit', 'livestock', 'lumber', 'lupin','meal-feed','mexpeso','money-fx','money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-meal', 'palm-oil', 'palmkernel', 'peseta', 'pet-chem', 'platinum', 'plywood', 'pork-belly', 'potato', 'propane', 'rand', 'rape-meal', 'rape-oil', 'rapeseed','red-bean','reserves','retail', 'rice', 'ringgit', 'rubber', 'rupiah', 'rye','saudriyal','sfr','ship','silk','silver','singdlr','skr','sorghum','soy-meal','soy-oil','soybean','stg','strategic-metal','sugar','sun-meal','sun-oil','sunseed', 'tapioca', 'tea', 'tin', 'trade', 'tung', 'tung-oil','veg-oil', 'wheat', 'wool', 'wpi', 'yen', 'zinc']

    # Identify topics in the preprocessed text
    identified_topics = identify_topics(preprocessed_text, topics)

    # Return the list of identified topics
    return identified_topics

# Example usage.
# The trailing comments give the reference labels from the prompt, not what
# main() prints. identify_topics does substring matching on the text, so the
# first call reports 'heat' (inside "wheat") and cannot report 'grain' (the
# word is not in the text), and the third call reports at least 'dlr'
# (inside "dlrs").
text = "The Commodity Credit Corporation, CCC, has accepted an export bonus offer to cover the sale of 37,000 long tons of wheat flour to North Yemen, the U.S. Agriculture Department said."
print(main(text))  # Reference labels: ['wheat', 'grain']

text = "Union Carbide Corp is looking to acquisitions and joint ventures to aid its chemicals and plastics growth, according the H.W. Lichtenberger, president of Chemicals and Plastics."
print(main(text))  # Reference labels: ['acq']

text = "The total value of Indonesia's exports fell in calendar 1986 to 15.995 billion dlrs from 18.762 billion in calendar 1985, according to the central bank's latest report."
print(main(text))  # Reference labels: []

### GPT-4o

In [None]:
import re

# Pre-defined list of topics
TOPICS = [
    'acq', 'alum', 'austdlr', 'austral', 'barley', 'bfr', 'bop', 'can', 'carcass', 
    'castor-meal', 'castor-oil', 'castorseed', 'citruspulp', 'cocoa', 'coconut', 
    'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'corn-oil', 'cornglutenfeed', 
    'cotton', 'cotton-meal', 'cotton-oil', 'cottonseed', 'cpi', 'cpu', 'crude', 'cruzado', 
    'dfl', 'dkr', 'dlr', 'dmk', 'drachma', 'earn', 'escudo', 'f-cattle', 'ffr', 'fishmeal', 
    'flaxseed', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-meal', 
    'groundnut-oil', 'heat', 'hk', 'hog', 'housing', 'income', 'instal-debt', 'interest', 
    'inventories', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-meal', 
    'lin-oil', 'linseed', 'lit', 'livestock', 'lumber', 'lupin', 'meal-feed', 'mexpeso', 
    'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 
    'oilseed', 'orange', 'palladium', 'palm-meal', 'palm-oil', 'palmkernel', 'peseta', 
    'pet-chem', 'platinum', 'plywood', 'pork-belly', 'potato', 'propane', 'rand', 'rape-meal', 
    'rape-oil', 'rapeseed', 'red-bean', 'reserves', 'retail', 'rice', 'ringgit', 'rubber', 
    'rupiah', 'rye', 'saudriyal', 'sfr', 'ship', 'silk', 'silver', 'singdlr', 'skr', 'sorghum', 
    'soy-meal', 'soy-oil', 'soybean', 'stg', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 
    'sunseed', 'tapioca', 'tea', 'tin', 'trade', 'tung', 'tung-oil', 'veg-oil', 'wheat', 'wool', 
    'wpi', 'yen', 'zinc'
]

def preprocess_text(text):
    """
    Keep only ASCII letters and whitespace (digits, punctuation and hyphens
    are deleted), then return the result in lower case.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    return letters_only.lower()

def identify_topics(text, topics):
    """
    Return the topics that occur as whole words in the article.

    The raw text is passed through preprocess_text first. Each topic is then
    searched with word boundaries on both sides; a hyphen in the topic may be
    matched by either a hyphen or a single whitespace character. The result
    keeps the order of `topics`.
    """
    processed_text = preprocess_text(text)

    return [
        topic
        for topic in topics
        if re.search(r'\b' + topic.replace('-', r'[-\s]') + r'\b', processed_text)
    ]

def main():
    """
    Run identify_topics over three hard-coded example articles and print the
    topics found for each one, numbered from 1.
    """
    # Example articles
    articles = [
        "The Commodity Credit Corporation, CCC, has accepted an export bonus offer to cover the sale of 37,000 long tons of wheat flour to North Yemen, the U.S. Agriculture Department said. The wheat four is for shipment March-May and the bonus awarded was 119.05 dlrs per tonnes and will be paid in the form of commodities from the CCC inventory. The bonus was awarded to the Pillsbury Company. The wheat flour purchases complete the Export Enhancement Program initiative announced in April, 1986, it said. Reuter",
        "Union Carbide Corp is looking to acquisitions and joint ventures to aid its chemicals and plastics growth, according the H.W. Lichtenberger, president of Chemicals and Plastics. Describing this as a major departure in the company's approach to commercial development, he told the annual new business forum of the Commercial Development Association 'We are looking to acquisitions and joint ventures when they look like the fastest and most promising routes to the growth markets we've identified.' Not very long ago Union Carbide had the attitude 'that if we couldn't do it ourselves, it wasn't worth doing. Or, if it was worth doing, we had to go it alone,' Lichtenberger explained. He said 'there are times when exploiting a profitable market is done best with a partner. Nor do we see any need to plow resources into a technology we may not have if we can link up profitably with someone who is already there.' He said Carbide has extended its catalyst business that way and is now extending its specialty chemicals business in the same way. Reuter",
        "The total value of Indonesia's exports fell in calendar 1986 to 15.995 billion dlrs from 18.762 billion in calendar 1985, according to the central bank's latest report. The value of crude oil exports fell to 7.431 billion dlrs in 1986 from 8.976 billion in 1985, while liquefied natural gas exports dropped to 2.795 billion dlrs from 3.802 billion in 1985. The value of Indonesia's non-oil exports also fell in calendar 1986, to 5.768 billion dlrs from 5.983 billion in 1985. REUTER"
    ]
    
    # Process each article
    for i, article in enumerate(articles):
        topics = identify_topics(article, TOPICS)
        print(f"Article {i+1}: Topics: {topics}")

# NOTE(review): notebook kernels run cells with __name__ == "__main__", so this
# guard presumably fires and the three result lines are printed as soon as the
# cell executes — confirm.
if __name__ == "__main__":
    main()


## TODO

Feed the generated function back to the model together with some examples of the mistakes it made, ask it to improve the function, and check whether the revision gains accuracy or robustness beyond simple pattern matching.

## Feedback improvements

The idea is to give the LLM its own function along with some examples of the mistakes it made, so that it can propose improvements.

### Iteration 1

In [None]:
import re
from typing import List

# List of predefined topics
TOPICS = ['acq', 'alum', 'austdlr', 'austral', 'barley', 'bfr', 'bop', 'can', 'carcass', 'castor-meal', 'castor-oil', 'castorseed', 'citruspulp', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'corn-oil', 'cornglutenfeed', 'cotton', 'cotton-meal', 'cotton-oil', 'cottonseed', 'cpi', 'cpu', 'crude', 'cruzado', 'dfl', 'dkr', 'dlr', 'dmk', 'drachma', 'earn', 'escudo', 'f-cattle', 'ffr', 'fishmeal', 'flaxseed', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-meal', 'groundnut-oil', 'heat', 'hk', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'inventories', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-meal', 'lin-oil', 'linseed', 'lit', 'livestock', 'lumber', 'lupin', 'meal-feed', 'mexpeso', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-meal', 'palm-oil', 'palmkernel', 'peseta', 'pet-chem', 'platinum', 'plywood', 'pork-belly', 'potato', 'propane', 'rand', 'rape-meal', 'rape-oil', 'rapeseed', 'red-bean', 'reserves', 'retail', 'rice', 'ringgit', 'rubber', 'rupiah', 'rye', 'saudriyal', 'sfr', 'ship', 'silk', 'silver', 'singdlr', 'skr', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'stg', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tapioca', 'tea', 'tin', 'trade', 'tung', 'tung-oil', 'veg-oil', 'wheat', 'wool', 'wpi', 'yen', 'zinc']

def preprocess_text(text: str) -> str:
    """
    Lower-case the text and turn every newline into a single space.
    Punctuation and hyphens are left in place.
    """
    return text.lower().replace('\n', ' ')

def identify_topics(text: str) -> List[str]:
    """
    Return the TOPICS entries found in the preprocessed text.

    A topic is reported when it occurs as a whole word, optionally followed by
    a plural "s". A hyphenated topic is also reported when each of its parts
    occurs as a whole word somewhere in the text.
    """
    matches = set()

    for topic in TOPICS:
        # The optional "s" also covers the bare topic, so one search handles
        # both the exact and the plural form.
        if re.search(r'\b' + re.escape(topic) + r's?\b', text):
            matches.add(topic)
            continue

        if '-' not in topic:
            continue

        # Compound topic: accept it when every part is present on its own.
        if all(re.search(r'\b' + re.escape(part) + r'\b', text) for part in topic.split('-')):
            matches.add(topic)

    return list(matches)

def process_article(article: str) -> List[str]:
    """
    Preprocess a single article and return its identified topics.

    On any exception the error is printed and an empty list is returned.
    """
    try:
        normalized = preprocess_text(article)
        return identify_topics(normalized)
    except Exception as e:
        print(f"Error processing article: {str(e)}")
        return []

In [None]:
article = sample_article_with_n_topics(df, n=5)

In [None]:
article

In [None]:
process_article(article['Body'])

In [None]:
article = sample_article_with_n_topics(df, n=1)

In [None]:
article

In [None]:
process_article(article['Body'])