In [12]:
! pip install -e ..

Obtaining file:///Users/michael.pryse-davies/Work/themefinder_structured_test/themefinder
  Installing build dependencies ... [?25ldone
[?25h  Checking if build backend supports build_editable ... [?25ldone
[?25h  Getting requirements to build editable ... [?25ldone
[?25h  Preparing editable metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: themefinder
  Building editable for themefinder (pyproject.toml) ... [?25ldone
[?25h  Created wheel for themefinder: filename=themefinder-0.6.3-py3-none-any.whl size=4752 sha256=5afabaeecd4065d9ce5ab7bbdf41eeb01d5d3442536becaa1bf98ea7ae0a021c
  Stored in directory: /private/var/folders/9t/_n30f4mj5l30jj0sn1bgg7tn79_mc1/T/pip-ephem-wheel-cache-my8sza3_/wheels/7e/90/99/15f64c5313fafa73552770d581a4101838856f7605105ce36b
Successfully built themefinder
Installing collected packages: themefinder
  Attempting uninstall: themefinder
    Found existing installation: themefinder 0.6.2
    Uninstalling themefinder-0.6.2:


In [1]:
import json
import pandas as pd
import themefinder
from langchain_openai import AzureChatOpenAI
import string

In [2]:
# Load the response data from ./example_data.json, then record the question
# that those responses are answering.

responses = pd.read_json(path_or_buf="./example_data.json")

question = "What improvements would you most like to see in local public transportation?"

In [3]:
# Check the dataframe has the following columns: response_id, response
# response_ids should start from 1
# Both requirements are asserted so that a malformed input file fails here
# with a clear message instead of part-way through the pipeline.
missing_columns = {"response_id", "response"} - set(responses.columns)
assert not missing_columns, f"responses is missing required columns: {sorted(missing_columns)}"
assert responses["response_id"].min() == 1, "response_id values should start from 1"

# Keep the frame as the final expression so the notebook still renders it.
responses

Unnamed: 0,response_id,response
0,1,"Buses need to run more frequently, especially ..."
1,2,The schedule says every 15 minutes but I've be...
2,3,"Service is pretty reliable during the day, but..."
3,4,I rely on the 7:45 AM bus to get to work on ti...
4,5,24/7 service on main routes would be amazing f...
...,...,...
195,196,Recycling programs on vehicles and at stations.
196,197,Energy-efficient LED lighting throughout the t...
197,198,Biodiesel or other alternative fuels for exist...
198,199,Green roof systems on transit facilities for e...


In [4]:
# Build the LLM object that is passed to the themefinder calls below. Pick the
# class that fits your use-case, e.g. ChatGoogleGenerativeAI if using Google's
# Gemini or ChatAnthropic for Claude.
# NOTE: make sure your .env file is correctly set up with the correct API key
# and any other variables you need.
# NOTE(review): none of the visible cells loads the .env file explicitly -
# confirm the variables actually reach the kernel's environment.

llm = AzureChatOpenAI(temperature=0, model_name="gpt-4o")

In [None]:
# Running the whole pipeline end-to-end in one go.
results = await themefinder.find_themes(
    responses, 
    llm=llm, 
    question=question,
    )

2025-06-25 11:26:10,880 INFO: Running sentiment analysis on 200 responses
2025-06-25 11:26:10,883 INFO: Running batch and run with batch size 20
2025-06-25 11:26:18,703 INFO: Running theme generation on 200 responses
2025-06-25 11:26:18,706 INFO: Running batch and run with batch size 50
2025-06-25 11:26:37,103 INFO: Running theme condensation on 198 themes
2025-06-25 11:26:37,105 INFO: 198 larger than batch size, using recursive theme condensation
2025-06-25 11:26:37,108 INFO: Running batch and run with batch size 75


In [6]:
# Show the table stored under the "themes" key of the pipeline results.
results["themes"]

Unnamed: 0,topic_id,topic,source_topic_count
0,1,Affordable and inclusive pricing: Income-based...,5
1,2,Suburban and cross-town connectivity: Need for...,3
2,3,Information Accessibility: Make schedule and r...,2
3,4,Convenience and amenities: Provide luggage rac...,3
4,5,Lost and Found Service: Improve the effectiven...,1
5,6,Cleanliness and maintenance: Enhance cleanline...,14
6,7,Safety and security enhancements: Better light...,10
7,8,Payment System Improvements: Simplify and mode...,4
8,9,WiFi and Connectivity: Provide reliable and fa...,4
9,10,Effective Communication: Enhance announcements...,2


In [None]:
# The results of each stage of the pipeline can be viewed by accessing the keys of the returned dictionary e.g.
# Only the last expression of a cell is rendered automatically, so the first
# table is passed to display() (available by default in Jupyter kernels) to
# make both examples visible.
display(results["themes"])
# or
results["mapping"]

Unnamed: 0,response_id,response,labels,reasons,stances
0,1,"Buses need to run more frequently, especially ...",[9],[The response mentions the need for buses to r...,[NEGATIVE]
1,2,The schedule says every 15 minutes but I've be...,[3],[The response mentions the need for real-time ...,[NEGATIVE]
2,3,"Service is pretty reliable during the day, but...",[9],[The response mentions that service is unrelia...,[NEGATIVE]
3,4,I rely on the 7:45 AM bus to get to work on ti...,[9],[The response mentions that the bus is late at...,[NEGATIVE]
4,5,24/7 service on main routes would be amazing f...,[9],[The response mentions the need for 24/7 servi...,[POSITIVE]
...,...,...,...,...,...
195,196,Recycling programs on vehicles and at stations.,[4],[The response mentions recycling programs on v...,[POSITIVE]
196,197,Energy-efficient LED lighting throughout the t...,[4],[The response mentions energy-efficient LED li...,[POSITIVE]
197,198,Biodiesel or other alternative fuels for exist...,[4],[The response mentions biodiesel or other alte...,[POSITIVE]
198,199,Green roof systems on transit facilities for e...,[4],[The response mentions green roof systems on t...,[POSITIVE]


In [None]:
# Show the "themes" table from the pipeline results; the next cell builds an
# edited copy of its topic_id and topic columns.
results["themes"]

Unnamed: 0,topic_id,topic,source_topic_count
0,1,"Comfort and Amenities: Enhance seat comfort, t...",18
1,2,Accessibility Enhancements: Ensure low-floor b...,18
2,3,Information and Usability: Improve website usa...,26
3,4,Environmental Sustainability: Adopt electric b...,12
4,5,"Safety and Security: Enhance route safety, add...",18
5,6,Coordination and Efficiency: Improve coordinat...,18
6,7,Driver Training and Customer Service: Enhance ...,17
7,8,Fare Structure and Payment Systems: Address is...,22
8,9,Transit Reliability and Punctuality: Improve b...,10
9,10,Cleanliness and Maintenance: Enhance cleanline...,10


In [None]:
# The themes generated by the LLM can be edited before mapping, for example by
# merging similar themes or adding new ones such as a default fallback theme
# like "Other". Modify the themes table directly and feed it into the mapping
# stage of the pipeline.
from themefinder import theme_mapping

# Work on a copy of just the two columns being edited, so results["themes"]
# itself is left untouched.
themes = results["themes"].loc[:, ["topic_id", "topic"]].copy()

# Append the fallback theme as a new last row; its id is the uppercase letter
# at the same position as that row.
# NOTE(review): the topic_id values rendered earlier in this notebook are
# numeric, whereas this row gets a letter - confirm the mapping stage accepts
# mixed id types.
fallback_position = len(themes)
themes.loc[fallback_position] = {
    "topic_id": string.ascii_uppercase[fallback_position],
    "topic": "Other: The response does not match any of the listed themes",
}

In [None]:
# It is possible for an LLM to be unable to process a response, if is too long or violates the models content filters, these responses can be reviewed in the 2nd element of the returned object for each task
mapping, unprocessed = await theme_mapping(
    responses,
    llm=llm,
    refined_themes_df=themes,
    question=question,
)

2025-06-25 11:21:09,669 INFO: Running theme mapping on 200 responses using 11 themes
2025-06-25 11:21:09,672 INFO: Running batch and run with batch size 20


CancelledError: 

In [None]:
# Export the mapping to a spreadsheet.
mapping.to_excel(excel_writer="mapping.xlsx")