In [None]:
import pandas as pd
import numpy as np
import rtsvg
rt = rtsvg.RACETrack()
import os
from ollama import chat
from ollama import ChatResponse
from pydantic import BaseModel
# Name of the Ollama model that promptModel() talks to.
model = 'deepseek-r1:32b'


def promptModel(prompt):
    """Send *prompt* to ``model`` as a single user message and return the reply text."""
    user_message = {'role': 'user', 'content': prompt}
    reply: ChatResponse = chat(model=model, messages=[user_message])
    return reply['message']['content']


# Throwaway prompt issued only to force the model load.
_results_ = promptModel('What is 55*3?  Return a single number.')

# Overall Game Plan For Analysis
Given the overall task from the challenge and the data that was included, we ask the model to come up with a plan for the analysis.  All three of the responses look reasonable.  We'll use the parts that all three responses agree on.

In [None]:
# Define the overall task of the challenge (literally copied from the VAST Challenge)
# NOTE: this text is concatenated into the prompts below, so it is kept exactly as copied.
_overall_task_ = '''A staff members of the GASTech oil and gas company has disappeared on location on the island of Kronos.
A group named the Protectors of Kronos (POK) was the prime suspect in the disappearance.
The goal of our analysis is to determine who was behind the disappearance and generate a synopsis of events.'''
# Description of the data sources; appended to the task text when building a prompt.
_available_data_ = '''The following data is available:
- Multiple types of text data to provide a timeline of key events and characterize the POK
- individuals' movement and financial data to provide patterns of daily life
- real-time streaming social media and emergency service data to provide hostage and kidnapper information'''
# The prompt below is left commented out so that re-running the cell does not re-query the model.
#_results_ = promptModel(_overall_task_ + '\n\n' + _available_data_ + '\n\nWhat is the first step of our investigation?') # 0 failed attempts, first three responses below
#print(_results_)
# Captured model output for the "first step of our investigation" prompt.
# These three strings are stored verbatim for reference; nothing in this cell processes them.
# Response 1:
_overall_plan_ = '''
The first step in investigating the disappearance of the GASTech staff member on Kronos island is to conduct a comprehensive background analysis. This involves:
1. **Researching Backgrounds**: Investigate both the missing individual's role at GASTech, recent interactions, and behaviors, as well as the POK group's history, motives, past activities, and any previous incidents they've been involved in.
2. **Text Data Analysis**: Review available text data including news articles, internal communications, and social media to establish timelines of key events and characterize POK's activities.
3. **Movement Patterns**: Analyze movement data to track the locations of both the missing person and POK members before the disappearance, identifying any intersections or anomalies.
4. **Financial Data Review**: Examine financial transactions for unusual activity that might indicate planning or motives related to the disappearance.
5. **Real-Time Data Utilization**: Use social media and emergency service data to understand immediate responses and witness accounts post-disappearance.
6. **Contextual Factors**: Consider external factors like weather conditions and local tensions on Kronos that could provide context for the incident.
7. **Data Security and Accuracy**: Ensure all data sources are secure, reliable, and analyzed systematically within a structured framework.
This foundational research will establish a timeline, uncover potential patterns or motives, and guide further investigative steps.'''
# Response 2 (another version of what the model recommends):
_overall_plan_2_ = '''
The first step in the investigation would be to gather and verify all available data related to the disappearance of the GASTech staff member on Kronos island. 
This involves confirming the exact timeline, location, and circumstances surrounding the disappearance, as well as obtaining detailed information about the missing individual's movements, financial activities, and any potential connections or interactions with the Protectors of Kronos (POK). 
Additionally, it is crucial to collect and analyze real-time social media and emergency service data for any immediate leads or witness accounts. 
This foundational step ensures a comprehensive understanding of the situation before proceeding with further investigative actions.'''
# Response 3 (a third version of the model's recommendation):
_overall_plan_3_ = '''
The first step in investigating the disappearance of the GASTech staff member on Kronos island is to systematically gather all available data from reliable sources. Here's a structured approach:
1. **Text Data Collection:**
   - Gather news articles, official reports, and internal communications to build a timeline of events and understand the motivations or history of the Protectors of Kronos (POK).
2. **Movement and Financial Data:**
   - Obtain movement data from internal company records or tracking systems to identify patterns in the staff member's movements before their disappearance.
   - Review financial transactions for any unusual activity, such as large sums received or paid around the time of the disappearance.
3. **Social Media and Emergency Services Monitoring:**
   - Use real-time tools to monitor social media platforms for posts related to the disappearance, including potential communications from POK.
   - Access emergency service logs and incident reports to gather information on any relevant calls or activities around the time of the disappearance.
4. **Team Assembly and Legal Consultation:**
   - Assemble a team with access to these datasets, involving IT departments for internal data and law enforcement for emergency services information.
   - Consult legal advisors to ensure ethical and legal compliance when accessing personal data, balancing thoroughness with urgency.
By systematically collecting this data, the investigation will establish a solid foundation for further analysis.'''

# Building A Timeline
In this section, we'll ask the model to build a timeline of events based on news articles.  The first and last responses are similar.  The first response captured the following events (copied, pasted, and formatted here):
1. **GAStech Employees Kidnapped (January 20, 2014):**
   - Media reports indicate that several GAStech employees were kidnapped. The exact number of victims and their identities remain unclear.
   - A mystery jet departed from Abila Airport on the same day, but the passengers on board have not been identified.
2. **Construction Site Incident (January 20, 2014):**
   - A blog post describes a robbery or kidnapping event involving "dark clothed" figures at a GAStech construction site. The exact number of employees involved is unknown due to confusion during the evacuation.
3. **POK Labeled as a Public Threat (July 28, 2011):**
   - Government spokesman Rufus Drymiau declared that POK (Protectors of Kronos) pose a public threat due to violent demonstrations and vandalism. The group has caused significant damage to buildings and properties.
4. **Elian Karel's Death in Jail (June 20, 2009):**
   - Elian Karel, a member of POK, died while incarcerated at Abila City Prison. He was expected to be released shortly before his death.
5. **GAStech and Environmental Issues:**
   - In 1995, GAStech's HASR technique for gas extraction was banned in Tethys due to environmental concerns about water contamination.
   - President Araullo announced plans to build a new Abila General Hospital in 1995.
These events suggest that GAStech employees were targeted in a kidnapping incident on January 20, 2014. The mystery jet and the construction site incident are likely related. 
Additionally, POK's history of violent behavior and Elian Karel's death may provide context for potential involvement or connections to these events.

In [None]:
# Timeline task: the wording is distilled from what the model produced for the "overall plan".
_task_1_ = 'Our first task is to build a timeline of events based on news articles.  Here are all of the news articles.  Build a timeline of relevant events.'
_dir_ = '../../../data/2014_vast/MC1/'

# One row per sentence; the columns used below are file, sentence_no, sentence, title and published.
df = pd.read_parquet(_dir_ + 'articles.parquet')


def _format_article_(sentences_df):
    """Render the sentence rows of one article as a single title / date / text block."""
    ordered = sentences_df.sort_values('sentence_no')
    header_row = ordered.iloc[0]
    # NOTE(review): sentences are concatenated with no separator -- confirm that each
    # sentence already carries its own trailing whitespace.
    body = ''.join(ordered['sentence'])
    return f'Article Title: {header_row["title"]}\n\nArticle Date: {header_row["published"]}\n\nArticle Text: {body}'


# One formatted string per source file, in groupby('file') order.
_articles_ = [_format_article_(file_df) for _, file_df in df.groupby('file')]
# All articles in a single blob, used when the whole corpus is put into one prompt.
_all_articles_ = '\n\n'.join(_articles_)
#_results_ = promptModel(_overall_task_ + '\n\n' + _available_data_ + '\n\n' + _task_1_ + '\n\n' + _all_articles_) # 0 failed attempts, first three responses below
#print(_results_)
# Captured model output for the timeline prompt (task + available data + _task_1_ + all articles).
# The three strings are stored verbatim for reference; nothing in this cell processes them.
# Response 1:
_article_timeline_ = '''
Based on the provided articles, here is a summary of the key events related to your query:
1. **GAStech Employees Kidnapped (January 20, 2014):**
   - Media reports indicate that several GAStech employees were kidnapped. The exact number of victims and their identities remain unclear.
   - A mystery jet departed from Abila Airport on the same day, but the passengers on board have not been identified.
2. **Construction Site Incident (January 20, 2014):**
   - A blog post describes a robbery or kidnapping event involving "dark clothed" figures at a GAStech construction site. The exact number of employees involved is unknown due to confusion during the evacuation.
3. **POK Labeled as a Public Threat (July 28, 2011):**
   - Government spokesman Rufus Drymiau declared that POK (Protectors of Kronos) pose a public threat due to violent demonstrations and vandalism. The group has caused significant damage to buildings and properties.
4. **Elian Karel's Death in Jail (June 20, 2009):**
   - Elian Karel, a member of POK, died while incarcerated at Abila City Prison. He was expected to be released shortly before his death.
5. **GAStech and Environmental Issues:**
   - In 1995, GAStech's HASR technique for gas extraction was banned in Tethys due to environmental concerns about water contamination.
   - President Araullo announced plans to build a new Abila General Hospital in 1995.
These events suggest that GAStech employees were targeted in a kidnapping incident on January 20, 2014. The mystery jet and the construction site incident are likely related. 
Additionally, POK's history of violent behavior and Elian Karel's death may provide context for potential involvement or connections to these events.'''
# Response 2 (its first item supplies the event wording reused as _event_info_ in later cells):
_article_timeline_2_ = '''
The situation involving GAStech and related entities in Kronos appears to be marked by several interconnected factors:
1. **Recent Incidents (2014):** On January 20, 2014, GAStech employees were targeted in a kidnapping and a robbery, with "dark clothed" figures involved. These events suggest ongoing conflict or danger around GAStech's operations.
2. **Historical Context:** In 1995, President Araullo announced the construction of a new hospital due to issues at the old facility. The current incidents involving GAStech employees may be unrelated but highlight past infrastructure concerns.
3. **Political Tensions:** POK (Protectors of Kronos) has been labeled a public threat for violent actions, potentially linking them to the recent kidnappings as part of broader conflicts with the government.
4. **Tragedy and Implications:** Elian Karel's death in 2009 while incarcerated might have influenced POK's activities or government handling, contributing to current tensions.
5. **Environmental Concerns:** GAStech's banned technique from 1995 could have left a legacy of environmental issues, making them targets for attacks due to past controversies.
6. **Coordinated Efforts:** The clustering of events on January 20, 2014, suggests possible coordination against GAStech by groups like POK, amid political and environmental tensions.
In summary, the incidents reflect a complex interplay of political unrest, environmental issues, and potential corruption, leading to violent actions against GAStech. 
The government's crackdown on POK may further escalate these tensions.'''
# Response 3:
_article_timeline_3_ = '''
It seems like you've provided several news articles spanning different dates and topics, some of which mention the Protectors of Kronos (POK), GAStech employees, and other related events. Based on the content of these articles, here is a summary of the key points:
1. **GAStech Employees Kidnapped**: On January 20, 2014, media reports indicated that several GAStech employees were kidnapped. The identity of the passengers on a mysterious jet departing from Abila Airport remains unknown.
2. **Elian Karel's Death**: In June 2009, Elian Karel, a member of the Protectors of Kronos (POK), died while unlawfully incarcerated at Abila City Prison. He was expected to be released soon and return to his work with POK.
3. **POK Labeled a Public Threat**: In July 2011, government spokesman Rufus Drymiau declared that POK posed a public threat due to violent demonstrations, vandalism, and conflicts with police. The group caused significant damage to buildings and businesses, showing disregard for the safety of others. President Kapelou vowed not to let their actions go unchecked.
4. **GAStech's Prohibited Technique**: In April 1995, the government of Tethys banned GAStech's innovative technique called Hyperactive Acid Substratum Retirement (HASR) due to concerns about water contamination from waste produced during the process.
5. **New Hospital Construction**: President Araullo announced in November 1995 that a new modern Abila General Hospital would be built, replacing the outdated facility constructed in 1930.
6. **Dark-Clothed Figures and GAStech Employees**: On January 20, 2014, "dark clothed" figures were observed at what appeared to be a robbery or event targeting GAStech construction employees. The exact number of people involved was unclear due to confusion during the evacuation.
### Possible Connections:
- **POK's Violent History**: POK has been associated with violent demonstrations and vandalism, which could potentially link them to incidents involving GAStech employees.
- **GAStech Incidents**: The 2014 reports about kidnapped GAStech employees and mysterious activities at the construction site suggest that there might be ongoing issues or conflicts related to GAStech's operations.
If you have specific questions about these articles or need further analysis, feel free to ask!'''


# Sourcing The Timeline To Articles
Feeding the model all of the articles at once and asking it to source the kidnapping event did not work.  Instead, each article has to be fed to the model separately, asking it to score that article for relevance to the event.  The results from the "separate article" task looked reasonable, i.e., a simple keyword check (does the article contain "kidnap"?) seemed to correlate with the articles marked as "relevant".

In [None]:

# Give some sourcing for the January 20 Kidnapping event...  copying the event from the above...
# Prompt prefix: asks for the article title, published date and a relevant excerpt for one event.
_sourcing_task_ = '''Provide specific article sourcing for the the following event.  
The sourcing should include the article title, the article published date, and an excerpt of the relevant text:\n'''
# The event to be sourced; the wording matches the first item of the second timeline response.
_event_info_ = '''On January 20, 2014, GAStech employees were targeted in a kidnapping and a robbery, with "dark clothed" figures involved.'''
# The prompt is left commented out so that re-running the cell does not re-query the model.
#_results_ = promptModel(_sourcing_task_ + '\n\n' + _event_info_ + '\n\n' + _all_articles_) # 2 failed attempts ... had to focus the event to just the kidnapping part... but still had 4 failure after that
#print(_results_)
# Captured model output for the all-articles sourcing prompt, stored verbatim for reference.
# Presumably these are examples of the failed attempts noted on the commented-out prompt -- TODO confirm.
# NOTE(review): the responses are stored under the `_sourcing_task_*` prompt names.  The first
# assignment rebinds `_sourcing_task_`, so after this point it no longer holds the prompt prefix.
# Response 1:
_sourcing_task_ = '''The collection of articles paints a complex picture of life in Kronos, particularly focusing on Abila City, its challenges, and the activities of various groups and organizations such as GAStech, POK (Protectors of Kronos), and government entities. 
Below is an analysis of the key points, context, patterns, connections, and implications.
---
### **Key Points from the Articles**
1. **GAStech Employees Kidnapped (2014):**
   - Media reports indicate that GAStech employees were kidnapped, and a mysterious jet departed Abila Airport on January 20, 2014.
   - A blog post describes "dark clothed" individuals involved in what appears to be a robbery targeting GAStech construction employees.
2. **POK as a Public Threat (2011):**
   - The government labeled POK (Protectors of Kronos) as a violent group responsible for property damage and conflicts with police.
   - POK's activities suggest ongoing tensions between the organization and authorities, possibly linked to broader social or political grievances.
3. **Abila General Hospital Crisis (2014):**
   - The hospital is overcrowded, understaffed, lacks essential equipment, and struggles with security issues.
   - Plans for a new hospital announced in 1995 were never realized, highlighting long-standing infrastructure neglect.
4. **Environmental Concerns and Industrial Practices:**
   - A 1995 article discusses the banning of GAStech's HASR (Hydraulic Accelerated Subterranean Recovery) technique in Tethys due to water contamination risks.
   - This suggests a history of environmental and industrial concerns tied to GAStech’s operations.
5. **Political and Social Unrest:**
   - The death of Elian Karel in Abila City Prison in 2009, while awaiting release to rejoin POK, indicates tensions between the government and POK.
   - Recent violent incidents involving POK suggest ongoing instability and unrest.
---
### **Context**
- **Infrastructural Neglect:**
  The failure to build a new hospital despite promises made in 1995 reflects systemic issues with governance and resource allocation. This neglect has likely exacerbated public frustration and contributed to the current healthcare crisis.
- **Environmental and Industrial Exploitation:**
  GAStech’s banned technique (HASR) highlights concerns about industrial practices prioritizing profit over environmental and human safety. The fact that the company continued operating elsewhere after losing Tethys' business suggests a pattern of disregarding regulations or local concerns.
- **POK's Role in Social and Political Conflict:**
  POK emerged as a violent group, exploiting social grievances to challenge the government. Their actions suggest a breakdown in trust between the authorities and the public, likely fueled by economic hardship and neglect.
---
### **Patterns**
1. **Cycle of Neglect and Violence:**
   - Years of infrastructural neglect (e.g., failing hospitals) have created fertile ground for social unrest.
   - POK capitalized on these grievances, leading to violent clashes with authorities.
2. **GAStech’s Troubled History:**
   - GAStech has faced repeated accusations of environmental harm and exploitation, suggesting a disregard for regulations or local concerns.
   - The company's operations appear tied to broader economic interests that may prioritize profit over public welfare.
3. **Government Inaction and Unrest:**
   - The government’s failure to address long-standing issues (e.g., healthcare, infrastructure) has likely fueled frustration and violence.
   - The mysterious circumstances surrounding Elian Karel’s death and the lack of accountability further erode trust in the authorities.
---
### **Connections**
1. **Infrastructural Neglect and POK’s Rise:**
   - The government’s failure to improve public services (e.g., healthcare) likely contributed to the grievances that POK exploited, fueling their rise as a violent opposition group.
2. **GAStech as a Target for Violence:**
   - GAStech’s operations may have become a flashpoint for broader frustrations with industrial exploitation and government inaction.
   - The 2014 kidnappings and robberies could be linked to POK or other groups seeking to disrupt perceived symbols of economic oppression.
3. **Environmental Exploitation and Economic Decline:**
   - GAStech’s banned technique and ongoing operations suggest a disregard for environmental sustainability, which may have contributed to economic instability and public anger.
---
### **Implications**
1. **Ongoing Instability in Kronos:**
   - The interconnected issues of infrastructural neglect, environmental exploitation, and violent social movements suggest that Kronos is facing significant challenges to stability.
   - Without meaningful reforms, tensions between the government and groups like POK are likely to escalate.
2. **Failure of Governance:**
   - The government’s inability to address long-standing issues highlights a systemic failure in governance, which may lead to further erosion of public trust and increased violence.
3. **Potential for Escalation:**
   - If GAStech continues to face attacks and POK remains unchecked, the situation could deteriorate into open conflict, with severe consequences for Kronos’ citizens and economy.
---
### **Conclusion**
The articles reveal a Kronos struggling with systemic issues of neglect, exploitation, and violence. 
The government’s failure to address these challenges has allowed groups like POK to emerge as violent opposition forces, while GAStech’s controversial operations have become symbols of broader economic and environmental grievances. 
Without significant reforms, Kronos risks further instability and conflict.'''
# Response 2.
# NOTE(review): a later cell rebinds `_sourcing_task_2_` to a prompt string, replacing this response.
_sourcing_task_2_ = '''Here are summaries of the provided articles:
---
### **Article Title: "P.O.K Labeled a Public Threat"**
**Date:** 2011-07-28  
**Summary:** Government spokesman Rufus Drymiau declared that the Protectors of Kronos (POK) pose a public threat due to their violent demonstrations, opportunistic vandalism, and clashes with police. POK has caused thousands of dollars in damages to buildings and businesses, showing disregard for the safety of others. President Kapelou vowed not to allow such violence to go unchecked.
---
### **Article Title: "ELIAN KAREL DEAD IN KRONOS JAIL"**
**Date:** 2009-06-20  
**Summary:** Elian Karel, a 28-year-old member of the Protectors of Kronos (POK), died while in custody at Abila City Prison. He was due to be released shortly and had expressed excitement about resuming his work with POK. Calls to the President's office, the Ministry of Justice, and the Police Chief were not immediately returned.
---
### **Article Title: "Araullo Announces New Hospital Construction"**
**Date:** 1995-11-22  
**Summary:** President Araullo announced plans to construct a new modern Abila General Hospital to replace the outdated facility built in 1930, which had experienced multiple electrical fires. The project was part of broader efforts to upgrade living standards and modernize Kronos.
---
### **Article Title: "TECHNIQUE THAT IT PERFORATES PROHIBITED IN TETHYS"**
**Date:** 1995-04-01  
**Summary:** The government of Tethys banned GAStech's innovative drilling technique, the International Hyperactive Acid Retirement (HASR), due to concerns about water contamination. Despite its effectiveness for gas extraction, the environmental risks were deemed unacceptable. GAStech responded that the loss of Tethys would be offset by operations in other countries.
---
### **Article Title: "VOICES - a blog about what is important to the people"**
**Date:** 2014-01-20  
**Summary:** Media reports indicated that several GAStech employees were kidnapped. The identities of passengers on a mysterious jet departing from Abila Airport earlier the same day remained unknown.
---
### **Article Title: "ON SCENE BLOG"**
**Date:** 2014-01-20  
**Summary:** A blog post reported that dark-clothed figures were spotted at a GAStech construction site, raising suspicions of potential robbery or other criminal activity. This coincided with reports of GAStech employees being kidnapped.
---
### **Key Themes Across Articles:**
1. **GAStech Incidents:** Multiple reports suggest ongoing issues involving GAStech employees, including kidnappings and mysterious activities.
2. **Protectors of Kronos (POK):** The group is portrayed as a violent threat to public safety, with Elian Karel's death highlighting their controversial role.
3. **Infrastructure Development:** Efforts to modernize Abila General Hospital reflect broader goals for improving living standards in Kronos.
Let me know if you'd like further analysis or clarification!'''
# Response 3:
_sourcing_task_3_ = '''The articles you've provided span several decades and cover a range of topics related to political movements, environmental issues, and violent incidents in Kronos. Here's an attempt to provide context and connections between them:
### 1. **GAStech Kidnappings (20 Jan 2014)**
   - The articles from January 20, 2014, suggest that there was a significant incident involving the kidnapping of GAStech employees. This appears to be part of a larger event where "dark clothed" figures were involved in what seemed like a robbery or attack targeting GAStech construction workers. The identity of the perpetrators and the exact number of victims remain unclear, but it seems to have been a well-coordinated operation.
   - This incident could be linked to broader tensions in Kronos, possibly related to political movements like the Protectors of Kronos (POK), which is mentioned elsewhere.
### 2. **ELIAN KAREL DEAD IN KRONOS JAIL (June 20, 2009)**
   - Elian Karel was a member of the Protectors of Kronos (POK) who died in custody while awaiting release from Abila City Prison. His death occurred despite his expected freedom and return to work with POK. This suggests that there may have been issues within the justice system or prison conditions, as well as potential tensions between POK and the government.
### 3. **POK Labeled a Public Threat (July 28, 2011)**
   - By 2011, POK had evolved into a group engaged in violent demonstrations, vandalism, and conflicts with police. The government labeled them a public threat, citing thousands of dollars in damages to buildings and ventures. This escalation in violence could explain the tensions that led to later incidents like the GAStech kidnappings.
### 4. **Environmental Controversy (1995)**
   - An older article mentions the prohibition of a technique called HASR (Highly Active Solvent Recovery) by GAStech in Tethys due to environmental risks. This indicates that GAStech has been involved in controversial projects with potential ecological impacts, which could have made them a target for groups like POK.
### 5. **Connections Between Articles**
   - The articles suggest a timeline of escalating tensions:
     - In the mid-1990s, GAStech was involved in environmentally contentious projects.
     - By 2009, Elian Karel, a POK member, died in custody, possibly indicating growing friction between POK and authorities.
     - By 2011, POK had become violent, leading to increased scrutiny from the government.
     - In 2014, tensions may have boiled over into a direct attack on GAStech employees, potentially as part of POK's activities or other related movements.
### 6. **Potential Motivations**
   - The motivations behind the GAStech kidnappings could be linked to opposition against their projects (e.g., environmental concerns), political grievances against the government, or internal conflicts within POK.
   - The fact that multiple articles mention POK's violent activities and their eventual labeling as a public threat suggests that this group may have played a significant role in the 2014 incident.
### Conclusion
These articles paint a picture of Kronos as a region with deep-seated tensions involving political movements, environmental issues, and violent conflicts. 
The 2014 GAStech kidnappings appear to be part of a broader narrative of conflict between groups like POK and the government or corporate entities like GAStech. 
Understanding the full context would require more detailed information about the relationships between these groups and their motivations.
'''

In [None]:
# Second sourcing approach: one prompt asking the model to label every article as
# "relevant" / "not relevant" to the event.
# NOTE(review): this rebinds `_sourcing_task_2_`, which an earlier cell used to store a model response.
_sourcing_task_2_ = '''For each of the following articles, label them as "relevant" or "not relevant" for the following events.  Every article title (and published date) needs to be labeled:\n'''
# Event description; also embedded in the per-article relevance prompts in the following cells.
_event_info_ = '''On January 20, 2014, GAStech employees were targeted in a kidnapping and a robbery, with "dark clothed" figures involved.'''
# The prompt is left commented out so that re-running the cell does not re-query the model.
#_results_ = promptModel(_sourcing_task_2_ + '\n\n' + _event_info_ + '\n\n' + _all_articles_) # 3 failed attempts... none were even close to producing the article list w/ labels
#print(_results_)

In [None]:
# Per-article relevance scoring with the deepseek model (`model`).
# Each article is sent to the model on its own, with a structured-output schema; the answers are
# cached as parquet so the slow model pass only runs when the cache file is missing.
#
# Timing / sanity notes from earlier runs:
# 1600 seconds in total (for the same article... over and over again... so that number isn't accurate for articles of different lengths)
# ... 1622s for the articles (when the code is written to correctly test each article)
# ... however, maybe the accuracy is off... roughly 1/3 are labeled as "true" while the others are "false" ... 76/845 contain the word kidnap
# ... ... maybe 85 / 845 if you add in other words for kidnap (but then you are possibly getting other activity/events...)
_article_relevance_filename_    = _dir_ + '/20250212_article_relevance_per_deepseek32b.parquet'
# First run: always loaded from disk (this raises if the file is absent).
df_article_relevance_deepseek_1 = pd.read_parquet(_article_relevance_filename_)
_article_relevance_filename_    = _dir_ + '/20250212_article_relevance_per_deepseek32b_2.parquet'
# Second run: generated here only when its cache file does not exist yet.
if not os.path.exists(_article_relevance_filename_):
    from   pydantic import BaseModel
    import time

    # Structured-output schema handed to the model through `format=`.
    # Documented with comments rather than a docstring so the generated JSON schema stays unchanged.
    class ArticleRelevance(BaseModel):
        article_title: str
        article_date:  str
        relevant:      bool

    arts, responses, response_times = [], [], []
    for _art_ in _articles_:
        prompt = """For the following article, determine it's relevance to the following event:\n\n""" + _event_info_ + "\n\nArticle:\n\n" + _art_
        t0 = time.time()
        response: ChatResponse = chat(model=model, messages=[{ 'role': 'user', 'content':  prompt,},], format=ArticleRelevance.model_json_schema())
        response_times.append(time.time() - t0)
        # Raises if the reply does not validate against the schema; nothing is saved in that case.
        responses.append(ArticleRelevance.model_validate_json(response['message']['content']))
        arts.append(_art_)
        # Progress: one dot per article, plus a running count every 50 articles.
        print('.',end='')
        if len(responses)%50 == 0:
            print(f'{len(responses)}/{len(_articles_)}')

    # NOTE: 'title' and 'date' are the values echoed back by the model; 'article' is the text that was sent.
    _lu_ = {
        'title':      [r.article_title for r in responses],
        'date':       [r.article_date  for r in responses],
        'article':    arts,
        'relevant':   [r.relevant      for r in responses],
        'time_taken': response_times,
    }
    _df_ = pd.DataFrame(_lu_)
    _df_.to_parquet(_article_relevance_filename_)
    _df_ = None
df_article_relevance_deepseek_2 = pd.read_parquet(_article_relevance_filename_)
# Shared keyword arguments for the (currently commented-out) summary charts.
params = {'df':df_article_relevance_deepseek_2, 'color_by':'relevant', 'w':64,'h':96}
#rt.tile([rt.pieChart(**params), rt.histogram(bin_by='relevant', **params)])

In [None]:
# Per-article relevance scoring with a llama3.2 model, mirroring the deepseek pass.
# Each article is sent to the model on its own, with a structured-output schema; the answers are
# cached as parquet so the model pass only runs when the cache file is missing.
#
# 273 seconds for llama3.2:3b ... but similar # of results -- maybe a 1/3 were labeled as relevant
# NOTE(review): the call below uses the tag 'llama3.2:latest'; presumably that resolves to the 3b model -- confirm.
_article_relevance_filename_      = _dir_ + '/20250212_article_relevance_per_llama32_3b.parquet'
# First run: always loaded from disk (this raises if the file is absent).
df_article_relevance_llama32_3b_1 = pd.read_parquet(_article_relevance_filename_)
_article_relevance_filename_      = _dir_ + '/20250212_article_relevance_per_llama32_3b_2.parquet'
# Second run: generated here only when its cache file does not exist yet.
if not os.path.exists(_article_relevance_filename_):
    from   pydantic import BaseModel
    import time

    # Structured-output schema handed to the model through `format=`.
    # Documented with comments rather than a docstring so the generated JSON schema stays unchanged.
    class ArticleRelevance(BaseModel):
        article_title: str
        article_date:  str
        relevant:      bool

    arts, responses, response_times = [], [], []
    for _art_ in _articles_:
        prompt = """For the following article, determine it's relevance to the following event:\n\n""" + _event_info_ + "\n\nArticle:\n\n" + _art_
        t0 = time.time()
        response: ChatResponse = chat(model='llama3.2:latest', messages=[{ 'role': 'user', 'content':  prompt,},], format=ArticleRelevance.model_json_schema())
        response_times.append(time.time() - t0)
        # Raises if the reply does not validate against the schema; nothing is saved in that case.
        responses.append(ArticleRelevance.model_validate_json(response['message']['content']))
        arts.append(_art_)
        # Progress: one dot per article, plus a running count every 50 articles.
        print('.',end='')
        if len(responses)%50 == 0:
            print(f'{len(responses)}/{len(_articles_)}')

    # NOTE: 'title' and 'date' are the values echoed back by the model; 'article' is the text that was sent.
    _lu_ = {
        'title':      [r.article_title for r in responses],
        'date':       [r.article_date  for r in responses],
        'article':    arts,
        'relevant':   [r.relevant      for r in responses],
        'time_taken': response_times,
    }
    _df_ = pd.DataFrame(_lu_)
    _df_.to_parquet(_article_relevance_filename_)
    _df_ = None
df_article_relevance_llama32_3b_2 = pd.read_parquet(_article_relevance_filename_)
# Shared keyword arguments for the (currently commented-out) summary charts.
params = {'df':df_article_relevance_llama32_3b_2, 'color_by':'relevant', 'w':64,'h':96}
#rt.tile([rt.pieChart(**params), rt.histogram(bin_by='relevant', **params)])

In [None]:
# Line up the two models' verdicts per article: index both tables on (title, date, article), give the
# verdict/timing columns model-specific names, and keep only the articles present in both tables.
_df1_ = df_article_relevance_deepseek_2.set_index(['title', 'date', 'article']).rename(
    {'relevant': 'relevant_deepseek', 'time_taken': 'time_taken_deepseek'}, axis=1)
_df2_ = df_article_relevance_llama32_3b_2.set_index(['title', 'date', 'article']).rename(
    {'relevant': 'relevant_llama32_3b', 'time_taken': 'time_taken_llama32_3b'}, axis=1)
df_article_relevance_combined = _df1_.join(_df2_, how='inner').reset_index()
df_article_relevance_combined = rt.columnsAreTimestamps(df_article_relevance_combined, 'date')
# Simple keyword baseline: does the article text mention 'kidnap' (case-insensitive)?
contains_kidnap = ['kidnap' in df_article_relevance_combined['article'][i].lower()
                   for i in range(len(df_article_relevance_combined))]
df_article_relevance_combined['contains_kidnap'] = contains_kidnap

In [None]:
#rt.smallMultiples(df_article_relevance_combined, category_by=['relevant_deepseek','relevant_llama32_3b'], color_by='contains_kidnap', sm_type='xy',
#                  sm_params={'x_field':'time_taken_deepseek','y_field':'time_taken_llama32_3b'}, w_sm_override=160, h_sm_override=160, grid_view=True)

In [None]:
#
# This doesn't look unreasonable -- most of the hits are around the timeline for the kidnapping...
#
#rt.tile([rt.temporalBarChart(df_article_relevance_combined, color_by='contains_kidnap', w=1536, h=256),
#         rt.xy(df_article_relevance_combined.dropna(), x_field='date', y_field=['contains_kidnap','article'], color_by='contains_kidnap', w=1536, h=256)], horz=False, spacer=5)

# Determined Who Was Kidnapped // News Article Version
This worked very poorly... it mostly just seems to extract people's names from the articles passed to it...  however, looking at the original challenge scope, it seems that the names are likely not in the articles but are in the social media and emergency service data.

In [None]:
#
# Who was kidnapped? // according to the articles...
# - this works very poorly ... it mostly just seems to extract people's names from the articles passed to it...
# - results are cached as JSON; the model is only queried when that file does not exist yet
#
_article_relevance_combined_w_names_filename_ = _dir_ + '/20250222_article_relevance_combined_w_names.json'
if not os.path.exists(_article_relevance_combined_w_names_filename_):
    # Response schema handed to the model via `format=` (comment only -- a docstring would end up in the schema)
    class ListOfNames(BaseModel):
        names: list[str]
    names_column = []
    for i in range(len(df_article_relevance_combined)):
        names = None  # articles deepseek did not flag as relevant get no names
        # `== True` is kept as-is: plain truthiness would treat a missing value (NaN) as relevant
        if df_article_relevance_combined['relevant_deepseek'][i] == True:
            _prompt_ = """For the following article, determine who was kidnapped.  Return a Python list of names.\n\n""" + df_article_relevance_combined['article'][i]
            response: ChatResponse = chat(model=model,
                                          messages=[{'role': 'user', 'content': _prompt_}],
                                          format=ListOfNames.model_json_schema())
            # An empty list of names is stored as None
            names = ListOfNames.model_validate_json(response['message']['content']).names or None
        names_column.append(names)
    df_article_relevance_combined['names'] = names_column
    df_article_relevance_combined.to_json(_article_relevance_combined_w_names_filename_)
# Always reload from the cached file
df_article_relevance_combined = pd.read_json(_article_relevance_combined_w_names_filename_)
# Inspection loop -- the prints are disabled, so this currently has no effect
for i in range(len(df_article_relevance_combined)):
    _list_ = df_article_relevance_combined['names'][i]
    if _list_ is None:
        continue
    ...
    #print(df_article_relevance_combined['article'][i])
    #print(_list_)

# Determined Who Was Kidnapped // Social Media Version
Two findings...
1. When the data was provided as it was given, the model focused on an explosion at the Dancing Dolphin Hotel.
2. Considering that maybe there was an issue with the context window (too short, as some online had indicated), randomizing the order of the data produced a completely different response -- that response was to address how the data *may* be analyzed...  so... the next step is to do some 'needle in a haystack' testing

In [None]:
# Plain-text description of the two record types (mbdata microblog messages and ccdata call center
# records) and their fields.  It is inserted verbatim into the kidnapping prompt built further down
# in this cell, so the text is part of what the model receives.
_social_media_data_description_ = '''The data includes two types of records.

Microblog messages (mbdata). The microblog messages included here use some of the conventions you may be familiar with from social media. The @ symbol is used to designate a username within the body of a message. Hashtags (#) are used to relate the message to specific topics. "RT" at the start of a message indicates that it the current user is re-sending another user's message. Spam and junk messages are also common.

In this data, messages may be created in small batches, so it is possible for multiple messages from an individual to have the exact same date/timestamp. It is also possible for messages to be delivered slightly out of order. 

The format of the microblog messages is as follows:
   date: in YYYYMMDDHHMMSS format. For example, July 8, 2014 at 11:59 pm would be 20140708235900.
   author: author name, a text string
   message: the message string, containing up to 200 characters of content
   latitude: latitude from which the message was sent (optional)
   longitude: longitude from which the message was sent (optional)

Call center data (ccdata). Emergency call center data is included. The format of these records is as follows:
   date: in YYYYMMDDHHMMSS format. For example, July 8, 2014 at 11:59 pm would be 20140708235900.
   message: the message string, containing up to 200 characters of content
   location: the street or cross streets at which the incident took place (optional)'''
import random
import copy
def randomizeOrder(_lines_):
    """Return a new list with the items of ``_lines_`` in random order.

    The argument is never modified: its items are deep-copied first and the
    copy is shuffled with the standard library instead of repeatedly popping
    a random index (which costs O(n) per pop).

    Args:
        _lines_: sequence of items (e.g. text lines) to reorder.

    Returns:
        list: deep copies of the items of ``_lines_``, randomly ordered.
    """
    # list(...) so that non-list sequences (e.g. tuples) are accepted as well
    new_lines = list(copy.deepcopy(_lines_))
    random.shuffle(new_lines)
    return new_lines
# Gather the raw lines of the three MC3 csv files into one list.
# NOTE: every line keeps its own line ending, and the lines are joined with '\n' again when the
# prompt is built below.
_mc3_dir_ = '../../../data/2014_vast/MC3'
_files_   = ['csv-1700-1830.csv', 'csv-1831-2000.csv', 'csv-2001-2131.csv']
social_media_lines = []
for _file_ in _files_:
    _file_ = f'{_mc3_dir_}/{_file_}'
    with open(_file_, encoding='iso-8859-1') as f:
        _lines_ = list(f)
    social_media_lines += _lines_
print(f'{len(social_media_lines)=}')
# Prompt = question + description of the record formats + all lines in random order
_prompt_ = ''.join(['Given the following data, determine likely people who may have been kidnapped.  Return a Python list of names.\n\n',
                    _social_media_data_description_,
                    '\n\n',
                    '\n'.join(randomizeOrder(social_media_lines))])
_who_was_kidnapped_social_media_1_ = '''<think>
Okay, so I'm looking at this problem where I need to analyze a bunch of social media and news feeds related to an event. The task is to extract the most relevant pieces of information, figure out what's happening, identify the key participants, determine the location, assess the severity, and then come up with a summary.

Alright, first step: Extracting relevant information from the data. Let me go through each entry and see what stands out. I notice that many entries are from different sources like mbdata and ccdata. Some of them mention an explosion at the Dancing Dolphin apartment building. Others talk about people being down or hurt. There's also a mention of a misdemeanor assault in progress near some streets, S. Mikonou St and S. Achilleos St.

Next, identifying what's happening. The main event seems to be an explosion that occurred inside an apartment building called Dancing Dolphin. Several people were affected—down could mean injured or maybe just lying down after the shock. There are RTs (retweets) indicating multiple reports from different accounts about this incident. Also, some entries show concern and panic like "ohno" repeated several times.

Key participants: The main sources reporting seem to be @KronosStar, @CentralBulletin, @AbilaPost, and others. These might be news outlets or community accounts. There are mentions of people like aliceRocks and AbilaAllFaith expressing concern. The ccdata entry mentions a misdemeanor assault in progress nearby.

Location: From the data, it looks like the explosion is at the Dancing Dolphin apartment building. The streets mentioned near the assault are S. Mikonou St and S. Achilleos St. So, the location of the explosion is likely at the intersection or near these streets.

Severity: Several people being down suggests that there are injuries involved. The fact that it's trending with multiple retweets indicates that this is a significant event. The ccdata entry about an assault in progress adds another layer, possibly related to the chaos after the explosion.

Putting this together, I think the situation involves an explosion at an apartment building which caused injuries and panic. There's also an ongoing assault nearby, maybe connected to the incident or happening separately but in the same vicinity.

For the summary, it should capture all these elements concisely: the event (explosion), location (Dancing Dolphin, near S. Mikonou St and S. Achilleos St), participants (news sources and community accounts), severity (injuries, panic), and context of an assault nearby.

I need to make sure that the summary is clear and includes all these points without getting too bogged down in the details of each tweet.
</think>

**Event Summary: Explosion at Dancing Dolphin Apartment Building**

- **Event:** An explosion occurred at the Dancing Dolphin apartment building, resulting in injuries and panic among residents and witnesses. 
- **Location:** The incident took place near the intersection of S. Mikonou St and S. Achilleos St.
- **Participants:** News outlets such as @KronosStar, @CentralBulletin, and @AbilaPost reported the event, along with community accounts like aliceRocks and AbilaAllFaith expressing concern.
- **Severity:** Multiple individuals were injured or affected by the explosion, leading to widespread panic. 
- **Additional Context:** A misdemeanor assault was reported in progress nearby, potentially related to the chaos following the explosion.

This incident highlights a significant event with implications for public safety and community response.'''
_who_was_kidnapped_social_media_2_ = '''<think>
Okay, so I'm trying to understand this data. It looks like a bunch of tweets or social media posts from different users talking about something happening at the Dancing Dolphin apartment. There are mentions of explosions, people being down, and some hashtags like #DancingDolphinFire and #AFDHeroes. 

First, I notice that many accounts are retweeting messages from KronosStar and CentralBulletin. They mention an explosion inside the apartment building and that several people are injured or down. Then there's a ccdata entry about a misdemeanor assault in progress at S. Mikonou St / S. Achilleos St. That seems related because it's the same location where the explosion was reported.

I'm wondering if the explosion caused the assault, or maybe the assault is separate but happening nearby. Also, many users are reacting with concern and fear, using words like "ohno" and asking what happened. Some are praying for help, which suggests that the situation is serious and possibly dangerous.

There's also a MISDEMEANOR ASSAULT-IN PROGRESS in the ccdata line. That might mean there's an ongoing incident where someone is being assaulted, maybe related to the explosion. I'm not sure if it's part of the same event or a separate issue at the same location.

I need to figure out what exactly happened. Was there an explosion that caused injuries and led to an assault? Or was the assault happening around the time of the explosion, but they're unrelated? The fact that both are tagged with locations suggests they might be connected.

Also, looking at the hashtags #DancingDolphinFire and #AFDHeroes, it seems like there's a fire involved and first responders (AFD) are involved. So maybe it was an accident or a deliberate act causing a fire and explosion, leading to injuries and possibly an assault situation.

I should consider the timeline. The explosion was reported at 21:30:10, followed by ccdata about the assault at 21:30:10 as well. So they happened around the same time. That makes me think it's a single event with multiple aspects—explosion leading to injuries and an assault in progress.

Some users are retweeting information, which means the news is spreading quickly, but I need to verify if all these tweets are about the same incident or if some are unrelated. The fact that many are retweeting from specific accounts suggests it's a coordinated event.

I'm also seeing accounts like AbilaPost and CentralBulletin reporting on the explosion, so they might be local news sources or community pages sharing updates. This adds credibility to the information.

In summary, it seems there was an explosion at the Dancing Dolphin apartment building around 9:30 PM, causing injuries (several people down), possibly a fire, and an ongoing assault in progress nearby. The situation is serious, with emergency services likely involved.
</think>

The incident at the Dancing Dolphin apartment building occurred on January 23, 2014, at approximately 9:30 PM. Here's a structured summary of the event:

1. **Explosion and Injuries**: 
   - An explosion was reported inside the apartment building, resulting in several injuries. The incident was widely shared on social media with hashtags #DancingDolphinFire and #AFDHeroes, indicating involvement of fire and emergency services.

2. **Location**:
   - The explosion occurred at S. Mikonou St / S. Achilleos St, the same location where a misdemeanor assault in progress was reported around the same time.

3. **Timeline and Spread**:
   - Both the explosion and assault were reported at 21:30:10, suggesting they may be related or part of a single incident.
   - The information spread rapidly through retweets from multiple accounts, with local news sources like AbilaPost and CentralBulletin providing updates.

4. **Public Reaction**:
   - Users expressed concern and fear, with some praying for help, indicating the severity and danger of the situation.

5. **Assessment**:
   - The event likely involved an explosion leading to injuries, possibly accompanied by a fire. The assault in progress may be connected, though it's unclear if it was a separate incident or part of the same event.

In conclusion, the Dancing Dolphin apartment experienced a serious incident involving an explosion, injuries, and an assault, prompting emergency responses and community concern.'''

# The following responses were produced after the randomization was added...

_who_was_kidnapped_social_media_3_ = '''<think>
Okay, so I need to figure out how to extract the top 5 most frequent hashtags from a list of tweets. The data given is a bunch of lines where each line represents a tweet, and the structure is something like "tweet_id | username | content". Each tweet might have multiple hashtags separated by spaces.

First, I should think about how to process this data. Since it's text-based, maybe I can read through each line, split the content into parts, and then identify which parts are hashtags. Hashtags usually start with a '#', so any word starting with '#' is a hashtag.

I guess I need to loop through each tweet, extract all the hashtags from the content, and count how many times each hashtag appears across all tweets. Then, after collecting all counts, sort them in descending order to find the top 5.

Wait, but how do I split the content into words? Maybe using a string split method on spaces. But sometimes people use punctuation or other characters, so maybe a regular expression would be better to extract only the hashtags properly. Hmm, perhaps using regex to find all occurrences of '#' followed by word characters.

Once I have all the hashtags extracted from each tweet, I can update a dictionary where keys are hashtags and values are their counts. For each hashtag found in a tweet, increment its count by one.

After processing all tweets, I'll sort this dictionary by value (the count) in descending order and pick the top 5 entries. If there are more than five unique hashtags, that's fine; if fewer, just take as many as available.

I should also consider case sensitivity. Are hashtags case-sensitive? For example, is #POKRally different from #pokrally? I think in real social media, hashtags are case-insensitive, but for the sake of this problem, unless specified otherwise, maybe treat them as case-sensitive since they're represented exactly as such in the content.

Wait, looking at the sample data provided earlier, some hashtags like #POKRally and #HI appear multiple times. So I need to count each occurrence accurately.

Another thing: some tweets might not have any hashtags. In that case, I just skip those tweets when extracting.

So, putting it all together, here's a step-by-step plan:

1. Read each line of the data.
2. For each tweet, split into parts using '|' as delimiter to get the content part.
3. From the content, extract all hashtags using regex or string operations.
4. For each hashtag found, update a counter in a dictionary.
5. After processing all tweets, sort the dictionary by counts descendingly.
6. Select the top 5 hashtags based on their counts.

I think that's manageable. Now, considering implementing this in code if I were to write it, but since the user is asking for an explanation of the thought process, I can stop here unless they need more details.

Wait, maybe I should also consider any edge cases. For example, a tweet with no hashtags shouldn't affect the counts. Also, ensure that leading or trailing characters don't interfere when extracting hashtags. Maybe using regex would handle this better, ensuring that only valid hashtag patterns are captured.

So perhaps using something like re.findall(r'#\\w+', content) to extract all hashtags in each content string. This way, it ignores any non-word characters following the '#', and captures the exact hashtag without including other symbols or spaces.

Another point: sometimes people put hashtags at the end of sentences with punctuation, like "Hello! #Example." The regex would correctly capture "#Example" as a hashtag, ignoring the period.

So, to sum up, my approach is:

- Extract all hashtags from each tweet using regex.
- Count their occurrences.
- Sort and pick top 5.

I think that's solid. Now, if I were to write code for this, it would involve reading lines, processing each content, extracting hashtags, updating counts, then sorting and displaying the top 5. But since the user just wants the thought process, I think I've covered all bases here.
</think>

To determine the top 5 most frequent hashtags from the given tweets, follow these steps:

1. **Extract Hashtags**: Use regular expressions to identify all substrings starting with '#' followed by word characters in each tweet's content.

2. **Count Frequencies**: Maintain a dictionary where keys are hashtags and values are their occurrence counts across all tweets.

3. **Sort and Select Top 5**: Sort the dictionary by counts in descending order and select the top five entries.

**Top 5 Hashtags:**
1. #POKRally
2. #HI
3. #KronosStar
4. #Grammar
5. #AbilaPost

This approach ensures accurate extraction, handling of case sensitivity as per data representation, and efficient counting to identify the most frequent hashtags.'''
_who_was_kidnapped_social_media_4_ = '''<think>
Alright, let's dive into this user query. They've provided a bunch of data that looks like social media or forum posts from different users. My task is to figure out what they need help with based on this information.

First off, I notice the data has two types: "mbdata" and "ccdata." The mbdata entries seem to be individual posts from users, each with a timestamp, username, message, and some hashtags. The ccdata ones look more like official or system messages, possibly alerts or statuses, without usernames.

Looking at the content, there are mentions of fires in Abila, specifically the Dancing Dolphin fire, and some shootings involving police and a black van. There's also talk about evacuations, negotiations, and various user interactions like sales, comments on photos, and jokes. Some posts seem to be related to promotions or scams, like easy credit cards.

I think the user wants me to analyze this data. Maybe they need help identifying key events, extracting important information, categorizing the posts, or even spotting trends or anomalies. Alternatively, they could be looking for assistance in cleaning up or organizing this data into a more structured format.

Considering the structure of each entry, it looks like each line is a separate event with time, source (mbdata or ccdata), username (if applicable), content, and hashtags. I can see how this might be useful for tracking real-time events or monitoring social media activity during a crisis.

Perhaps they're preparing to create a report or an analysis of the situation in Abila, needing help summarizing these posts into actionable insights. Alternatively, they could be working on a project that involves parsing and processing such data feeds, so they might need advice on how to handle it programmatically.

I should ask them about their specific goals. Are they looking for event extraction, trend analysis, sentiment analysis, or something else? Understanding what they want to achieve with this data will help me provide the most effective assistance.
</think>

It seems like you've provided a dataset that includes various types of messages or posts from different users, possibly related to social media, forums, or some other platform. The content appears to be fictional and revolves around events in a place called "Abila," including mentions of fires, shootings, evacuations, promotions, and user interactions.

If you need help analyzing, organizing, or extracting specific information from this dataset, let me know! For example:

1. **Event Extraction**: Identifying key events (e.g., the fire at Dancing Dolphin, the black van shooter).
2. **Categorization**: Grouping posts by type (e.g., official updates, user comments, promotions, jokes).
3. **Trend Analysis**: Identifying patterns or trends in the data.
4. **Data Cleaning/Processing**: Organizing the data into a more structured format.

Let me know what you'd like to do with this information!'''
_who_was_kidnapped_social_media_5_ = '''<think>
Okay, so I'm trying to figure out how to analyze this data set that's been given. It looks like it's a bunch of tweets or some kind of social media posts about an event happening in Abila. There are two types of data here: ccdata and mbdata. From what I can gather, ccdata seems to be more structured with specific fields like date, time, type, description, and location. On the other hand, mbdata is just a list of entries without clear labels, but each entry has an ID, username, content, maybe some hashtags or URLs, and sometimes latitude and longitude.

First, I need to understand what exactly these datasets represent. The user mentioned that ccdata includes "ALARM-SECURE NO CRIME" and other crime-related types, so it seems like official reports of incidents. Meanwhile, mbdata looks more like social media posts, perhaps from different users discussing the same event or related topics.

The goal here is to analyze these datasets together. I think combining both sources could provide a comprehensive view of what's happening in Abila. But how exactly should I go about this? Maybe by cross-referencing the timestamps and locations between ccdata and mbdata entries, I can see if certain events are being discussed on social media around the same time they're reported officially.

Looking at the ccdata, there are events like "ALARM-SECURE NO CRIME" at a specific address on 2014-03-29 17:22:51. Then in mbdata, I see posts with similar timestamps, like ID=1 from user slamrjamr mentioning something about AbilaPost and dancingdolphin. Maybe these are related to the fire incident mentioned elsewhere.

I should start by categorizing each dataset. For ccdata, it's already categorized into types like ALARM-SECURE, which is a crime type. For mbdata, I can look at hashtags or keywords in the content to classify them. For example, posts mentioning #POK might be related to a specific event or group.

Next, maybe I can plot these events on a timeline. That way, I can see if there's a pattern or correlation between official reports and social media activity. If certain hashtags spike around the time an ALARM-SECURE is reported, that could indicate public reaction to that incident.

I should also consider the locations mentioned in both datasets. The ccdata has exact addresses, while some mbdata entries have coordinates. Maybe mapping these points can show hotspots where incidents are occurring and how they're being discussed online.

Another thought: analyzing sentiment from the mbdata could provide insights into public perception of the events. For example, posts with positive hashtags like #AbilaPrays might indicate support or hope, while others with #terrorists could reflect fear or misinformation.

I wonder if there's a way to link specific mbdata entries to ccdata events using timestamps and locations. For instance, if an ALARM-SECURE is reported at a certain time and place, are there corresponding tweets around that same time mentioning the area? That could help validate the information or show how quickly news spreads online.

I should also look for trends over time. Are there more official reports of crimes during specific hours, and do social media posts reflect increased activity then too? This might help identify peak times for certain incidents.

One challenge I can see is ensuring that the data from mbdata aligns correctly with ccdata entries. Without clear identifiers or tags linking them, it might be hard to make direct connections. Maybe using text analysis to find common keywords or mentions could bridge this gap.

I also need to consider potential biases. Social media data can be skewed towards certain viewpoints, so relying solely on mbdata without context from ccdata might not give a full picture. Combining both should provide a more balanced analysis.

Another point is privacy and ethical considerations. The mbdata includes usernames and sometimes location data. I must ensure that any analysis respects user privacy and doesn't misuse personal information.

Lastly, visualizing the data could be really helpful. Creating graphs or maps to show incident locations and social media activity over time might make patterns more apparent than just looking at raw data.

So, in summary, my approach would involve:
1. Categorizing both datasets based on content and metadata.
2. Cross-referencing timestamps and locations between the two datasets.
3. Analyzing sentiment and keywords in mbdata to gauge public reaction.
4. Mapping incidents geographically to identify hotspots.
5. Looking for trends over time, such as peak incident times or social media activity spikes.
6. Visualizing the findings through graphs, maps, or timelines.
7. Ensuring ethical handling of data to protect user privacy.

I think this structured approach will help in extracting meaningful insights from both datasets and understanding the context of events in Abila.
</think>

To analyze the given datasets effectively, we can follow a structured approach that combines both ccdata (crime reports) and mbdata (social media posts) for comprehensive insights into events in Abila. Here's a step-by-step plan:

### 1. Categorize Datasets
- **ccdata**: Already categorized by event types such as "ALARM-SECURE NO CRIME." Use these categories to understand official incident reports.
- **mbdata**: Classify posts using hashtags, keywords (e.g., #POK), and mentions of specific locations or events.

### 2. Cross-Reference Timestamps and Locations
- **Timeline Analysis**: Create a timeline for both datasets, aligning ccdata events with mbdata posts by date and time to identify correlations.
- **Geospatial Mapping**: Plot incidents from ccdata on a map using addresses and overlay mbdata entries with coordinates to find hotspots and public discussion areas.

### 3. Analyze Sentiment and Keywords
- Use text analysis tools to gauge public sentiment in mbdata, distinguishing between positive (e.g., #AbilaPrays) and negative sentiments.
- Identify trends or spikes in specific hashtags around significant ccdata events to understand public reaction.

### 4. Trend Analysis Over Time
- Examine peak times for both official crime reports and social media activity to identify patterns, such as increased online discussion during high-crime periods.

### 5. Data Visualization
- **Graphs/Timelines**: Visualize event frequencies and public discourse over time.
- **Maps**: Highlight incident locations and social media activity to show geographical trends.

### 6. Ethical Considerations
- Ensure privacy by anonymizing data where necessary, respecting user information and avoiding misuse.

### Conclusion
By integrating these steps, we can uncover how official incidents are perceived and discussed online, providing a balanced view of events in Abila. This approach will help validate data, identify trends, and offer insights into public sentiment and engagement around key events.'''
#_response_ = promptModel(_prompt_)
#print(_response_)

# Needle In A Haystack Testing // Social Media
- Needle in the haystack testing seems to indicate that the model can't pick out the important detail... but digging into it, the model configuration and the online information say that this should not be an issue...
    - https://arxiv.org/html/2412.19437v1?ref=platformer.news // paper on NIAH for Deepseek... which looks fine / no issues...
- The social media dataset is approximately 479K characters
    - From the information below... looks like the ollama context for the 32b quantization is 128K (which matches information on deepseek)
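    - Four characters per token (on average)... 479K/4 = 119,750 tokens... which means it should fit
    - Caveat (to verify): 131072 is the model's maximum context; the window ollama actually uses is set by the `num_ctx` option, whose default is much smaller, so the prompt may be getting truncated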
- Information from the ollama registry for the model

ollama show deepseek-r1:32b
- Model
    - architecture        qwen2     
    - parameters          32.8B     
    - context length      131072    
    - embedding length    5120      
    - quantization        Q4_K_M

In [None]:
# Needle in the Haystack testing w/ the data -- to determine if the context window is an issue or not...
# Earlier variant of the prompt that also included the data description:
# _prompt_ = 'Which doctor spoke at the POK Rally?  Respond with a single person\'s name.  Use the following data to answer the question.\n\n' + _social_media_data_description_ + '\n\n' + '\n'.join(randomizeOrder(social_media_lines))
_prompt_ = ''.join(["Which doctor spoke at the POK Rally?  Respond with a single person's name.  Use the following data to answer the question.\n\n",
                    '\n'.join(randomizeOrder(social_media_lines))])
print(f'{len(social_media_lines)=} {len(_prompt_)=}') # should be 4,066 lines and the prompt is a total of 478,944 characters
#_response_ = promptModel(_prompt_)
#print(_response_)
# Tally of the answers received (presumably one count per run of the prompt above).
# The correct answer is "Dr. Audrey McConnell Newman"
_answer_correct_                    = 5
_answer_incorrect_no_doctor_        = 11
_answer_incorrect_dr_michael_brown_ = 1 # 'brown' does not occur in the data
_answer_incorrect_dr_jane_doe_      = 1 # 'jane' and 'doe' do not occur in the data together (or in the right context for this answer to be accurate)

# Needle In A Haystack Testing // Social Media (Llama 3.2)
- Did not get any valid answers -- almost all of them were in a category I labeled "Too Much Information" that did not seem relevant to the original query.
- Likely that something in the data is throwing off the instruction to the model...
- Maybe reframing the data into a JSON structure would improve results (but consume more tokens)
    - Currently, the data is csv lines

In [None]:
# Same needle-in-a-haystack question, reworded for the llama3.2 runs (enable the model switch below to re-run)
#model = 'llama3.2'
_prompt_ = ''.join(['Which doctor spoke at the POK Rally?  Use the following social media to answer the question.  Respond with a single name only.\n\n',
                    '\n'.join(randomizeOrder(social_media_lines))])
# NOTE(review): the expected figures look copied from the previous cell; the question text here has a
# different length, so the character total may differ slightly.
print(f'{len(social_media_lines)=} {len(_prompt_)=}') # should be 4,066 lines and the prompt is a total of 478,944 characters
#_response_ = promptModel(_prompt_)
#print(_response_)
# Tally of the answers received: all were in the "Too Much Information" category
_answer_incorrect_tmi_ = 12