In [3]:
import os
import pandas as pd
import getpass
from dateutil.relativedelta import relativedelta
from pathlib import Path
import re
import time
from dotenv import load_dotenv
from pymongo import MongoClient
from pymongo import ASCENDING
import dateparser
import openai
import nltk
from nltk.tokenize import sent_tokenize
from datetime import datetime
import requests
import random

# Connection string for the MLP MongoDB deployment. It is read from the
# environment (optionally populated from a local .env file) so credentials
# never have to be pasted into the notebook.
load_dotenv()
uri = os.getenv('MONGO_URI')
if not uri:
    raise RuntimeError(
        "Set the MONGO_URI environment variable (or add it to a .env file) "
        "to the MLP MongoDB connection string."
    )
db = MongoClient(uri).ml4p

# Change the path of event_prompts.csv based on your directory.
# The CSV's first column is used as the index, so the resulting dict maps
# each index value to a {column name: value} dict for that row.
prompts_df = pd.read_csv('event_prompts.csv', encoding='ISO-8859-1', index_col=0)
event_prompts = prompts_df.to_dict(orient='index')


# Empty table that accumulates one row per summarised (country, date, event).
results_df = pd.DataFrame(columns=['Country', 'Date', 'Event', 'Num Local', 'Num Int', 'Main Text', 'Events List', 'Misplaced Countries', 'Final Summary'])



# Function to get the most recent subdirectories
def get_most_recent_subdir(directory, num):
    """Return the paths of the ``num`` most recent date-named subdirectories.

    Only immediate subdirectories of ``directory`` whose names parse as
    ``YYYY-MM-DD`` are considered; anything else (for example a
    ``.ipynb_checkpoints`` folder) is ignored instead of aborting the lookup.

    Args:
        directory: Path of the folder to scan.
        num: Maximum number of subdirectories to return.

    Returns:
        A list of at most ``num`` paths (each joined onto ``directory``),
        ordered newest first, or ``None`` when no dated subdirectory exists.
    """
    dated = []
    for entry in os.listdir(directory):
        path = os.path.join(directory, entry)
        if not os.path.isdir(path):
            continue
        try:
            stamp = datetime.strptime(entry, '%Y-%m-%d')
        except ValueError:
            # Not a dated result folder; skip it rather than fail the sort.
            continue
        dated.append((stamp, path))

    # Sort on the parsed date so ordering is chronological, newest first.
    dated.sort(key=lambda item: item[0], reverse=True)
    subdirs = [path for _, path in dated]

    return subdirs[:num] if subdirs else None


def extract_first_two_sentences(text):
    """Return the leading (up to two) sentences of ``text`` as one string.

    Sentence boundaries come from NLTK's ``sent_tokenize``; the selected
    sentences are joined with a single space.
    """
    leading = sent_tokenize(text)[:2]
    return ' '.join(leading)

# Country name -> three-letter country code used in the database.
# Credit to @Zung-Ru Lin
_COUNTRY_CODES = {
    'Albania': 'ALB',
    'Benin': 'BEN',
    'Colombia': 'COL',
    'Ecuador': 'ECU',
    'Ethiopia': 'ETH',
    'Georgia': 'GEO',
    'Kenya': 'KEN',
    'Paraguay': 'PRY',
    'Mali': 'MLI',
    'Morocco': 'MAR',
    'Nigeria': 'NGA',
    'Serbia': 'SRB',
    'Senegal': 'SEN',
    'Tanzania': 'TZA',
    'Uganda': 'UGA',
    'Ukraine': 'UKR',
    'Zimbabwe': 'ZWE',
    'Mauritania': 'MRT',
    'Zambia': 'ZMB',
    'Kosovo': 'XKX',
    'Niger': 'NER',
    'Jamaica': 'JAM',
    'Honduras': 'HND',
    'Philippines': 'PHL',
    'Ghana': 'GHA',
    'Rwanda': 'RWA',
    'Guatemala': 'GTM',
    'Belarus': 'BLR',
    'Cambodia': 'KHM',
    'DR Congo': 'COD',
    'Turkey': 'TUR',
    'Bangladesh': 'BGD',
    'El Salvador': 'SLV',
    'South Africa': 'ZAF',
    'Tunisia': 'TUN',
    'Indonesia': 'IDN',
    'Nicaragua': 'NIC',
    'Angola': 'AGO',
    'Armenia': 'ARM',
    'Sri Lanka': 'LKA',
    'Malaysia': 'MYS',
    'Cameroon': 'CMR',
    'Hungary': 'HUN',
    'Malawi': 'MWI',
    'Uzbekistan': 'UZB',
    'India': 'IND',
    'Mozambique': 'MOZ',
    'Azerbaijan': 'AZE',
    'Kyrgyzstan': 'KGZ',
    'Moldova': 'MDA',
    'Kazakhstan': 'KAZ',
    'Peru': 'PER',
    'Algeria': 'DZA',
    'Macedonia': 'MKD',
    'South Sudan': 'SSD',
    'Liberia': 'LBR',
    'Pakistan': 'PAK',
    'Nepal': 'NPL',
    'Namibia': 'NAM',
    'Burkina Faso': 'BFA',
    'Dominican Republic': 'DOM',
    'Timor Leste': 'TLS',
}


# Function to match country with country's code and source domains
def country_sources(country):
    """Look up a country's code and the source domains located there.

    Args:
        country: Country name, spelled exactly as in ``_COUNTRY_CODES``
            (the match is case-sensitive).

    Returns:
        A ``(country_code, loc)`` tuple where ``loc`` is the list of
        ``source_domain`` values from the ``sources`` collection whose
        ``primary_location`` matches the code and whose ``include`` flag is
        true.

    Raises:
        ValueError: If ``country`` is not a known country name. Previously
            this surfaced as an ``UnboundLocalError`` further down.
    """
    try:
        country_code = _COUNTRY_CODES[country]
    except KeyError:
        raise ValueError(
            f"Unknown country {country!r}; expected one of the names in the country table."
        ) from None

    loc = [doc['source_domain'] for doc in db['sources'].find(
            {
                'primary_location': {'$in': [country_code]},
                'include': True
            }
        )]
    return country_code, loc

def clean_summary(summary):
    """Normalise bullet characters and list numbering in a model response.

    Args:
        summary: Raw text returned by the model.

    Returns:
        The text with the mis-decoded bullet sequence replaced by ``*`` and
        exactly one space after every "<number>." list marker.
    """
    # '‚Ä¢' is what a bullet character turns into after a bad encoding
    # round-trip; replace it with a plain asterisk.
    cleaned = summary.replace('‚Ä¢', '*')

    # Ensure there is a single space after numbers followed by a period.
    # The negative lookahead leaves decimals such as "1.5" intact; without it
    # they were rewritten to "1. 5".
    cleaned = re.sub(r'(\d+)\.(?!\d)\s*', r'\1. ', cleaned)

    return cleaned


def _find_event_articles(collection, domains, event, country_code):
    """Return articles from ``domains`` tagged with ``event`` that mention ``country_code``.

    English articles are matched on ``en_cliff_locations`` and all other
    languages on ``cliff_locations``; English results come first.
    """
    event_filter = [
        {'event_type_civic_new': event},
        {'event_type_civic_new_2': event},
    ]
    english = list(collection.find({
        'source_domain': {'$in': domains},
        'language': 'en',
        '$or': event_filter,
        f'en_cliff_locations.{country_code}': {'$exists': True},
    }))
    other = list(collection.find({
        'source_domain': {'$in': domains},
        'language': {'$ne': 'en'},
        '$or': event_filter,
        f'cliff_locations.{country_code}': {'$exists': True},
    }))
    return english + other


def _count_misplaced_events(events_text, country):
    """Count event lines whose first parenthesised tag is not ``country``.

    Lines without a "(...)" pair are ignored. This presumably relies on the
    first prompt asking the model to tag each event with its country in
    parentheses -- confirm against event_prompts.csv.
    """
    misplaced = 0
    for line in events_text.split('\n'):
        line = line.strip()
        if not line:
            continue
        open_idx = line.find('(')
        close_idx = line.find(')', open_idx + 1)
        if open_idx == -1 or close_idx == -1:
            continue
        # Compare with the focus country
        if line[open_idx + 1:close_idx].lower() != country.lower():
            misplaced += 1
    return misplaced


# Function to use the API and get summarization of events. Much credit to @Zung-Ru Lin
def gpt_summarization(country, date_str, event, results_df):
    """Summarise one month of news for a country/event pair and append a row.

    Articles are taken from the ``articles-<year>-<month>`` collection. Local
    sources are used when any match; otherwise major international sources
    are used. Up to 200 articles are sampled, their first two sentences are
    sent to the model to obtain a list of events, and a second call condenses
    that list into a short neutral paragraph.

    Args:
        country: Country name accepted by ``country_sources``.
        date_str: Date in ``YYYY-MM-DD`` form; only year and month are used.
        event: Event type; also the key into ``event_prompts``.
        results_df: DataFrame to extend.

    Returns:
        ``results_df`` with one extra row on success, or unchanged when no
        article matched or every attempt failed.

    Raises:
        ValueError: If ``date_str`` is not in ``YYYY-MM-DD`` form.
    """
    # Read the key from the environment instead of hard-coding it here. When
    # the variable is unset the openai library's own configuration applies.
    api_key = os.getenv('OPENAI_API_KEY')
    if api_key:
        openai.api_key = api_key

    year_month = datetime.strptime(date_str, "%Y-%m-%d")
    country_code, loc = country_sources(country)
    articles = db[f'articles-{year_month.year}-{year_month.month}']

    cur = _find_event_articles(articles, loc, event, country_code)
    num_local = len(cur)

    international = [i['source_domain'] for i in db['sources'].find({'major_international': True, 'include': True})]
    cur2 = _find_event_articles(articles, international, event, country_code)
    num_int = len(cur2)

    # Fall back to international coverage only when there is no local coverage.
    if not cur:
        cur = cur2

    if not cur:
        # Nothing to summarise; calling the model with an empty article list
        # would only invite invented events.
        print(f"No articles found for {event} in {country} ({date_str}); skipping summary.")
        return results_df

    sample_size = 200
    success = False

    # Any failure (typically a prompt that is too long) shrinks the sample and
    # retries, but only while at least 50 articles would still be sampled.
    while not success and sample_size >= 50:
        try:
            sample_size = min(len(cur), sample_size)
            current_sub_docs = random.sample(cur, sample_size)
            extracted_sentences = []
            for doc in current_sub_docs:
                text = doc.get('maintext_translated')
                if not text:
                    # Skip documents with no translated body text.
                    continue
                first_two = extract_first_two_sentences(text)
                extracted_sentences.append(f"• {first_two}")

            combined_text = '\n'.join(extracted_sentences)

            prompt = event_prompts[event]['prompt_js'].format(event=event, country=country, combined_text=combined_text)

            chat_completion = openai.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": prompt}]
            )

            response = chat_completion.choices[0].message.content
            clean_response = clean_summary(response)

            new_prompt = f"""Below is a numbered list of recent events reported in the news for {country}. 
            Create a 2-3 sentence paragraph that summarizes these events as objectively as possible without any editorialization. This summary should exclude events that are not directly relevant to {country}. 
            If none of the listed events are relevant to the specific country, explicitly state 'None of the events listed are relevant to the specified country'. 
            The summary should be presented in past tense and exclude the names of individuals involved; it should be a very direct summary without any editorialization at the beginning (such as "a series of {event} have been reported")
            or at the end (such as "These events collectively indicate pervasive issues of").
            Maintain a neutral tone without extrapolating beyond the original descriptions. I want you to remove all editorialization (such as "underscores a broader show of", "creating a significant block to freedom of movement and exacerbating diplomatic tensions", "highlighted pervasive government corruption").
            any sentences and phrases that are not just purely factual and objective delivery of events.
                List:
                {clean_response} """

            misplaced_count = _count_misplaced_events(clean_response, country)

            # Second API Call using the new prompt
            second_chat_completion = openai.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": new_prompt}]
            )

            second_response = second_chat_completion.choices[0].message.content
            second_clean_response = clean_summary(second_response)

            print(second_clean_response)

            new_row = pd.DataFrame({
                'Country': [country],
                'Date': [date_str],
                'Event': [event],
                'Num Local': [num_local],
                'Num Int': [num_int],
                'Main Text': [combined_text],
                'Events List': [clean_response],
                'Misplaced Countries': [misplaced_count],
                'Final Summary': [second_clean_response]
            })

            # Use pandas.concat to append the new row
            results_df = pd.concat([results_df, new_row], ignore_index=True)

            success = True
        except Exception as e:
            print(f"\n\nError processing {event} with {sample_size} articles. Error: {str(e)}\n\n")
            sample_size -= 3  # Reduce the sample size by 3 and try again
    return results_df






def country_events(results_df, base_dir):
    """Summarise every flagged event in the most recent prediction batch.

    For the newest dated subdirectory of ``base_dir``, every
    ``predictions/predictions_lag_1_<country>.csv`` file is read. For each of
    the three rows preceding the last row, every column from the third
    onwards whose value equals 1 triggers ``gpt_summarization`` for that
    country, date and column name.

    Args:
        results_df: DataFrame that accumulates the summaries.
        base_dir: Folder containing the date-named result subdirectories.

    Returns:
        A ``(results_df, subdir)`` tuple where ``subdir`` is the path (already
        joined onto ``base_dir``) of the last subdirectory processed.

    Raises:
        FileNotFoundError: If ``base_dir`` has no dated subdirectory.
    """
    # number of subdirectories in this batch
    num = 1
    subdirs = get_most_recent_subdir(base_dir, num)
    if not subdirs:
        raise FileNotFoundError(f"No dated result subdirectories found in {base_dir!r}.")

    # Regex pattern to match the file name and extract the lag number and country
    file_name_pattern = re.compile(r'predictions_lag_(\d+)_([^.]+)\.csv')

    for subdir in subdirs:
        predictions_dir = os.path.join(subdir, 'predictions')
        # Sorted so that countries are processed in a reproducible order.
        csv_files = sorted(f for f in os.listdir(predictions_dir) if f.endswith('.csv'))

        for csv_file in csv_files:
            match = file_name_pattern.search(csv_file)
            if not match:
                # Skip only this file; the remaining files are still processed.
                print(f"File name {csv_file} does not match the expected pattern.")
                continue

            lag_number = int(match.group(1))
            country_name = match.group(2)
            if lag_number != 1:
                continue

            df2 = pd.read_csv(os.path.join(predictions_dir, csv_file))
            # This selects the rows from the fourth-to-last up to (but not including) the last row which is lag 1 prediction
            df2_final = df2.iloc[-4:-1]

            for _, row in df2_final.iterrows():
                date = row.iloc[0]
                for col in df2.columns[2:]:
                    if row[col] == 1:
                        results_df = gpt_summarization(country_name, date, col, results_df)
                        print(f"After update, results_df has {len(results_df)} rows.")

    return results_df, subdirs[-1]



base_dir = 'forecast-surges-pipeline/result'
results_df, subdir = country_events(results_df, base_dir)
# `subdir` is already joined onto base_dir, so joining base_dir again would
# double the relative prefix and point at a directory that does not exist.
savePath = os.path.join(subdir, 'event_summaries.csv')
print(savePath)
# Save next to the batch that was processed, plus a copy in the working directory.
results_df.to_csv(savePath, index=False)
results_df.to_csv('event_summaries.csv', index=False)




In Cape Town, social housing activists submitted 12 applications to the Department of Public Works for state-owned land to be released for affordable housing. U-turn Homeless Ministries and Independent Media launched a campaign to raise R1 million to support the homeless, with volunteers participating in “A Night on the Streets” to highlight homelessness. Animal-rights activists approached the Western Cape High Court to address the unresolved human-baboon conflict in the Cape Peninsula. Gender-based violence organizations rallied outside the Benoni Magistrates Court to support sexual assault victims of a pastor in Daveyton. Activists and civil society in Pietermaritzburg campaigned against the pollution of the Dusi River, urging the SA Human Rights Commission to intervene.
After update, results_df has 1 rows.
The National Health Insurance (NHI) Bill was signed into law by President Cyril Ramaphosa to provide universal health coverage and equal access to healthcare, leading to scrutiny 

In [16]:
# Display the accumulated summaries table.
# NOTE(review): the captured output below shows columns 'Num_Articles',
# 'Prompt' and 'Summaries', which differ from the columns results_df is
# created with in this notebook -- it looks like output from an earlier
# version of the code; re-run the cell to refresh it.
print(results_df)

       Country        Date             Event Num_Articles  \
0  South Sudan  2023-10-01          activism           13   
1  South Sudan  2023-10-01        corruption            6   
2  South Sudan  2023-10-01       legalaction           55   
3  South Sudan  2023-11-01          activism           14   
4  South Sudan  2023-11-01             purge           35   
5  South Sudan  2023-12-01  mobilizesecurity           24   

                                              Prompt  \
0  Here is a sample of newspaper articles reporti...   
1  Here is a sample of newspaper articles reporti...   
2  Here is a sample of newspaper articles reporti...   
3  Here is a sample of newspaper articles reporti...   
4  Here is a sample of newspaper articles reporti...   
5  Here is a sample of newspaper articles reporti...   

                                           Summaries  
0  1. **Escalating Cases of Land Grabbing in Juba...  
1  1. Misappropriation of USD 163,000 at Kenya Co...  
2  1. Formatio