In [None]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import pipeline
import torch

# Lazily-created text2text pipeline shared by all calls to
# modify_summary_with_llm, so the model is loaded once instead of once per row.
_SUMMARY_GENERATOR = None


def modify_summary_with_llm(summary):
    """Ask a small T5 model to rephrase ``summary`` without the nonprofit's name.

    Args:
        summary: Text describing a nonprofit. ``None``, empty, or
            whitespace-only input is treated as "nothing to rewrite".

    Returns:
        The ``generated_text`` of the first sequence produced by the
        ``text2text-generation`` pipeline, or ``""`` when there is no input
        text (the model is not invoked in that case).
    """
    global _SUMMARY_GENERATOR

    text = "" if summary is None else str(summary).strip()
    if not text:
        # Nothing to rephrase; avoid loading/running the model on an empty context.
        return ""

    if _SUMMARY_GENERATOR is None:
        # Use the MPS device when torch reports it as available, else the CPU.
        device = "mps" if torch.backends.mps.is_available() else "cpu"
        _SUMMARY_GENERATOR = pipeline(
            "text2text-generation", model="t5-small", device=device
        )

    # Section markers are placed on their own lines; adjacent string literals
    # are concatenated with no separator, so the newlines must be explicit.
    prompt = (
        "### Instructions ###\n"
        "Rephrase this summary to eliminate the name of the nonprofit, or replace the name with "
        "'nonprofit', 'organization', 'nonprofit organization', or something similar. "
        "Also avoid anything that would help a human figure out the name. "
        "Ensure the summary is not all caps.\n"
        "### Context ###\n"
        f"{text}"
    )

    # Generate a single rewritten candidate and return its text.
    outputs = _SUMMARY_GENERATOR(prompt, max_length=300, num_return_sequences=1)
    return outputs[0]['generated_text']

def ensure_max_200_words(summary):
    """Cap ``summary`` at 200 whitespace-separated words.

    Text that is already within the limit is returned untouched. Longer text
    is cut to its first 200 words, re-joined with single spaces, and suffixed
    with ``'...'``.
    """
    word_limit = 200
    tokens = summary.split()
    if len(tokens) <= word_limit:
        # Within the limit: hand back the original string, whitespace intact.
        return summary
    truncated = ' '.join(tokens[:word_limit])
    return truncated + '...'

def summarize(strings):
    """Build an anonymized summary from a nonprofit's description fields.

    Args:
        strings: A 4-item sequence ``(desc, mission_desc, activity, detail)``.
            The first non-empty value in the priority order
            mission_desc, activity, detail, desc is used as the source text.

    Returns:
        The source text after it has been rewritten by
        ``modify_summary_with_llm`` and capped by ``ensure_max_200_words``,
        or ``''`` when all four fields are empty.
    """
    desc, mission_desc, activity, detail = strings

    # Pick the first non-empty field in priority order.
    summary = mission_desc or activity or detail or desc
    if not summary:
        # No text at all: nothing to rewrite, so skip the model entirely.
        return ''

    # Modify the summary to remove unique identifiers
    modified_summary = modify_summary_with_llm(summary)

    # Ensure the summary is no more than 200 words
    final_summary = ensure_max_200_words(modified_summary)

    print('_____________')
    print(strings)
    print('modified_summary:', final_summary)
    print('^^^^^^^^^^^^^')
    # Return the rewritten text, not the original input, so that callers
    # actually store the anonymized version.
    return final_summary

# strings = [
#     "The nonprofit focuses on providing educational resources to underprivileged communities.",
#     "It organizes workshops and seminars to enhance skills among young adults.",
#     "The organization collaborates with local schools and libraries to expand access to learning materials.",
#     "Volunteer programs are set up to support community development initiatives."
# ]

# summary = summarize(strings)
# print("Summary:", summary)

def summarize_nonprofits_with_summary(input_file_path, output_file_path):
    """Add a per-row ``summary`` column to a nonprofit TSV file.

    Reads the tab-separated file at ``input_file_path``, calls ``summarize``
    once per row with that row's ``Desc``, ``MissionDesc``,
    ``ActivityOrMissionDesc`` and ``SupplementalInformationDetail`` values
    (missing values replaced by ``''``), stores each result in a new
    ``summary`` column, and writes the table to ``output_file_path`` as TSV
    without the index.

    Args:
        input_file_path: Path of the TSV file to read.
        output_file_path: Path of the TSV file to write.
    """
    # Read the TSV file using pandas
    df = pd.read_csv(input_file_path, sep='\t')

    # Select relevant fields; the order matches what summarize() unpacks.
    fields_to_extract = ['Desc', 'MissionDesc', 'ActivityOrMissionDesc', 'SupplementalInformationDetail']
    relevant_df = df[fields_to_extract]

    # Collect one summary per row. Assigning inside the loop would broadcast a
    # single value to the whole column and leave every row with the last
    # row's summary.
    summaries = []
    for values in relevant_df.itertuples(index=False, name=None):
        # Replace NaN/missing cells with '' so summarize() can skip them.
        summary_parts = ['' if pd.isna(value) else value for value in values]
        summaries.append(summarize(summary_parts))

    # Add the summaries to the DataFrame as a new column (row order preserved).
    df['summary'] = summaries

    # Save the modified DataFrame back to a TSV file
    df.to_csv(output_file_path, sep='\t', index=False)

# Example run: read the sample TSV and write a copy with the added summaries.
summarize_nonprofits_with_summary(
    input_file_path='nonprofits.sample.tsv',
    output_file_path='nonprofits.sample.summaries.tsv',
)