In [None]:
import pandas as pd
from pathlib import Path
from PIL import Image
# turn off pandas warnings for this notebook
pd.options.mode.chained_assignment = None
import re


In [None]:
# Load the mock Telegram message data from CSV into a DataFrame.
df_processed = pd.read_csv(filepath_or_buffer="../mock_tg_data_to_inference_pythonic.csv")

In [None]:
# Render one raw (unprocessed) mock Telegram message in the notebook output.
def display_orig_message(row):
    """
    Display a mock telegram message: its id, text, timestamp and attached image.

    Parameters
    ----------
    row : mapping-like (e.g. one DataFrame row)
        Must provide the keys "message_id", "message_text", "date_time" and
        "image_attachment" (a path to an image file).

    Returns
    -------
    None
        Everything is printed / displayed as a side effect.
    """
    # Read every field up front so a missing key fails before anything is printed.
    image_path = Path(row["image_attachment"])
    message_text = row["message_text"]
    date_time = row["date_time"]

    print("MOCK TELEGRAM MESSAGE ID:", row["message_id"])
    print("MOCK TELEGRAM MESSAGE TEXT:", message_text)
    print("MOCK TELEGRAM MESSAGE DATETIME:", date_time)

    # Open the image inside a context manager so the underlying file handle is
    # released as soon as the image has been rendered, instead of lingering
    # until garbage collection.
    with Image.open(image_path) as image:
        # thumbnail() modifies the image in place so it fits within 800x800.
        image.thumbnail((800, 800))
        # NOTE(review): `display` is not imported in this notebook; it is
        # assumed to be the built-in provided by the IPython/Jupyter kernel.
        display(image)

    # Blank lines to visually separate consecutive messages.
    print("\n")
    print("\n")


In [None]:
# Preview a single message: render the first row of the DataFrame.
first_message = df_processed.iloc[0]
display_orig_message(first_message)

In [None]:
# Render every message in the DataFrame, one row at a time.
for row_label, message_row in df_processed.iterrows():
    display_orig_message(message_row)

In [None]:


# Regular-expression patterns used to spot mentions of Jewish institutions.
# Each entry is handed to re.search() as a pattern, so regex syntax is allowed.
phrases = [
    "synagogue",
    "kehillat",
    "congregation",
    "kesher",
    "tree of life",
    "jewish federation",
    "KNESETH",
    "jewish center",
    "beth",
    "L'SIMCHA",
    # "rabbi", then one or more characters, then "building"
    "rabbi.+building",
    "young israel",
]


In [None]:

# Find which of the given patterns (e.g. terms about Jewish institutions)
# occur in a piece of text.
def match_phrases(text, phrases):
    """
    Return the phrases (regex patterns) that match somewhere in ``text``.

    Parameters
    ----------
    text : str, or a missing / non-string scalar
        The text to search. Missing values (None, NaN, pd.NA) are treated as
        having no text; any other non-string scalar is converted with str().
    phrases : iterable of str
        Regular-expression patterns. Each is searched case-insensitively.

    Returns
    -------
    list of str
        The patterns themselves (not the matched substrings) that were found,
        in the same order as they appear in ``phrases``.

    Raises
    ------
    re.error
        If one of the phrases is not a valid regular expression.
    """
    if not isinstance(text, str):
        # Empty CSV cells come back from pandas as NaN; re.search() would raise
        # TypeError on them, so report "no matches" instead.
        if pd.isna(text):
            return []
        # Other scalars (e.g. a cell that was parsed as a number) are searched
        # via their string form.
        text = str(text)

    return [
        phrase
        for phrase in phrases
        if re.search(phrase, text, re.IGNORECASE)
    ]




# Row-level annotator: records which phrases were found in the message_text and
# ocr_results fields of one row, plus an overall boolean flag.
def id_jewish_institutions(row):
    """
    Return a copy of ``row`` annotated with matched institution terms.

    The returned row has three additional entries:

    - "matched_terms_message_text": phrases found in "message_text"
    - "matched_terms_ocr_results": phrases found in "ocr_results"
    - "jewish_institution_mentioned": True if either list is non-empty

    Missing (NaN) values in "message_text" / "ocr_results" are replaced by an
    empty string in the returned row.

    Parameters
    ----------
    row : pandas.Series (one DataFrame row)
        Must provide the keys "message_text" and "ocr_results".

    Returns
    -------
    Same type as ``row``
        A new, annotated row; the row passed in is left unchanged.
    """
    # Work on a copy: mutating the object handed over by DataFrame.apply is not
    # supported by pandas, and mutating a row taken via df.iloc[i] triggers
    # chained-assignment warnings.
    row = row.copy()

    # Normalise missing text to "" so the fields are always strings downstream.
    for text_column in ("message_text", "ocr_results"):
        if pd.isna(row[text_column]):
            row[text_column] = ""

    # `phrases` is the module-level pattern list defined earlier in the notebook.
    message_text_matches = match_phrases(row["message_text"], phrases)
    ocr_matches = match_phrases(row["ocr_results"], phrases)

    row["matched_terms_message_text"] = message_text_matches
    row["matched_terms_ocr_results"] = ocr_matches
    # Flag the row if either source of text produced at least one match.
    row["jewish_institution_mentioned"] = bool(message_text_matches or ocr_matches)
    return row


In [None]:

# Annotate every row with the matched terms and the institution flag.
df_processed = df_processed.apply(id_jewish_institutions, axis=1)
# Preview up to 10 random rows. Capping at the frame's length avoids the
# ValueError that sample() raises when asked for more rows than exist.
df_processed.sample(min(10, len(df_processed)))

In [None]:
# Render one processed mock Telegram message: the original message fields plus
# OCR results, weapon count, matched terms and the institution flag.
def display_processed_message(row):
    """
    Display a processed mock telegram message together with its analysis results.

    Prints the message id, text and timestamp, followed by the OCR results,
    the weapon instance count, the institution flag and the matched terms,
    then displays an image. When "weapon_instance_count" is greater than 0
    the image at "weapon_drawn_path" is shown; otherwise the image at
    "image_attachment" is shown.

    Parameters
    ----------
    row : mapping-like (e.g. one DataFrame row)
        Must provide the keys "message_id", "message_text", "date_time",
        "image_attachment", "weapon_instance_count", "ocr_results",
        "matched_terms_message_text", "matched_terms_ocr_results" and
        "jewish_institution_mentioned"; "weapon_drawn_path" is read only when
        the weapon count is positive.

    Returns
    -------
    None
        Everything is printed / displayed as a side effect.
    """
    weapon_instance_count = row["weapon_instance_count"]

    # Choose which image to show based on whether any weapons were identified.
    if weapon_instance_count > 0:
        image_path = Path(row["weapon_drawn_path"])
    else:
        image_path = Path(row["image_attachment"])

    # Read the remaining fields up front so a missing key fails before
    # anything is printed.
    message_text = row["message_text"]
    date_time = row["date_time"]
    ocr_results = row["ocr_results"]
    matched_terms_message_text = row["matched_terms_message_text"]
    jewish_institution_mentioned = row["jewish_institution_mentioned"]
    matched_terms_ocr_results = row["matched_terms_ocr_results"]

    # --- original message ---
    print("=====================MOCK TELEGRAM MESSAGE=====================")
    print("MOCK TELEGRAM MESSAGE ID:", row["message_id"])
    print("MOCK TELEGRAM MESSAGE TEXT:", message_text)
    print("MOCK TELEGRAM MESSAGE DATETIME:", date_time)

    # --- processing results ---
    print("=====================DATA PROCESSING RESULTS=====================")
    print("OCR RESULTS:", ocr_results)
    print("WEAPONS IDENTIFIED:", weapon_instance_count)
    print("JEWISH INSTITUTION MENTIONED:", jewish_institution_mentioned)
    print("MATCHED TERMS IN MOCK MESSAGE TEXT:", matched_terms_message_text)
    print("MATCHED TERMS IN OCR RESULTS:", matched_terms_ocr_results)

    # Open the image inside a context manager so the underlying file handle is
    # released as soon as the image has been rendered.
    with Image.open(image_path) as image:
        # thumbnail() modifies the image in place so it fits within 800x800.
        image.thumbnail((800, 800))
        # NOTE(review): `display` is not imported in this notebook; it is
        # assumed to be the built-in provided by the IPython/Jupyter kernel.
        display(image)

    # Blank lines to visually separate consecutive messages.
    print("\n")
    print("\n")


# Display all processed results

In [None]:
# Render every processed message in the DataFrame, one row at a time.
for row_label, message_row in df_processed.iterrows():
    display_processed_message(message_row)

In [None]:
# Write the annotated DataFrame to CSV, leaving out the index column.
df_processed.to_csv(path_or_buf="../mock_telegram_messages_processed.csv", index=False)