In [1]:
from alarms.utils import AnalysisArea, stringify_messages, define_message_type, define_region, timedelta_to_hours
import json
import pandas as pd
from datetime import datetime

In [2]:
SELECTED_REGION = AnalysisArea.LVIV

In [3]:
# Load Lviv data into dataframe
if SELECTED_REGION == AnalysisArea.LVIV:
    ALARM_START_MARKERS = [r"(?:\n|.)*–£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏–≤–æ–≥–∞(?:\n|.)*"]
    ALARM_END_MARKERS = [r"(?:\n|.)*–£–≤–∞–≥–∞! –í—ñ–¥–±—ñ–π –ø–æ–≤—ñ—Ç—Ä—è–Ω–æ—ó —Ç—Ä–∏–≤–æ–≥–∏(?:\n|.)*"]
    filename = "-1001399934598-channel-messages.json"
# Load all regions data into dataframe
if SELECTED_REGION == AnalysisArea.ALL_UKRAINE:
    ALARM_START_MARKERS = [r"(?:\n|.)*(?:–ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏–≤–æ–≥–∞|–ó–∞–≥—Ä–æ–∑–∞ –∞—Ä—Ç–æ–±—Å—Ç—Ä—ñ–ª—É)(?: –≤|)(?:\n|.)*"]
    ALARM_END_MARKERS = [r"(?:\n|.)*–í—ñ–¥–±—ñ–π (?:(?:–ø–æ–≤—ñ—Ç—Ä—è–Ω–æ—ó |)—Ç—Ä–∏–≤–æ–≥–∏|–∑–∞–≥—Ä–æ–∑–∏ –∞—Ä—Ç–æ–±—Å—Ç—Ä—ñ–ª—É)(?: –≤|)(?:\n|.)*"]
    filename = "-1766138888-channel-messages.json"
# Load data
with open(f"../exports/{filename}", "r") as f:  # NOQA
    raw_messages = json.loads(f.read())
raw_messages = pd.DataFrame(raw_messages["messages"]) if raw_messages.get("messages") else pd.DataFrame(
    raw_messages)
print(f"- Processing {len(raw_messages)} channel messages.")

- Processing 2365 channel messages.


In [4]:
# Convert raw messages into expected format
channel_messages = raw_messages[['date', 'text']].copy()
channel_messages.columns = ["datetime", "message"]
channel_messages["message"] = channel_messages["message"].apply(stringify_messages)
# Convert string into datetime
channel_messages["datetime"] = pd.to_datetime(channel_messages["datetime"], format="%Y-%m-%dT%H:%M:%S")
channel_messages.head()
# Skip dates < February 24th as alarms weren't working correctly during the first day of the war
channel_messages.drop(channel_messages[channel_messages["datetime"] < datetime(2022, 2, 25, 0, 0, 0, 0)].index, inplace=True)
# Skip dates > September 14th to keep data consistent
channel_messages.drop(channel_messages[channel_messages["datetime"] > datetime(2022, 9, 15, 0, 0, 0, 0)].index, inplace=True)
# Sort to get historical order of events
channel_messages = channel_messages.sort_values(by="datetime", ascending=True)
channel_messages

Unnamed: 0,datetime,message
1621,2022-02-25 06:53:08,–û–ø–æ–≤—ñ—â–µ–Ω–Ω—è –¶–ó:\n–£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏–≤–æ–≥–∞! –£–≤–∞–≥...
1622,2022-02-25 07:56:00,–£–≤–∞–≥–∞! –í—ñ–¥–±—ñ–π –ø–æ–≤—ñ—Ç—Ä—è–Ω–æ—ó —Ç—Ä–∏–≤–æ–≥–∏!\n–£–≤–∞–≥–∞! –í—ñ–¥–±...
1623,2022-02-25 07:56:16,–£–≤–∞–≥–∞! –í—ñ–¥–±—ñ–π –ø–æ–≤—ñ—Ç—Ä—è–Ω–æ—ó —Ç—Ä–∏–≤–æ–≥–∏!\n–£–≤–∞–≥–∞! –í—ñ–¥–±...
1624,2022-02-25 12:17:40,–£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏–≤–æ–≥–∞! –£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏...
1625,2022-02-25 12:17:45,–£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏–≤–æ–≥–∞! –£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏...
...,...,...
2360,2022-09-11 20:44:41,–û–ø–æ–≤—ñ—â–µ–Ω–Ω—è –¶–ó:\n–£–≤–∞–≥–∞! –í—ñ–¥–±—ñ–π –ø–æ–≤—ñ—Ç—Ä—è–Ω–æ—ó —Ç—Ä–∏–≤–æ...
2361,2022-09-14 16:37:19,–£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏–≤–æ–≥–∞!\n–£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä...
2362,2022-09-14 16:54:08,–û–ø–æ–≤—ñ—â–µ–Ω–Ω—è –¶–ó:\n–£–≤–∞–≥–∞! –í—ñ–¥–±—ñ–π –ø–æ–≤—ñ—Ç—Ä—è–Ω–æ—ó —Ç—Ä–∏–≤–æ...
2363,2022-09-14 16:59:10,–£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏–≤–æ–≥–∞!\n–£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä...


In [5]:
# Mark info messages and actual alarms
channel_messages["message_type"] = channel_messages["message"].apply(  # NOQA
    define_message_type(ALARM_START_MARKERS, ALARM_END_MARKERS))  # NOQA
# # Look at alarm messages and copy the slice into separate datafame
alarm_messages_df = channel_messages[channel_messages["message_type"] != "info"].copy()
print(f"- Collected {len(alarm_messages_df)} alarm messages.")
alarm_messages_df

- Collected 689 alarm messages.


Unnamed: 0,datetime,message,message_type
1621,2022-02-25 06:53:08,–û–ø–æ–≤—ñ—â–µ–Ω–Ω—è –¶–ó:\n–£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏–≤–æ–≥–∞! –£–≤–∞–≥...,start
1622,2022-02-25 07:56:00,–£–≤–∞–≥–∞! –í—ñ–¥–±—ñ–π –ø–æ–≤—ñ—Ç—Ä—è–Ω–æ—ó —Ç—Ä–∏–≤–æ–≥–∏!\n–£–≤–∞–≥–∞! –í—ñ–¥–±...,end
1623,2022-02-25 07:56:16,–£–≤–∞–≥–∞! –í—ñ–¥–±—ñ–π –ø–æ–≤—ñ—Ç—Ä—è–Ω–æ—ó —Ç—Ä–∏–≤–æ–≥–∏!\n–£–≤–∞–≥–∞! –í—ñ–¥–±...,end
1624,2022-02-25 12:17:40,–£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏–≤–æ–≥–∞! –£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏...,start
1625,2022-02-25 12:17:45,–£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏–≤–æ–≥–∞! –£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏...,start
...,...,...,...
2360,2022-09-11 20:44:41,–û–ø–æ–≤—ñ—â–µ–Ω–Ω—è –¶–ó:\n–£–≤–∞–≥–∞! –í—ñ–¥–±—ñ–π –ø–æ–≤—ñ—Ç—Ä—è–Ω–æ—ó —Ç—Ä–∏–≤–æ...,end
2361,2022-09-14 16:37:19,–£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏–≤–æ–≥–∞!\n–£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä...,start
2362,2022-09-14 16:54:08,–û–ø–æ–≤—ñ—â–µ–Ω–Ω—è –¶–ó:\n–£–≤–∞–≥–∞! –í—ñ–¥–±—ñ–π –ø–æ–≤—ñ—Ç—Ä—è–Ω–æ—ó —Ç—Ä–∏–≤–æ...,end
2363,2022-09-14 16:59:10,–£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏–≤–æ–≥–∞!\n–£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä...,start


In [6]:
# Look at info messages
info_messages_df = channel_messages[channel_messages["message_type"] == "info"]
print(f"- Collected {len(info_messages_df)} info messages.")
info_messages_df

- Collected 55 info messages.


Unnamed: 0,datetime,message,message_type
1628,2022-02-25 14:44:58,–í–∞–∂–ª–∏–≤–∞ —ñ–Ω—Ñ–æ—Ä–º–∞—Ü—ñ—è:\nhttps://www.facebook.com/...,info
1629,2022-02-25 18:41:11,–®–∞–Ω–æ–≤–Ω—ñ –º–µ—à–∫–∞–Ω—Ü—ñ –õ—å–≤—ñ–≤—â–∏–Ω–∏!\n–í–∂–µ –¥–≤–∞ –¥–Ω—ñ –º–∏ –∑–∞...,info
1630,2022-02-25 18:59:25,–ö–æ–º–µ–Ω–¥–∞–Ω—Ç—Å—å–∫–∞ –≥–æ–¥–∏–Ω–∞: —â–æ —ñ —è–∫:,info
1631,2022-02-25 18:59:36,,info
1632,2022-02-25 20:38:59,https://www.facebook.com/100064486998276/posts...,info
1633,2022-02-25 20:46:43,‚Äã‚Äã–Ü–Ω—Ñ–æ—Ä–º–∞—Ü—ñ—è –ø—Ä–æ —Å–∏—Å—Ç–µ–º–∏ –æ–ø–æ–≤—ñ—â–µ–Ω–Ω—è —â–æ–¥–æ –∫–æ–º–µ–Ω...,info
1638,2022-02-26 08:58:47,–õ—å–≤—ñ–≤‚Äô—è–Ω –ø–æ–ø–µ—Ä–µ–¥–∂–∞—é—Ç—å –ø—Ä–æ –∑–∞–±–æ—Ä–æ–Ω—É –ø–æ–ª—å–æ—Ç—ñ–≤ –ª—ñ...,info
1648,2022-02-27 11:11:15,–î–æ —É–≤–∞–≥–∏ –≥–æ–ª—ñ–≤ —Å—ñ–ª—å—Å—å–∫–∏—Ö —Ç–∞ —Å–µ–ª–∏—â–Ω–∏—Ö —Ç–µ—Ä–∏—Ç–æ—Ä—ñ–∞...,info
1649,2022-02-27 11:42:48,–§–æ—Ç–æ –≤—ñ–¥ *@@##‚Ç¥‚Ç¥*,info
1650,2022-02-27 11:43:06,–û—Ñ—ñ—Ü—ñ–π–Ω—ñ –∫–∞–Ω–∞–ª–∏ –∫–æ–º—É–Ω—ñ–∫–∞—Ü—ñ—ó –õ—å–≤—ñ–≤—Å—å–∫–æ—ó –æ–±–ª–∞—Å–Ω–æ...,info


In [7]:
# Define region per alarm
if SELECTED_REGION == AnalysisArea.LVIV:
    alarm_messages_df["region"] = "Lviv region"
if SELECTED_REGION == AnalysisArea.ALL_UKRAINE:
    alarm_messages_df["region"] = alarm_messages_df["message"].apply(define_region)
    # Drop test region
    alarm_messages_df.drop(alarm_messages_df[alarm_messages_df["region"] == "–¢–µ—Å—Ç–æ–≤–∏–π –†–µ–≥—ñ–æ–Ω"].index, inplace=True)
    # Focus only on regions, drop smaller areas (cities, districts, etc.)
    alarm_messages_df.drop(alarm_messages_df[~alarm_messages_df["region"].str.contains("region")].index, inplace=True)

In [8]:
if SELECTED_REGION == AnalysisArea.LVIV:
    # Filter all the duplicates, as Lviv alarms are usually (but not always) annouced 2-3 times in a row
    clean_alarm_messages = []
    for _, alarm_row in alarm_messages_df.iterrows():
        # The first entry is always a start
        if len(clean_alarm_messages) == 0:
            clean_alarm_messages.append(alarm_row.to_dict())
            continue
        # Skip duplicates (if the previous message have the same type)
        if clean_alarm_messages[-1]["message_type"] == alarm_row["message_type"]:
            continue
        clean_alarm_messages.append(alarm_row.to_dict())
if SELECTED_REGION == AnalysisArea.ALL_UKRAINE:
    # Filtering the first bugged message
    clean_alarm_messages = [alarm_row.to_dict() for _, alarm_row in alarm_messages_df.iterrows()][1:]
print(f"- Collected {len(clean_alarm_messages)} clean alarm messages.")  # NOQA

- Collected 390 clean alarm messages.


In [9]:
# Combine alarm messages into actual alarms
alarms = []
alarms_without_start = []
for alarm_message in clean_alarm_messages:
    # Create alarms from start messages
    if alarm_message["message_type"] == "start":
        alarms.append({"start_datetime": alarm_message["datetime"], "start_message": alarm_message["message"],
                       "region": alarm_message["region"]})
        continue
    # Find start message index, when iterating end messages
    smi = None
    for i in range(len(alarms) - 1, -1, -1):
        if alarms[i]["region"] == alarm_message["region"]:
            smi = i
            break
    # Skip alarms with no start message (bugs)
    if smi is None:
        alarms_without_start.append(alarm_message)
        continue
    # Extend the start message with end message
    alarms[smi]["end_datetime"] = alarm_message["datetime"]
    alarms[smi]["end_message"] = alarm_message["message"]
    alarms[smi]["timedelta"] = alarms[smi]["end_datetime"] - alarms[smi]["start_datetime"]
    alarms[smi]["duration_hours"] = timedelta_to_hours(alarms[smi]["timedelta"])
print(f"- Can't find start messages for {len(alarm_message)} alarms, skipping.")  # NOQA
all_alarms_df = pd.DataFrame(alarms)
all_alarms_df["day_of_the_year"] = all_alarms_df["start_datetime"].apply(lambda x: x.timetuple().tm_yday)
print(f"- Collected {len(all_alarms_df)} alarms.")
all_alarms_df.head(7)

- Can't find start messages for 4 alarms, skipping.
- Collected 195 alarms.


Unnamed: 0,start_datetime,start_message,region,end_datetime,end_message,timedelta,duration_hours,day_of_the_year
0,2022-02-25 06:53:08,–û–ø–æ–≤—ñ—â–µ–Ω–Ω—è –¶–ó:\n–£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏–≤–æ–≥–∞! –£–≤–∞–≥...,Lviv region,2022-02-25 07:56:00,–£–≤–∞–≥–∞! –í—ñ–¥–±—ñ–π –ø–æ–≤—ñ—Ç—Ä—è–Ω–æ—ó —Ç—Ä–∏–≤–æ–≥–∏!\n–£–≤–∞–≥–∞! –í—ñ–¥–±...,0 days 01:02:52,1.047778,56
1,2022-02-25 12:17:40,–£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏–≤–æ–≥–∞! –£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏...,Lviv region,2022-02-25 13:00:34,–£–≤–∞–≥–∞! –í—ñ–¥–±—ñ–π –ø–æ–≤—ñ—Ç—Ä—è–Ω–æ—ó —Ç—Ä–∏–≤–æ–≥–∏!\n–£–≤–∞–≥–∞! –í—ñ–¥–±...,0 days 00:42:54,0.715,56
2,2022-02-26 06:15:28,–£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏–≤–æ–≥–∞! –£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏...,Lviv region,2022-02-26 07:11:56,–£–≤–∞–≥–∞! –í—ñ–¥–±—ñ–π –ø–æ–≤—ñ—Ç—Ä—è–Ω–æ—ó —Ç—Ä–∏–≤–æ–≥–∏!\n–£–≤–∞–≥–∞! –í—ñ–¥–±...,0 days 00:56:28,0.941111,57
3,2022-02-26 13:46:10,üö®–£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏–≤–æ–≥–∞‚ùóÔ∏è\n–£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ ...,Lviv region,2022-02-26 14:46:45,‚ùóÔ∏è–£–≤–∞–≥–∞! –í—ñ–¥–±—ñ–π –ø–æ–≤—ñ—Ç—Ä—è–Ω–æ—ó —Ç—Ä–∏–≤–æ–≥–∏‚ùóÔ∏è\nüá¨üáßWarnin...,0 days 01:00:35,1.009722,57
4,2022-02-26 17:18:39,üö®–£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏–≤–æ–≥–∞‚ùóÔ∏è\n–£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ ...,Lviv region,2022-02-26 18:06:54,‚ùóÔ∏è–£–≤–∞–≥–∞! –í—ñ–¥–±—ñ–π –ø–æ–≤—ñ—Ç—Ä—è–Ω–æ—ó —Ç—Ä–∏–≤–æ–≥–∏‚ùóÔ∏è\nüá¨üáßWarnin...,0 days 00:48:15,0.804167,57
5,2022-02-26 19:59:11,üö®–£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏–≤–æ–≥–∞‚ùóÔ∏è\n–£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ ...,Lviv region,2022-02-26 20:42:10,‚ùóÔ∏è–£–≤–∞–≥–∞! –í—ñ–¥–±—ñ–π –ø–æ–≤—ñ—Ç—Ä—è–Ω–æ—ó —Ç—Ä–∏–≤–æ–≥–∏‚ùóÔ∏è\nüá¨üáßWarnin...,0 days 00:42:59,0.716389,57
6,2022-02-28 20:54:13,–£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏–≤–æ–≥–∞! –£–≤–∞–≥–∞! –ü–æ–≤—ñ—Ç—Ä—è–Ω–∞ —Ç—Ä–∏...,Lviv region,2022-02-28 21:23:49,–£–≤–∞–≥–∞! –í—ñ–¥–±—ñ–π –ø–æ–≤—ñ—Ç—Ä—è–Ω–æ—ó —Ç—Ä–∏–≤–æ–≥–∏! –£–≤–∞–≥–∞! –í—ñ–¥–±—ñ...,0 days 00:29:36,0.493333,59


In [10]:
# Save to file to create map
if SELECTED_REGION == AnalysisArea.ALL_UKRAINE:
    filename = "ukraine_alarms.json"
if SELECTED_REGION == AnalysisArea.LVIV:
    filename = "lviv_alarms.json"
with open(f"processed_data/{filename}", "w") as f:  # NOQA
    f.write(all_alarms_df.to_json())