In [1]:
import pandas as pd
import requests
import scipy.sparse as sp
import json
import os
import pickle

from bs4 import BeautifulSoup

from utils import weatherapi
from utils import isw_preprocessing
from utils import text_preprocessing


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dusty3ntity/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dusty3ntity/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dusty3ntity/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Folder holding the trained classifier pickles and folder for everything
# this notebook downloads or produces.
MODELS_FOLDER = "data/models"
OUTPUT_FOLDER = "data/prediction"

# exist_ok=True makes the call idempotent and avoids the check-then-create race
# of testing os.path.exists() first.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# File-name parts (<model>_<version>.pkl) of the pickled text transformers.
tfidf_transformer_model = "tfidf_transformer"
tfidf_transformer_version = "v1"

count_vectorizer_model = "count_vectorizer"
count_vectorizer_version = "v1"

# File-name stems (<model>.pkl) of the pickled classifiers in MODELS_FOLDER.
logistic_regression_model = "LR"
multi_layer_perceptron_model = "MLP"
random_forest_classifier_model = "RFC"

## 1. Load models and data

In [4]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # NOTE(security): pickle.load can execute arbitrary code while loading.
    # Only load artifacts that come from a trusted source.
    with open(path, "rb") as pickle_file:
        return pickle.load(pickle_file)


# Text transformers applied to the ISW report in later cells.
tfidf = _load_pickle(f"data/isw/{tfidf_transformer_model}_{tfidf_transformer_version}.pkl")
cv = _load_pickle(f"data/isw/{count_vectorizer_model}_{count_vectorizer_version}.pkl")

# Encoder used later to turn `hour_conditions` strings into integer ids.
label_encoder = _load_pickle("data/isw/hour_conditions_label_encoder.pkl")

# Pickled models; of these, only MLP is used for prediction in the cells below.
LR = _load_pickle(f"{MODELS_FOLDER}/{logistic_regression_model}.pkl")
MLP = _load_pickle(f"{MODELS_FOLDER}/{multi_layer_perceptron_model}.pkl")
RFC = _load_pickle(f"{MODELS_FOLDER}/{random_forest_classifier_model}.pkl")

In [5]:
# Lookup table of regions: names, center cities (UA/EN), alternative names and ids.
REGIONS_DICTIONARY_FILE = "data/data_before_lab_3/regions.csv"

df_regions = pd.read_csv(REGIONS_DICTIONARY_FILE)

# Preview up to the first 25 rows of the table.
df_regions.head(n=25)

Unnamed: 0,region,center_city_ua,center_city_en,region_alt,region_id
0,АР Крим,Сімферополь,Simferopol,Крим,1
1,Вінницька,Вінниця,Vinnytsia,Вінниччина,2
2,Волинська,Луцьк,Lutsk,Волинь,3
3,Дніпропетровська,Дніпро,Dnipro,Дніпропетровщина,4
4,Донецька,Донецьк,Donetsk,Донеччина,5
5,Житомирська,Житомир,Zhytomyr,Житомирщина,6
6,Закарпатська,Ужгород,Uzhgorod,Закарпаття,7
7,Запорізька,Запоріжжя,Zaporozhye,Запоріжжя,8
8,Івано-Франківська,Івано-Франківськ,Ivano-Frankivsk,Івано-Франківщина,9
9,Київська,Київ,Kyiv,Київщина,10


## 2. Set up prediction region

In [6]:
# Prediction target: region (matched against the `region_alt` column) and date (YYYY-MM-DD).
region = "Львівщина"
date = "2023-04-22"

# English name of the region's center city, looked up in the regions table.
matching_cities = df_regions[df_regions["region_alt"] == region]["center_city_en"].values
if len(matching_cities) == 0:
    # Fail with an actionable message instead of a bare IndexError from `[0]`.
    raise ValueError(f"Unknown region {region!r}: no row in df_regions has this value in 'region_alt'")

city = matching_cities[0]

## 3. Get weather

In [7]:
# The lower-cased city name is used both as the cache sub-folder and in the file name.
city_key = city.lower()

weather_forecast_file_name = f"weather___{city_key}__{date}.json"
weather_forecast_dir = f"{OUTPUT_FOLDER}/weather/{city_key}"

# Location string handed to the weather helper when the forecast is requested.
location = f"{city},Ukraine"

In [8]:
# Full path of the cached hourly forecast for the selected city and date.
weather_forecast_path = f"{weather_forecast_dir}/{weather_forecast_file_name}"

if os.path.isfile(weather_forecast_path):
    # A forecast for this city/date is already cached on disk; nothing to fetch.
    print(f"Weather data for {region} and {date} is ready")
else:
    # Fetch the hourly forecast and store it as pretty-printed JSON.
    city_weather_json = weatherapi.get_hourly_weather_json(location, date)
    json_object = json.dumps(city_weather_json, indent=4)

    if not os.path.exists(weather_forecast_dir):
        os.makedirs(weather_forecast_dir)

    with open(weather_forecast_path, "w") as outfile:
        outfile.write(json_object)

https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/Lviv,Ukraine/2023-04-22?unitGroup=metric


In [9]:
# Read the cached forecast through a context manager so the file handle is
# closed as soon as the JSON has been parsed.
with open(f"{weather_forecast_dir}/{weather_forecast_file_name}", "rb") as weather_file:
    weather_for_day_hourly = json.load(weather_file)
weather_df = pd.DataFrame.from_dict(weather_for_day_hourly)

weather_df["day_datetime"] = pd.to_datetime(weather_df["day_datetime"])
# Missing values are replaced with the sentinel -10.
weather_df = weather_df.fillna(-10)
# The resolved address looks like "<city>, <country>"; keep the part before the first comma.
weather_df["city"] = weather_df["city_resolvedAddress"].apply(lambda x: x.split(",")[0])
# Map the region-style name to the city name; presumably needed so the value
# matches `center_city_ua` in the regions table — TODO confirm.
weather_df["city"] = weather_df["city"].replace("Хмельницька область", "Хмельницький")

In [10]:
# Show the frame's (rows, columns) and preview the first five rows.
print(weather_df.shape)
weather_df.head()

(24, 68)


Unnamed: 0,hour_severerisk,city_latitude,city_longitude,city_resolvedAddress,city_address,city_timezone,city_tzoffset,day_severerisk,day_datetime,day_datetimeEpoch,...,hour_visibility,hour_cloudcover,hour_solarradiation,hour_solarenergy,hour_uvindex,hour_conditions,hour_icon,hour_stations,hour_source,city
0,10,49.8444,24.0254,"Львів, Україна","Lviv,Ukraine",Europe/Kiev,3.0,10,2023-04-22,1682110800,...,-10,91.6,-10,-10,-10,Overcast,cloudy,[remote],obs,Львів
1,10,49.8444,24.0254,"Львів, Україна","Lviv,Ukraine",Europe/Kiev,3.0,10,2023-04-22,1682110800,...,-10,68.0,-10,-10,-10,Partially cloudy,partly-cloudy-night,[remote],obs,Львів
2,10,49.8444,24.0254,"Львів, Україна","Lviv,Ukraine",Europe/Kiev,3.0,10,2023-04-22,1682110800,...,-10,64.0,-10,-10,-10,Partially cloudy,partly-cloudy-night,[remote],obs,Львів
3,10,49.8444,24.0254,"Львів, Україна","Lviv,Ukraine",Europe/Kiev,3.0,10,2023-04-22,1682110800,...,-10,34.7,-10,-10,-10,Partially cloudy,partly-cloudy-night,[remote],obs,Львів
4,10,49.8444,24.0254,"Львів, Україна","Lviv,Ukraine",Europe/Kiev,3.0,10,2023-04-22,1682110800,...,-10,0.0,-10,-10,-10,Clear,clear-night,[remote],obs,Львів


In [11]:
# Attach the region columns (including region_id) by matching the Ukrainian city name.
weather_df_v2 = pd.merge(weather_df, df_regions, left_on="city", right_on="center_city_ua")

# pd.merge defaults to an inner join, which silently drops hourly rows whose city
# has no match in the regions table. Later cells need every hourly row, so fail
# here with a clear message instead of a shape mismatch further down.
if len(weather_df_v2) != len(weather_df):
    raise ValueError(
        f"Region lookup changed the number of hourly rows ({len(weather_df)} -> {len(weather_df_v2)}); "
        "check that weather_df['city'] matches df_regions['center_city_ua']"
    )

## 3b. Get ISW report

In [12]:
# Cache location of the ISW report page: one HTML file per report date.
isw_report_html_file_name = f"isw___{date}.html"
isw_report_html_dir = f"{OUTPUT_FOLDER}/isw"

In [13]:
# Full path of the cached ISW report page for the selected date.
isw_report_html_path = f"{isw_report_html_dir}/{isw_report_html_file_name}"

if not os.path.isfile(isw_report_html_path):
    BASE_URL = "https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment"
    months = ["january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"]
    date_arr = date.split("-")

    # The report URL ends in "<month-name>-<day>-<year>", with no zero padding on the day.
    url = f"{BASE_URL}-{months[int(date_arr[1])-1]}-{int(date_arr[2])}-{int(date_arr[0])}"
    print(url)

    # Bound the wait and reject HTTP errors before anything is written: an error
    # page saved here would be treated as the cached report on every later run.
    page = requests.get(url, timeout=30)
    page.raise_for_status()

    os.makedirs(isw_report_html_dir, exist_ok=True)
    with open(isw_report_html_path, "wb") as f:
        f.write(page.content)
else:
    print(f"ISW report for {date} is ready")

ISW report for 2023-04-22 is ready


In [14]:
with open(f"{isw_report_html_dir}/{isw_report_html_file_name}", "r", encoding="utf8") as cfile:
    html = BeautifulSoup(cfile.read(), features="html.parser")

# The page is fully parsed above, so the file can be closed before processing.
report_div = None
if html.body is not None:
    report_div = html.body.find("div", attrs={"class": "field-type-text-with-summary"})

if report_div is None:
    # Without this check str(None) == "None" would be vectorised as the report text.
    raise ValueError(
        f"No report content found in {isw_report_html_dir}/{isw_report_html_file_name}; "
        "the cached page may be an error page - delete it and download the report again"
    )

content_html = str(report_div)

# Clean the report HTML, then run the text preprocessing in "lemm" mode.
content_html = isw_preprocessing.preprocess_page_html(content_html)
content_text_lemm = text_preprocessing.text_preprocess(content_html, "lemm")

In [15]:
# Wrap the single preprocessed report in a list and convert it with the loaded
# count vectorizer; the result is displayed below.
report_documents = [content_text_lemm]
word_count_vector = cv.transform(report_documents)
word_count_vector

<1x6524 sparse matrix of type '<class 'numpy.int64'>'
	with 887 stored elements in Compressed Sparse Row format>

In [16]:
# Pass the report's count vector through the loaded TF-IDF transformer; the
# resulting vector is repeated for every hourly row in a later cell.
tfidf_vector = tfidf.transform(word_count_vector)
tfidf_vector

<1x6524 sparse matrix of type '<class 'numpy.float64'>'
	with 887 stored elements in Compressed Sparse Row format>

## 4. Data preparation

In [17]:
# Weather columns that are removed from the feature set.
weather_exclude = [
    "day_feelslikemax",
    "day_feelslikemin",
    "day_sunriseEpoch",
    "day_sunsetEpoch",
    "day_description",
    "city_latitude",
    "city_longitude",
    "city_address",
    "city_timezone",
    "city_tzoffset",
    "day_feelslike",
    "day_precipprob",
    "day_snow",
    "day_snowdepth",
    "day_windgust",
    "day_windspeed",
    "day_winddir",
    "day_pressure",
    "day_cloudcover",
    "day_visibility",
    "day_severerisk",
    "day_conditions",
    "day_icon",
    "day_source",
    "day_preciptype",
    "day_stations",
    "hour_icon",
    "hour_source",
    "hour_stations",
    "hour_feelslike",
]

# Address, timestamp and region/city name columns that are removed as well.
fields_to_exclude = [
    "city_resolvedAddress",
    "day_datetime",
    "day_datetimeEpoch",
    "hour_datetime",
    "hour_datetimeEpoch",
    "city",
    "region",
    "center_city_ua",
    "center_city_en",
]

# Columns dropped in a later cell, after `hour_conditions` has been label-encoded.
tmp_fields_to_exclude = [
    "day_sunrise",
    "day_sunset",
    "hour_preciptype",
    "hour_conditions",
    "hour_solarenergy",
    "region_alt",
]

# Remove both groups of columns from the merged weather/region frame.
df_work_v2 = (
    weather_df_v2
    .drop(columns=weather_exclude)
    .drop(columns=fields_to_exclude)
)

In [18]:
def _first_condition(conditions):
    """Return the part of a comma-separated conditions string before the first comma."""
    return conditions.split(",")[0]


# Reduce each hourly conditions string to its first entry, then map it to the
# integer id produced by the loaded label encoder.
df_work_v2["hour_conditions"] = df_work_v2["hour_conditions"].apply(_first_condition)
df_work_v2["hour_conditions_id"] = label_encoder.transform(df_work_v2["hour_conditions"])

In [19]:
# Drop the remaining helper columns (including the raw `hour_conditions` text,
# now represented by `hour_conditions_id`) and preview the first ten rows.
df_work_v3 = df_work_v2.drop(columns=tmp_fields_to_exclude)
df_work_v3.head(n=10)

Unnamed: 0,hour_severerisk,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,day_solarradiation,day_solarenergy,...,hour_windgust,hour_windspeed,hour_winddir,hour_pressure,hour_visibility,hour_cloudcover,hour_solarradiation,hour_uvindex,region_id,hour_conditions_id
0,10,16.8,7.5,12.2,0.9,47.1,0.0,0.0,-10,-10,...,9.0,4.0,354.3,1015.0,-10,91.6,-10,-10,13,3
1,10,16.8,7.5,12.2,0.9,47.1,0.0,0.0,-10,-10,...,8.6,4.0,357.6,1015.0,-10,68.0,-10,-10,13,4
2,10,16.8,7.5,12.2,0.9,47.1,0.0,0.0,-10,-10,...,7.9,3.2,1.5,1015.0,-10,64.0,-10,-10,13,4
3,10,16.8,7.5,12.2,0.9,47.1,0.0,0.0,-10,-10,...,4.0,2.2,357.9,1015.0,-10,34.7,-10,-10,13,4
4,10,16.8,7.5,12.2,0.9,47.1,0.0,0.0,-10,-10,...,4.3,2.2,357.4,1015.0,-10,0.0,-10,-10,13,0
5,10,16.8,7.5,12.2,0.9,47.1,0.0,0.0,-10,-10,...,3.6,2.5,357.4,1015.0,-10,3.5,-10,-10,13,0
6,10,16.8,7.5,12.2,0.9,47.1,0.0,0.0,-10,-10,...,3.6,2.5,358.9,1015.0,-10,33.7,-10,-10,13,4
7,10,16.8,7.5,12.2,0.9,47.1,0.0,0.0,-10,-10,...,3.6,1.8,308.2,1015.0,-10,59.6,-10,-10,13,4
8,10,16.8,7.5,12.2,0.9,47.1,0.0,0.0,-10,-10,...,5.0,2.9,284.0,1016.0,-10,55.3,-10,-10,13,4
9,10,16.8,7.5,12.2,0.9,47.1,0.0,0.0,-10,-10,...,7.6,2.5,273.7,1016.0,-10,48.6,-10,-10,13,4


In [20]:
# The same report vector applies to every hourly weather row, so repeat it once
# per row of the weather feature frame. A single vstack call avoids re-copying a
# growing matrix on every iteration, and taking the count from df_work_v3 keeps
# both feature blocks the same height for the hstack in a later cell.
tfidf_matrix = sp.vstack([tfidf_vector] * df_work_v3.shape[0], format="csr")

tfidf_matrix

<24x6524 sparse matrix of type '<class 'numpy.float64'>'
	with 21288 stored elements in Compressed Sparse Row format>

In [21]:
# Convert the weather feature frame to a CSR matrix so it can be stacked with
# the sparse TF-IDF block.
weather_feature_values = df_work_v3.values
df_work_v4_csr = sp.csr_matrix(weather_feature_values)
df_work_v4_csr

<24x29 sparse matrix of type '<class 'numpy.float64'>'
	with 547 stored elements in Compressed Sparse Row format>

In [22]:
# Final feature matrix: weather columns first, TF-IDF columns after them.
# NOTE(review): presumably the same column layout the models were trained on — confirm.
feature_blocks = [df_work_v4_csr, tfidf_matrix]
df_all_features = sp.hstack(feature_blocks, format="csr")

In [23]:
# Predict one label per row of the feature matrix with the loaded MLP model.
# The next cell reports 0 as "false" and any other value as "true".
hours_alarm_schedule = MLP.predict(df_all_features)
hours_alarm_schedule

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0])

In [24]:
# One {"<hour>:00": "true" | "false"} entry per prediction, in order. Iterating
# over the predictions themselves (instead of a fixed range of 24) keeps this
# cell valid when the forecast has a different number of hourly rows.
schedule = [
    {f"{hour}:00": "false" if prediction == 0 else "true"}
    for hour, prediction in enumerate(hours_alarm_schedule)
]

print(f"Alarms in {location} on {date}:")
schedule

Alarms in Lviv,Ukraine on 2023-04-22:


[{'0:00': 'false'},
 {'1:00': 'false'},
 {'2:00': 'false'},
 {'3:00': 'false'},
 {'4:00': 'false'},
 {'5:00': 'false'},
 {'6:00': 'false'},
 {'7:00': 'false'},
 {'8:00': 'false'},
 {'9:00': 'false'},
 {'10:00': 'false'},
 {'11:00': 'false'},
 {'12:00': 'false'},
 {'13:00': 'false'},
 {'14:00': 'false'},
 {'15:00': 'false'},
 {'16:00': 'false'},
 {'17:00': 'false'},
 {'18:00': 'false'},
 {'19:00': 'false'},
 {'20:00': 'false'},
 {'21:00': 'false'},
 {'22:00': 'false'},
 {'23:00': 'false'}]