In [1]:
import datetime
import numpy as np
import pandas as pd
import requests
import scipy
import scipy.sparse as sp
import json

import os
import pickle
import json

from bs4 import BeautifulSoup

from utils import weatherapi
from utils import isw_preprocessing
from utils import text_preprocessing

from sklearn import preprocessing


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dusty3ntity/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dusty3ntity/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dusty3ntity/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
MODELS_FOLDER = "data/models"
OUTPUT_FOLDER = "data/prediction"

if not os.path.exists(OUTPUT_FOLDER):
   os.makedirs(OUTPUT_FOLDER)

ISW_DATA_FILE = "data/isw/reports_processed_v2.csv"
WEATHER_EVENTS_DATA_FILE = "data/alarms/merged_events.csv"

tfidf_transformer_model = "tfidf_transformer"
tfidf_transformer_version = "v1"

count_vectorizer_model = "count_vectorizer"
count_vectorizer_version = "v1"

decision_tree_model = "DTC"
logistic_regression_model = "LR"
multi_layer_perceptron_model = "MLP"
random_forest_classifier_model = "RFC"
sgd_classifier_model = "SGD"

## 1. Load models & data

In [21]:
tfidf = pickle.load(open(f"data/isw/{tfidf_transformer_model}_{tfidf_transformer_version}.pkl", "rb"))
cv = pickle.load(open(f"data/isw/{count_vectorizer_model}_{count_vectorizer_version}.pkl", "rb"))

label_encoder = pickle.load(open(f"data/isw/hour_conditions_label_encoder.pkl", "rb"))

DTC = pickle.load(open(f"{MODELS_FOLDER}/{decision_tree_model}.pkl", "rb"))
LR = pickle.load(open(f"{MODELS_FOLDER}/{logistic_regression_model}.pkl", "rb"))
MLP = pickle.load(open(f"{MODELS_FOLDER}/{multi_layer_perceptron_model}.pkl", "rb"))
RFC = pickle.load(open(f"{MODELS_FOLDER}/{random_forest_classifier_model}.pkl", "rb"))
SGD = pickle.load(open(f"{MODELS_FOLDER}/{sgd_classifier_model}.pkl", "rb"))

In [5]:
REGIONS_DICTIONARY_FILE = "data/data_before_lab_3/regions.csv"
df_regions = pd.read_csv(REGIONS_DICTIONARY_FILE)
df_regions.head(25)

Unnamed: 0,region,center_city_ua,center_city_en,region_alt,region_id
0,АР Крим,Сімферополь,Simferopol,Крим,1
1,Вінницька,Вінниця,Vinnytsia,Вінниччина,2
2,Волинська,Луцьк,Lutsk,Волинь,3
3,Дніпропетровська,Дніпро,Dnipro,Дніпропетровщина,4
4,Донецька,Донецьк,Donetsk,Донеччина,5
5,Житомирська,Житомир,Zhytomyr,Житомирщина,6
6,Закарпатська,Ужгород,Uzhgorod,Закарпаття,7
7,Запорізька,Запоріжжя,Zaporozhye,Запоріжжя,8
8,Івано-Франківська,Івано-Франківськ,Ivano-Frankivsk,Івано-Франківщина,9
9,Київська,Київ,Kyiv,Київщина,10


## 2. Setup prediction region

In [6]:
region = "Львівщина"
date = "2023-04-08"

city = df_regions[df_regions["region_alt"]==region]["center_city_en"].values[0]

## 3. Get weather

In [7]:
level = "hours"
location = f"{city},Ukraine"

file_name = f"weather___{city.lower()}__{date}.json"

In [8]:
if not os.path.isfile(f"{OUTPUT_FOLDER}/{file_name}"):
	city_weather_json = weatherapi.get_weather_json(location, date)
	json_object = json.dumps(city_weather_json, indent=4)

	with open(f"{OUTPUT_FOLDER}/{file_name}", "w") as outfile:
		outfile.write(json_object)
else:
	print(f"Weather data from the \nregion {region}; \ndate {date}; \n is ready")

Weather data from the 
region Львівщина; 
date 2023-04-08; 
 is ready


In [50]:
# weather_file_path = f"{OUTPUT_FOLDER}/{file_name}"
# weather_for_day_hourly = weather.get_weather_hourly_for_region(weather_file_path)
# weather_df = pd.DataFrame.from_dict(weather_for_day_hourly)

# Getting random weather from the dataset as the API is not ready yet
all_weather_by_hour = pd.read_csv("data/data_before_lab_3/all_weather_by_hour.csv")
weather_df = all_weather_by_hour.loc[(all_weather_by_hour['day_datetime'] == "2022-04-08") & (all_weather_by_hour['city_address'] == location)]

weather_df["day_datetime"] = pd.to_datetime(weather_df["day_datetime"])
weather_df["city"] = weather_df["city_resolvedAddress"].apply(lambda x: x.split(",")[0])
weather_df["city"] = weather_df["city"].replace("Хмельницька область", "Хмельницький")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_df["day_datetime"] = pd.to_datetime(weather_df["day_datetime"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_df["city"] = weather_df["city_resolvedAddress"].apply(lambda x: x.split(",")[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_df["city"] = weather_df["city"].repl

In [51]:
print(weather_df.shape)
weather_df.head(5)

(24, 68)


Unnamed: 0,city_latitude,city_longitude,city_resolvedAddress,city_address,city_timezone,city_tzoffset,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_feelslikemax,day_feelslikemin,day_feelslike,day_dew,day_humidity,day_precip,day_precipprob,day_precipcover,day_snow,day_snowdepth,day_windgust,day_windspeed,day_winddir,day_pressure,day_cloudcover,day_visibility,day_solarradiation,day_solarenergy,day_uvindex,day_severerisk,day_sunrise,day_sunriseEpoch,day_sunset,day_sunsetEpoch,day_moonphase,day_conditions,day_description,day_icon,day_source,day_preciptype,day_stations,hour_datetime,hour_datetimeEpoch,hour_temp,hour_feelslike,hour_humidity,hour_dew,hour_precip,hour_precipprob,hour_snow,hour_snowdepth,hour_preciptype,hour_windgust,hour_windspeed,hour_winddir,hour_pressure,hour_visibility,hour_cloudcover,hour_solarradiation,hour_solarenergy,hour_uvindex,hour_severerisk,hour_conditions,hour_icon,hour_source,hour_stations,city
151967,49.8444,24.0254,"Львів, Україна","Lviv,Ukraine",Europe/Kiev,2.0,2022-04-08,1649365200,13.7,6.4,10.2,13.7,1.4,9.2,1.6,58.0,1.6,100.0,8.33,0.0,0.0,59.4,36.0,252.9,997.5,73.5,18.0,222.9,19.4,7.0,10.0,06:46:11,1649389571,20:06:24,1649437584,0.22,"Rain, Partially cloudy",Partly cloudy throughout the day with rain.,rain,obs,rain,33393099999;remote,00:00:00,1649365200,9.6,7.4,67.6,3.9,0.0,0.0,0.0,0.0,,27.4,14.4,200.0,995.6,10.0,80.0,0.0,,0.0,10.0,Partially cloudy,partly-cloudy-night,obs,33393099999,Львів
151968,49.8444,24.0254,"Львів, Україна","Lviv,Ukraine",Europe/Kiev,2.0,2022-04-08,1649365200,13.7,6.4,10.2,13.7,1.4,9.2,1.6,58.0,1.6,100.0,8.33,0.0,0.0,59.4,36.0,252.9,997.5,73.5,18.0,222.9,19.4,7.0,10.0,06:46:11,1649389571,20:06:24,1649437584,0.22,"Rain, Partially cloudy",Partly cloudy throughout the day with rain.,rain,obs,rain,33393099999;remote,01:00:00,1649368800,8.1,6.0,75.34,4.0,0.0,0.0,0.0,0.0,,27.4,11.9,205.6,996.0,24.1,20.4,0.0,,0.0,10.0,Partially cloudy,partly-cloudy-night,obs,remote,Львів
151969,49.8444,24.0254,"Львів, Україна","Lviv,Ukraine",Europe/Kiev,2.0,2022-04-08,1649365200,13.7,6.4,10.2,13.7,1.4,9.2,1.6,58.0,1.6,100.0,8.33,0.0,0.0,59.4,36.0,252.9,997.5,73.5,18.0,222.9,19.4,7.0,10.0,06:46:11,1649389571,20:06:24,1649437584,0.22,"Rain, Partially cloudy",Partly cloudy throughout the day with rain.,rain,obs,rain,33393099999;remote,02:00:00,1649372400,7.5,5.3,77.93,3.9,0.0,0.0,0.0,0.0,,26.6,11.9,210.6,996.0,24.1,100.0,0.0,,0.0,10.0,Overcast,cloudy,obs,remote,Львів
151970,49.8444,24.0254,"Львів, Україна","Lviv,Ukraine",Europe/Kiev,2.0,2022-04-08,1649365200,13.7,6.4,10.2,13.7,1.4,9.2,1.6,58.0,1.6,100.0,8.33,0.0,0.0,59.4,36.0,252.9,997.5,73.5,18.0,222.9,19.4,7.0,10.0,06:46:11,1649389571,20:06:24,1649437584,0.22,"Rain, Partially cloudy",Partly cloudy throughout the day with rain.,rain,obs,rain,33393099999;remote,03:00:00,1649376000,8.9,7.1,71.87,4.1,0.0,0.0,0.0,0.0,,32.0,10.8,190.0,994.6,10.0,60.0,0.0,,0.0,10.0,Partially cloudy,partly-cloudy-night,obs,33393099999,Львів
151971,49.8444,24.0254,"Львів, Україна","Lviv,Ukraine",Europe/Kiev,2.0,2022-04-08,1649365200,13.7,6.4,10.2,13.7,1.4,9.2,1.6,58.0,1.6,100.0,8.33,0.0,0.0,59.4,36.0,252.9,997.5,73.5,18.0,222.9,19.4,7.0,10.0,06:46:11,1649389571,20:06:24,1649437584,0.22,"Rain, Partially cloudy",Partly cloudy throughout the day with rain.,rain,obs,rain,33393099999;remote,04:00:00,1649379600,8.5,5.7,74.88,4.3,0.0,0.0,0.0,0.0,,38.2,17.3,210.0,994.0,24.1,100.0,0.0,,0.0,10.0,Overcast,cloudy,obs,remote,Львів


In [52]:
weather_df_v2 = pd.merge(weather_df, df_regions, left_on="city", right_on="center_city_ua")

## 3. Get ISW report

In [53]:
date = "2023-04-08"

BASE_URL = "https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment"
months = ["january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"]
date_arr = date.split("-")

url = f"{BASE_URL}-{months[int(date_arr[1])-1]}-{int(date_arr[2])}-{int(date_arr[0])}"
print(url)

page = requests.get(url)
isw_report_html_file = f"{OUTPUT_FOLDER}/isw___{date}.html"
with open(isw_report_html_file, "wb+") as f:
  f.write(page.content)

https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-april-8-2023


In [54]:
with open(isw_report_html_file, "r", encoding="utf8") as cfile:
		html = BeautifulSoup(cfile.read(), features="html.parser")
		content_html = str(html.body.find("div", attrs={"class": "field-type-text-with-summary"}))

		content_html = isw_preprocessing.preprocess_page_html(content_html)
		content_text_lemm = text_preprocessing.text_preprocess(content_html, "lemm")

In [55]:
word_count_vector = cv.transform([content_text_lemm])

In [56]:
word_count_vector

<1x6524 sparse matrix of type '<class 'numpy.int64'>'
	with 945 stored elements in Compressed Sparse Row format>

In [57]:
tfidf_vector = tfidf.transform(word_count_vector)

In [58]:
tfidf_vector

<1x6524 sparse matrix of type '<class 'numpy.float64'>'
	with 945 stored elements in Compressed Sparse Row format>

## 4. Data preparation

In [59]:
weather_exclude = [
"day_feelslikemax",
"day_feelslikemin",
"day_sunriseEpoch",
"day_sunsetEpoch",
"day_description",
"city_latitude",
"city_longitude",
"city_address",
"city_timezone",
"city_tzoffset",
"day_feelslike",
"day_precipprob",
"day_snow",
"day_snowdepth",
"day_windgust",
"day_windspeed",
"day_winddir",
"day_pressure",
"day_cloudcover",
"day_visibility",
"day_severerisk",
"day_conditions",
"day_icon",
"day_source",
"day_preciptype",
"day_stations",
"hour_icon",
"hour_source",
"hour_stations",
"hour_feelslike"
]

fields_to_exclude = [
	"city_resolvedAddress",
	"day_datetime",
	"day_datetimeEpoch",
	"hour_datetime",
	"hour_datetimeEpoch",
	"city",
	"region",
	"center_city_ua",
	"center_city_en"
]

tmp_fields_to_exclude = [
	"day_sunrise",
	"day_sunset",
	"hour_preciptype",
	"hour_conditions",
	"hour_solarenergy",
	"region_alt"
]

df_work_v2 = weather_df_v2.drop(weather_exclude, axis=1)
df_work_v2 = df_work_v2.drop(fields_to_exclude, axis=1)

In [60]:
df_work_v2["hour_conditions"] = df_work_v2["hour_conditions"].apply(lambda x: x.split(",")[0])
df_work_v2["hour_conditions_id"] = label_encoder.transform(df_work_v2["hour_conditions"])

In [61]:
df_work_v3 = df_work_v2.drop(tmp_fields_to_exclude, axis=1)

In [62]:
df_work_v3.head(10)

Unnamed: 0,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,day_solarradiation,day_solarenergy,day_uvindex,day_moonphase,hour_temp,hour_humidity,hour_dew,hour_precip,hour_precipprob,hour_snow,hour_snowdepth,hour_windgust,hour_windspeed,hour_winddir,hour_pressure,hour_visibility,hour_cloudcover,hour_solarradiation,hour_uvindex,hour_severerisk,region_id,hour_conditions_id
0,13.7,6.4,10.2,1.6,58.0,1.6,8.33,222.9,19.4,7.0,0.22,9.6,67.6,3.9,0.0,0.0,0.0,0.0,27.4,14.4,200.0,995.6,10.0,80.0,0.0,0.0,10.0,13,4
1,13.7,6.4,10.2,1.6,58.0,1.6,8.33,222.9,19.4,7.0,0.22,8.1,75.34,4.0,0.0,0.0,0.0,0.0,27.4,11.9,205.6,996.0,24.1,20.4,0.0,0.0,10.0,13,4
2,13.7,6.4,10.2,1.6,58.0,1.6,8.33,222.9,19.4,7.0,0.22,7.5,77.93,3.9,0.0,0.0,0.0,0.0,26.6,11.9,210.6,996.0,24.1,100.0,0.0,0.0,10.0,13,3
3,13.7,6.4,10.2,1.6,58.0,1.6,8.33,222.9,19.4,7.0,0.22,8.9,71.87,4.1,0.0,0.0,0.0,0.0,32.0,10.8,190.0,994.6,10.0,60.0,0.0,0.0,10.0,13,4
4,13.7,6.4,10.2,1.6,58.0,1.6,8.33,222.9,19.4,7.0,0.22,8.5,74.88,4.3,0.0,0.0,0.0,0.0,38.2,17.3,210.0,994.0,24.1,100.0,0.0,0.0,10.0,13,3
5,13.7,6.4,10.2,1.6,58.0,1.6,8.33,222.9,19.4,7.0,0.22,8.6,74.9,4.4,0.0,0.0,0.0,0.0,38.5,14.4,215.8,994.0,10.1,100.0,0.0,0.0,10.0,13,3
6,13.7,6.4,10.2,1.6,58.0,1.6,8.33,222.9,19.4,7.0,0.22,10.1,69.14,4.7,0.0,0.0,0.0,0.0,32.0,18.0,220.0,993.8,10.0,80.0,0.0,0.0,10.0,13,4
7,13.7,6.4,10.2,1.6,58.0,1.6,8.33,222.9,19.4,7.0,0.22,8.4,81.97,5.5,0.0,0.0,0.0,0.0,37.4,16.9,223.1,995.0,8.5,100.0,0.0,0.0,10.0,13,3
8,13.7,6.4,10.2,1.6,58.0,1.6,8.33,222.9,19.4,7.0,0.22,9.0,77.62,5.3,0.0,0.0,0.0,0.0,43.2,18.4,223.2,995.0,24.1,100.0,31.0,0.0,10.0,13,3
9,13.7,6.4,10.2,1.6,58.0,1.6,8.33,222.9,19.4,7.0,0.22,6.4,90.13,4.9,1.0,100.0,0.0,0.0,43.2,36.0,300.0,997.8,5.0,100.0,112.0,1.0,10.0,13,5


In [64]:
tfidf_matrix = tfidf_vector

for i in range(0, 23):
	tfidf_matrix = sp.vstack((tfidf_matrix, tfidf_vector), format="csr")

In [65]:
tfidf_vector

<1x6524 sparse matrix of type '<class 'numpy.float64'>'
	with 945 stored elements in Compressed Sparse Row format>

In [66]:
tfidf_matrix

<24x6524 sparse matrix of type '<class 'numpy.float64'>'
	with 22680 stored elements in Compressed Sparse Row format>

In [67]:
df_work_v4_csr = sp.csr_matrix(df_work_v3.values)
df_work_v4_csr

<24x29 sparse matrix of type '<class 'numpy.float64'>'
	with 580 stored elements in Compressed Sparse Row format>

In [68]:
df_all_features = sp.hstack((df_work_v4_csr, tfidf_matrix), format="csr")

In [69]:
hours_alarm_schedule = DTC.predict(df_all_features)

In [70]:
hours_alarm_schedule

array([0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 0])