Lab2

In [117]:
import pandas as pd
import urllib.request
import datetime
import os
import shutil
from IPython.display import display

Initializing dictionary with old and new province indexes

In [114]:
province_index_dict = {
    1: (22, 'Черкаська'), 2: (24, 'Чернігівська'), 3: (23, 'Чернівецька'), 4: (25, 'Крим'), 5: (3, 'Дніпропетровська'), 
    6: (4, 'Донецька'), 7: (8, 'Івано-Франківська'), 8: (19 , 'Харківська'), 9: (20, 'Херсонська'), 10: (21, 'Хмельницька'),
    11: (9, 'Київська'), 13: (10, 'Кіровоградська'), 14: (11, 'Луганська'), 15: (12, 'Львівська'), 16: (13, 'Миколаївська'), 
    17: (14, 'Одеська'), 18: (15, 'Полтавська'), 19: (16, 'Рівненська'), 21: (17, 'Сумська'), 22: (18, 'Тернопільська'),
    23: (6, 'Закарпатська'), 24: (1, 'Вінницька'), 25: (2, 'Волинська'), 26: (7, 'Запорізька'), 27: (5, 'Житомирська')
    }

In [118]:
def get_url(i: int) -> str:
    return f'https://www.star.nesdis.noaa.gov/smcd/emb/vci/VH/get_TS_admin.php?country=UKR&provinceID={i}&year1=1981&year2=2024&type=Mean'

def file_create(i: int, data: bytes, csv_path: str) -> None:
    date_time = datetime.datetime.now().strftime("%d%m%Y%H%M%S")
    out = open(f'{csv_path}/vhi_id_{i}_{date_time}.csv', "wb")
    out.write(data)


In [119]:
headers = ['Year', 'Week', 'SMN', 'SMT', 'VCI', 'TCI', 'VHI', 'empty']
dfs = []

In [120]:
def dataframe_create(path: str) -> None:    
    for file in (os.listdir(path)):
        i = file.split(sep='_')[2]
        df = pd.read_csv(f'{path}/{file}', header = 1, names = headers)
        normalize_dataframe(df, i)


def normalize_dataframe(df: pd.DataFrame, i: int) -> None:
    # dropping NaN values
    df = df.dropna(subset=['VHI']).drop(columns=['empty'])

    # normalizing data
    df["Year"] = df["Year"].str.extract(r"(\d+)").astype(int)
    df['Week'] = df['Week'].astype(int)

    #Adding ids for all provinces
    df.insert(0, 'ProvinceID', [int(i)] * df.shape[0])
    
    dfs.append(df)


In [121]:
def add_province_names(general_df: pd.DataFrame) -> None:
    for key, value in province_index_dict.items():
        general_df.loc[general_df['ProvinceID'] == key, 'ProvinceName'] = value[1]
    province_column = general_df.pop('ProvinceName')
    general_df.insert(1, 'ProvinceName', province_column)

In [122]:
def change_ProvinceID(general_df: pd.DataFrame) -> None:
    replace_dict = {key: value[0] for key, value in province_index_dict.items()}
    general_df['ProvinceID'] = general_df['ProvinceID'].replace(replace_dict)


In [124]:
if os.path.isdir(csv_path:="csv_files"):
    shutil.rmtree(csv_path)

os.mkdir(csv_path)

#Creating files
for i in range(1, 28):
    if i == 12 or i == 20:
        continue
    else:
        url = get_url(i)
        data = urllib.request.urlopen(url).read()
        file_create(i, data, csv_path)
print("Files with data created\n")

#Creating separate dataframes for each province
dataframe_create(csv_path)

#Concatenating all dataframes into one general dataframe 
general_df = pd.concat(dfs, ignore_index=True)

#Adding provinces' names
add_province_names(general_df)

#Changing provinces' IDs
change_ProvinceID(general_df)

print('General normalized dataframe:')
display(general_df)



Files with data created

General normalized dataframe:


Unnamed: 0,ProvinceID,ProvinceName,Year,Week,SMN,SMT,VCI,TCI,VHI
0,21,Хмельницька,1982,1,0.059,258.24,51.11,48.78,49.95
1,21,Хмельницька,1982,2,0.063,261.53,55.89,38.20,47.04
2,21,Хмельницька,1982,3,0.063,263.45,57.30,32.69,44.99
3,21,Хмельницька,1982,4,0.061,265.10,53.96,28.62,41.29
4,21,Хмельницька,1982,5,0.058,266.42,46.87,28.57,37.72
...,...,...,...,...,...,...,...,...,...
111795,20,Херсонська,2024,48,0.135,278.17,55.23,11.86,33.55
111796,20,Херсонська,2024,49,0.133,277.08,57.71,10.86,34.29
111797,20,Херсонська,2024,50,0.130,276.49,59.45,8.68,34.07
111798,20,Херсонська,2024,51,0.128,276.45,62.53,5.55,34.04
