In [1]:
import pandas as pd
import os

In [109]:
!pip install --upgrade certifi

You should consider upgrading via the '/Users/krzysztofpraca/Projects/hackathon-2023/venv/bin/python3 -m pip install --upgrade pip' command.[0m


In [3]:
import numpy as np
import pandas as pd
from ast import literal_eval

In [4]:
def parse_df(df, col):
    df[col] = df[col].apply(lambda x: literal_eval(x))
    
    # Convert the date_range column to string (if it's not already a string)
    df['date_range'] = df['date_range'].astype(str)

    # Extract the start date from the date_range column
    df['start_date'] = df['date_range'].str.extract(r"\'(.*?)\'", expand=False)

    # Extract the year from the start date
    df['year'] = df['start_date'].str.split('-', expand=True)[0]
    
    median_maximum = []
    median_mean = []
    median_minimum = []
    median_stddev = []
    median_valid_percent = []
    href = []

    for idx, row in df.iterrows():
        medians = calculate_median_values(parse_data_to_dict_list(row[col]))
        
        median_maximum.append(medians['median_maximum'])
        median_mean.append(medians['median_mean'])
        median_minimum.append(medians['median_minimum'])
        median_stddev.append(medians['median_stddev'])
        href.append(medians['href'])
        median_valid_percent.append(medians['median_valid_percent'])
        
    data_df = {
        f'median_maximum_{col}': median_maximum,
        f'median_mean_{col}': median_mean,
        f'median_minimum_{col}': median_minimum,
        f'median_stddev_{col}': median_stddev,
        f'median_valid_percent_{col}': median_valid_percent,
        f'href_{col}': href
    }
    
    medians_df = pd.DataFrame(data_df)
            
    return df.merge(medians_df, left_index=True, right_index=True)
    

In [5]:
def parse_data_to_dict_list(data):
    # Create an empty list to store the data dictionaries
    data_dicts = []
    
    # Loop through each item in the data
    for item in data:
        # Create an empty dictionary to store the data for this item
        data_dict = {}
        # Extract and store the data in the dictionary
        data_dict['href'] = item.get('href', None)
        # Accessing nested dictionary values for raster:bands and statistics
        band_stats = item.get('raster:bands', [{}])[0].get('statistics', {})
        # Create a stats dictionary
        stats_dict = {
            'maximum': band_stats.get('maximum', None),
            'mean': band_stats.get('mean', None),
            'minimum': band_stats.get('minimum', None),
            'stddev': band_stats.get('stddev', None),
            'valid_percent': band_stats.get('valid_percent', None)
        }
        # Store the stats dictionary in the data dictionary
        data_dict['stats'] = stats_dict
        
        # Append the data dictionary to the data_dicts list
        data_dicts.append(data_dict)
    
    return data_dicts

In [6]:
import numpy as np

def calculate_median_values(parsed_data):
    # Function to find the index of the median value
    def find_median_index(values):
        sorted_indices = np.argsort(values)
        median_index = sorted_indices[len(sorted_indices) // 2]
        return median_index

    # Lists to store the statistical values
    max_values = []
    mean_values = []
    min_values = []
    stddev_values = []
    valid_percent_values = []

    # Iterate through the parsed data and extract the statistical values
    for data_dict in parsed_data:
        stats_dict = data_dict.get('stats', {})
        max_values.append(stats_dict.get('maximum', np.nan))
        mean_values.append(stats_dict.get('mean', np.nan))
        min_values.append(stats_dict.get('minimum', np.nan))
        stddev_values.append(stats_dict.get('stddev', np.nan))
        valid_percent_values.append(stats_dict.get('valid_percent', np.nan))

    # Find the median indices
    median_indices = {
        'median_maximum_index': find_median_index(max_values),
        'median_mean_index': find_median_index(mean_values),
        'median_minimum_index': find_median_index(min_values),
        'median_stddev_index': find_median_index(stddev_values),
        'median_valid_percent_index': find_median_index(valid_percent_values)
    }
    
    # Extract the hrefs using the median indices
    median_hrefs = {
        'median_mean_href': parsed_data[median_indices['median_mean_index']].get('href', None)
    }
    
    medians = {
        'median_maximum': np.nanmedian(max_values),
        'median_mean': np.nanmedian(mean_values),
        'median_minimum': np.nanmedian(min_values),
        'median_stddev': np.nanmedian(stddev_values),
        'median_valid_percent': np.nanmedian(valid_percent_values),
        'href': median_hrefs['median_mean_href']
    }

    return medians


In [7]:
data_path = './ETHIOPIA/'
files = [x for x in os.listdir(data_path) if x.endswith('.csv')]

In [8]:
df_ethiopia_raw = pd.concat([
    pd.read_csv(os.path.join(data_path, x)) for x in files
],
         ignore_index=True)

In [9]:
df_ethiopia_ndvi = parse_df(df_ethiopia_raw.copy(), 'job_res_ndvi').drop(columns=['city']).rename(columns={'country': 'city'})
df_ethiopia_ndwi = parse_df(df_ethiopia_raw.copy(), 'job_res_ndwi').drop(columns=['city']).rename(columns={'country': 'city'})

In [10]:
df_ethiopia_ndvi

Unnamed: 0,city,date_range,job_res_ndvi,job_res_ndwi,start_date,year,median_maximum_job_res_ndvi,median_mean_job_res_ndvi,median_minimum_job_res_ndvi,median_stddev_job_res_ndvi,median_valid_percent_job_res_ndvi,href_job_res_ndvi
0,Bahir Dar,"('2023-05-01', '2023-05-31')","[{'file:nodata': ['nan'], 'href': 'https://ope...","[{'file:nodata': ['nan'], 'href': 'https://ope...",2023-05-01,2023,0.784514,0.055448,-0.530016,0.254600,100.000,https://openeo.dataspace.copernicus.eu/openeo/...
1,Shashemenē,"('2023-05-01', '2023-05-31')","[{'file:nodata': ['nan'], 'href': 'https://ope...","[{'file:nodata': ['nan'], 'href': 'https://ope...",2023-05-01,2023,1.068360,0.390046,-0.252107,0.194476,100.000,https://openeo.dataspace.copernicus.eu/openeo/...
2,Desē,"('2023-05-01', '2023-05-31')","[{'file:nodata': ['nan'], 'href': 'https://ope...","[{'file:nodata': ['nan'], 'href': 'https://ope...",2023-05-01,2023,0.863018,0.410948,-0.419755,0.203515,100.000,https://openeo.dataspace.copernicus.eu/openeo/...
3,Sodo,"('2023-05-01', '2023-05-31')","[{'file:nodata': ['nan'], 'href': 'https://ope...","[{'file:nodata': ['nan'], 'href': 'https://ope...",2023-05-01,2023,0.848921,0.327761,-0.198482,0.236665,100.000,https://openeo.dataspace.copernicus.eu/openeo/...
4,Bahir Dar,"('2022-05-01', '2022-05-31')","[{'file:nodata': ['nan'], 'href': 'https://ope...","[{'file:nodata': ['nan'], 'href': 'https://ope...",2022-05-01,2022,0.745772,0.060608,-0.441888,0.259088,100.000,https://openeo.dataspace.copernicus.eu/openeo/...
...,...,...,...,...,...,...,...,...,...,...,...,...
103,Ārba Minch’,"('2018-05-01', '2018-05-31')","[{'file:nodata': ['nan'], 'href': 'https://ope...","[{'file:nodata': ['nan'], 'href': 'https://ope...",2018-05-01,2018,0.832880,0.421314,-0.414341,0.340752,85.980,https://openeo.dataspace.copernicus.eu/openeo/...
104,Jīma,"('2018-05-01', '2018-05-31')","[{'file:nodata': ['nan'], 'href': 'https://ope...","[{'file:nodata': ['nan'], 'href': 'https://ope...",2018-05-01,2018,0.799016,0.473989,-0.091084,0.153206,100.000,https://openeo.dataspace.copernicus.eu/openeo/...
105,Hosa’ina,"('2018-05-01', '2018-05-31')","[{'file:nodata': ['nan'], 'href': 'https://ope...","[{'file:nodata': ['nan'], 'href': 'https://ope...",2018-05-01,2018,0.817416,0.415148,-0.309349,0.171986,82.515,https://openeo.dataspace.copernicus.eu/openeo/...
106,Harar,"('2018-05-01', '2018-05-31')","[{'file:nodata': ['nan'], 'href': 'https://ope...","[{'file:nodata': ['nan'], 'href': 'https://ope...",2018-05-01,2018,0.789636,0.403248,-0.216981,0.132127,100.000,https://openeo.dataspace.copernicus.eu/openeo/...


In [11]:
df_temp = pd.read_csv('temperature.csv')

In [12]:
df_temp

Unnamed: 0,city,date_start,date_end,mean_temp,std_tmp,min_temp,max_tmp
0,Addis Ababa,2023-05-01,2023-05-31,15.890095,3.445394,8.795000,22.395000
1,Addis Ababa,2022-05-01,2022-05-31,18.702930,5.025024,8.945000,27.845000
2,Addis Ababa,2021-05-01,2021-05-31,16.315498,4.184810,8.645000,25.945000
3,Addis Ababa,2020-05-01,2020-05-31,16.553938,3.662434,8.645000,24.195000
4,Addis Ababa,2019-05-01,2019-05-31,17.719732,4.375379,9.795000,27.045000
...,...,...,...,...,...,...,...
175,Debre Birhan,2019-05-01,2019-05-31,15.902252,3.502083,10.191500,23.141500
176,Debre Birhan,2018-05-01,2018-05-31,15.404535,3.309012,9.141500,21.891500
177,Debre Birhan,2017-05-01,2017-05-31,15.033569,2.933514,8.991500,21.641500
178,Debre Birhan,2016-05-01,2016-05-31,15.397413,3.291837,9.591499,22.491499


In [13]:
cities = pd.read_csv('worldcities.csv')

In [14]:
cities

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.6897,139.6922,Japan,JP,JPN,Tōkyō,primary,37732000.0,1392685764
1,Jakarta,Jakarta,-6.1750,106.8275,Indonesia,ID,IDN,Jakarta,primary,33756000.0,1360771077
2,Delhi,Delhi,28.6100,77.2300,India,IN,IND,Delhi,admin,32226000.0,1356872604
3,Guangzhou,Guangzhou,23.1300,113.2600,China,CN,CHN,Guangdong,admin,26940000.0,1156237133
4,Mumbai,Mumbai,19.0761,72.8775,India,IN,IND,Mahārāshtra,admin,24973000.0,1356226629
...,...,...,...,...,...,...,...,...,...,...,...
44686,Numto,Numto,63.6667,71.3333,Russia,RU,RUS,Khanty-Mansiyskiy Avtonomnyy Okrug-Yugra,,10.0,1643985006
44687,Nord,Nord,81.7166,-17.8000,Greenland,GL,GRL,,,10.0,1304217709
44688,Timmiarmiut,Timmiarmiut,62.5333,-42.2167,Greenland,GL,GRL,Kujalleq,,10.0,1304206491
44689,San Rafael,San Rafael,-16.7795,-60.6799,Bolivia,BO,BOL,Santa Cruz,,,1068007388


In [15]:
df_ethiopia_ndvi.merge(df_ethiopia_ndwi,
                      left_on=['city', 'date_range'],
                      right_on=['city', 'date_range'])\
                .merge(df_temp,
                      how='left',
                      left_on=['city', 'start_date_x'],
                      right_on=['city', 'date_start'])\
                .to_csv('ALL_DATA_ETHIOPIA.csv')

In [16]:
all_fixed = df_ethiopia_ndvi.merge(df_ethiopia_ndwi,
                      left_on=['city', 'date_range'],
                      right_on=['city', 'date_range'])\
    .merge(df_temp,
          how='left',
          left_on=['city', 'start_date_x'],
          right_on=['city', 'date_start'])\
    .drop(columns=['job_res_ndvi_x', 'job_res_ndwi_x', 'job_res_ndvi_y', 'job_res_ndwi_y'])\
    .merge(cities[['city', 'lat', 'lng', 'country']])

In [17]:
list(all_fixed[all_fixed['city'] == 'Ērer Sātā']['href_job_res_ndvi']

SyntaxError: '(' was never closed (956350402.py, line 1)

In [23]:
from urllib.request import urlretrieve, urlopen
import ssl
import certifi

In [29]:
city = 'Addis Ababa'
path = 'addis_ababa_gif'

try:
    os.mkdir(path)
except FileExistsError:
    pass
    
for url in list(all_fixed[all_fixed['city'] == city]['href_job_res_ndvi']):
    resp = urlopen(url, context=ssl.create_default_context(cafile=certifi.where()))
    with open(os.path.join(path, url.split('/')[-1].split('?')[0]), 'wb') as output:
        output.write(resp.read())

In [24]:
urlopen(url, context=ssl.create_default_context(cafile=certifi.where()))

<http.client.HTTPResponse at 0x115250f10>

In [20]:
urlretrieve?

[0;31mSignature:[0m [0murlretrieve[0m[0;34m([0m[0murl[0m[0;34m,[0m [0mfilename[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mreporthook[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mdata[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Retrieve a URL into a temporary location on disk.

Requires a URL argument. If a filename is passed, it is used as
the temporary file location. The reporthook argument should be
a callable that accepts a block number, a read size, and the
total file size of the URL target. The data argument should be
valid URL encoded data.

If a filename is passed and the URL points to a local resource,
the result is a copy from local file to new file.

Returns a tuple containing the path to the newly created
data file as well as the resulting HTTPMessage object.
[0;31mFile:[0m      /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/urllib/request.py
[0;31mType:[0m      function

In [101]:
all_fixed['city'].value_counts()

city
Addis Ababa    8
Godē           8
Ērer Sātā      8
Nazrēt         8
Bahir Dar      7
Ārba Minch’    7
Shashemenē     6
Desē           6
Sodo           6
Jīma           6
Hosa’ina       6
Harar          6
Name: count, dtype: int64