# Python for Data Science
## Session 5 
### Basic Libraries II

### Exercise


Reusing the same annotations we worked with in the previous session, answer the following items using the libraries we saw today: 

1. How many annotations do you have per month and year? Which month has the most annotation files?

In [12]:
import calendar
import os 
import re

#Tally the annotation files found in `folder`:
#  count_month_year maps 'YYYYMM' -> number of files in that month of that year
#  count_month      maps 'MM'     -> number of files in that calendar month (all years together)
count_month_year = {}
count_month = {}

#Naming convention: <8-digit date>_<6-digit time>_SN<digits>_QUICKVIEW_VISUAL_<digits/underscores>_<free text>.txt
pattern = re.compile(r'(\d{8})_(\d{6})_SN(\d+)_QUICKVIEW_VISUAL_([\d_]+)_([A-Za-z0-9\-_.]+)\.txt')

folder = 'session_4/annotations'

for annotation in os.listdir(folder):
    #Iterate through only files with the correct naming convention
    match = pattern.match(annotation)
    if not match:
        continue

    #Read the date from the captured group instead of slicing the raw file name,
    #so the values are tied to what the pattern actually validated (8 digits = YYYYMMDD)
    date_str = match.group(1)
    month_year = date_str[:6]
    month = date_str[4:6]

    count_month_year[month_year] = count_month_year.get(month_year, 0) + 1
    count_month[month] = count_month.get(month, 0) + 1

#Print # of annotations per month and year, in order of months
#('YYYYMM' keys are fixed width, so sorting them as strings is chronological)
for y in sorted(count_month_year):
    year = int(y[:4])
    month = int(y[4:6])
    print(f'{calendar.month_name[month]} {year} has {count_month_year[y]} annotations') 

print() #for output readability

#Print month with the most annotation files
#max() raises ValueError on an empty dict, so handle "no matching files" explicitly
if count_month:
    most_annotations = max(count_month, key = count_month.get)
    print(f'{calendar.month_name[int(most_annotations)]} has the most annotations files, with {count_month[most_annotations]} files')
else:
    print('No annotation files found')


January 2024 has 27 annotations
February 2024 has 45 annotations
March 2024 has 17 annotations
April 2024 has 25 annotations
May 2024 has 28 annotations
June 2024 has 52 annotations

June has the most annotations files, with 52 files


2. Create a dictionary where each **key** is a month, and the corresponding **value** is a list containing the names of all the annotations whose date falls in that month. 
    a. Save it following the json format, and load it again to check that everything is ok.
    b. Save it this time using Pickle.
    c. Instead of storing a list of all the annotation names happening that month, let's create for each annotation a dictionary with keys: name and date (using a datetime object).

In [None]:
import json 
import pickle
from datetime import datetime

#Group the annotation file names by month: 'YYYY-MM' -> list of file names
monthly_annotations = {}

for annotation in os.listdir(folder):
    #Iterate through only files with the correct naming convention
    match = pattern.match(annotation)
    if not match:
        continue

    date_str = match.group(1)
    #strptime also validates the date (the pattern only guarantees 8 digits)
    date = datetime.strptime(date_str, '%Y%m%d')
    month = date.strftime('%Y-%m')

    monthly_annotations.setdefault(month, []).append(annotation)

#Save the data following the json format
#Keys and values are plain strings, so no `default=` hook is needed; leaving it out
#means an unexpected non-serialisable value raises TypeError instead of being
#silently converted to text
with open('annotations.json', 'w', encoding='utf-8') as json_file: 
    json.dump(monthly_annotations, json_file)

with open('annotations.json', 'r', encoding='utf-8') as json_file:
    json_data = json.load(json_file)

#Check that what was loaded is exactly what was saved
if json_data != monthly_annotations:
    raise ValueError('annotations.json does not match the data that was saved')

#Print outputs
print('JSON Data')
for month, annotations in json_data.items():
    print(f'Month: {month}')
    for annotation in annotations:
        print(f'Annotation: {annotation}')
    print() #for output readability

JSON Data
Month: 2024-01
Annotation: 20240102_185527_SN27_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_740_3850.txt
Annotation: 20240101_174301_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_404_3770.txt
Annotation: 20240101_192856_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_552_4164.txt
Annotation: 20240102_185954_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_414_3786.txt
Annotation: 20240104_220339_SN31_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_556_4178.txt
Annotation: 20240115_213834_SN28_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_376_3722.txt
Annotation: 20240126_173752_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_386_3722.txt
Annotation: 20240101_174301_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_404_3772.txt
Annotation: 20240130_173903_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_366_3756.txt
Annotation: 20240127_190620_SN27_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_500_3600.txt
Annotation: 20240101_192856_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_552_4162.txt
Annotation: 20240127_190620_SN27_QUICKVIEW_VISUAL_1_1_10_SATL-

In [None]:
#Persist the month -> annotation names mapping with Pickle (file opened in binary mode)
with open('annotations.pkl', 'wb') as dump_target:
    pickle.dump(monthly_annotations, dump_target)

#Read the file that was just written back into memory
with open('annotations.pkl', 'rb') as dump_source:
    pickle_data = pickle.load(dump_source)

#Print outputs: the raw dictionary first, then one section per month
print('Pickle Data')
print(pickle_data)
for month_key, file_names in pickle_data.items():
    section = [f'Month: {month_key}']
    section.extend(f'Annotation: {file_name}' for file_name in file_names)
    section.append('') #trailing blank line for output readability
    print('\n'.join(section))

Pickle Data
{'2024-01': ['20240102_185527_SN27_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_740_3850.txt', '20240101_174301_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_404_3770.txt', '20240101_192856_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_552_4164.txt', '20240102_185954_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_414_3786.txt', '20240104_220339_SN31_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_556_4178.txt', '20240115_213834_SN28_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_376_3722.txt', '20240126_173752_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_386_3722.txt', '20240101_174301_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_404_3772.txt', '20240130_173903_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_366_3756.txt', '20240127_190620_SN27_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_500_3600.txt', '20240101_192856_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_552_4162.txt', '20240127_190620_SN27_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_500_3602.txt', '20240101_192856_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_554_4162.txt', '20240101_213

In [15]:
#Group annotations by month: 'YYYY-MM' -> list of {'annotation': file name, 'date': datetime}
annotation_dict = {}

for annotation in os.listdir(folder):
    #Iterate through only files with the correct naming convention
    match = pattern.match(annotation)
    if not match:
        continue

    #Only the date group is needed here; the other captured groups are not used
    date_str = match.group(1)
    date = datetime.strptime(date_str, '%Y%m%d') #time of day is not parsed, so it is 00:00:00
    month = date.strftime('%Y-%m')

    annotation_values = {'annotation': annotation, 'date': date}
    annotation_dict.setdefault(month, []).append(annotation_values)

#Print outputs
print('Annotation Dictionary')
for month, annotations in annotation_dict.items():
    print(f'Month: {month}')
    for annotation in annotations:
        print(f"Annotation: {annotation['annotation']}, Date: {annotation['date']}")
    print() #for output readability

Annotation Dictionary
Month: 2024-01
Annotation: 20240102_185527_SN27_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_740_3850.txt, Date: 2024-01-02 00:00:00
Annotation: 20240101_174301_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_404_3770.txt, Date: 2024-01-01 00:00:00
Annotation: 20240101_192856_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_552_4164.txt, Date: 2024-01-01 00:00:00
Annotation: 20240102_185954_SN24_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_414_3786.txt, Date: 2024-01-02 00:00:00
Annotation: 20240104_220339_SN31_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-10N_556_4178.txt, Date: 2024-01-04 00:00:00
Annotation: 20240115_213834_SN28_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_376_3722.txt, Date: 2024-01-15 00:00:00
Annotation: 20240126_173752_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_386_3722.txt, Date: 2024-01-26 00:00:00
Annotation: 20240101_174301_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_404_3772.txt, Date: 2024-01-01 00:00:00
Annotation: 20240130_173903_SN33_QUICKVIEW_VISUAL_1_1_10_SATL-2KM-11N_366_3756.txt,

3. Print all the annotations from the oldest one to the newest one during the second half of 2024. 

In [16]:
#Group annotations by month: 'YYYY-MM' -> list of {'annotation': file name, 'date': datetime}
annotation_dict = {}

for annotation in os.listdir(folder):
    #Iterate through only files with the correct naming convention
    match = pattern.match(annotation)
    if not match:
        continue

    #Only the date group is needed here; the other captured groups are not used
    date_str = match.group(1)
    date = datetime.strptime(date_str, '%Y%m%d') #time of day is not parsed, so it is 00:00:00
    month = date.strftime('%Y-%m')

    annotation_values = {'annotation': annotation, 'date': date}
    annotation_dict.setdefault(month, []).append(annotation_values)

annotation_2nd_half_2024 = []

#Filter to only second half of 2024
#('YYYY-MM' keys are fixed width, so string comparison follows calendar order)
for month, month_annotations in annotation_dict.items():
    if '2024-07' <= month <= '2024-12':
        annotation_2nd_half_2024.extend(month_annotations)

#Sort oldest to newest. 'date' only has day resolution, so the file name is the
#tie-breaker: it starts with YYYYMMDD_HHMMSS, which orders same-day annotations
#by time of day instead of leaving them in arbitrary os.listdir order
annotation_2nd_half_2024.sort(key = lambda x: (x['date'], x['annotation']))

if not annotation_2nd_half_2024:
    print('No annotation found in the 2nd half of 2024')
else:
    print('These are all the annotations from oldest to newest during the 2nd half of 2024:')
    for annotation in annotation_2nd_half_2024:
        print(f"Annotation: {annotation['annotation']}, Date: {annotation['date']}")


No annotation found in the 2nd half of 2024
