In [1]:
import boto3
import os
import datetime
import pytz
import lz4.frame
import pandas as pd

from config import ACCESS_KEY, SECRET_KEY, TOKEN

In [2]:
def get_s3_instance():
    """Create an S3 client pointed at the storage.yandexcloud.net endpoint.

    Returns:
        The client returned by ``boto3.session.Session().client`` for the
        ``'s3'`` service, authenticated with the module-level ``ACCESS_KEY``
        and ``SECRET_KEY``.
    """
    client_options = {
        'service_name': 's3',
        'endpoint_url': 'https://storage.yandexcloud.net',
        'aws_access_key_id': ACCESS_KEY,
        'aws_secret_access_key': SECRET_KEY,
    }
    # A fresh session is created on every call, so each caller gets its own client.
    return boto3.session.Session().client(**client_options)

def upload_dump_to_s3(filename=None, bucket=None, key=None):
    """Upload a local file to S3.

    Each argument is optional. When one is omitted, the module-level variable
    of the same role is used instead, so an argument-less call behaves as
    before and relies on those variables having been assigned by the caller.

    Args:
        filename: Path of the local file to upload. Defaults to the
            module-level ``TEMP_FILENAME``.
        bucket: Name of the destination bucket. Defaults to the module-level
            ``BUCKET_NAME``.
        key: Object key to upload to. Defaults to the module-level
            ``key_for_uploading``.

    Raises:
        NameError: If an argument is omitted and the matching module-level
            variable has not been assigned yet.
    """
    # The fallbacks are looked up lazily, so passing all three arguments works
    # even when none of the module-level variables exist.
    get_s3_instance().upload_file(
        Filename=TEMP_FILENAME if filename is None else filename,
        Bucket=BUCKET_NAME if bucket is None else bucket,
        Key=key_for_uploading if key is None else key,
    )

# Bucket that the daily dumps are downloaded from.
bucket_name = 'dwh-asgard'
# Same value as the key prefix of the daily dumps; not referenced by name in the visible cells.
folder = 'hex_metrics_asgard'

# Module-level S3 client used by the download calls in the loop cell.
s3 = get_s3_instance()

In [3]:
# Calendar of the Mondays between start_date and end_date (both inclusive).
# The loop cell builds one weekly aggregate per row of `dates_pd`.
start_date = datetime.date(2024, 6, 14)
end_date = datetime.date(2024, 9, 6)

all_days = pd.date_range(start=start_date, end=end_date)
# dayofweek == 0 is Monday; unlike comparing strftime('%A') with 'Monday',
# this does not depend on the process locale.
mondays = all_days[all_days.dayofweek == 0]

dates_pd = pd.DataFrame({
    'date': mondays,
    # Zero-padded month, unpadded day. The fields are formatted explicitly
    # because the '%#d' strftime flag strips the leading zero only on Windows.
    'date_key': [f'year={day.year}/month={day.month:02d}/{day.day}.csv' for day in mondays],
})
dates_pd

Unnamed: 0,date,date_key
0,2024-06-17,year=2024/month=06/17.csv
1,2024-06-24,year=2024/month=06/24.csv
2,2024-07-01,year=2024/month=07/1.csv
3,2024-07-08,year=2024/month=07/8.csv
4,2024-07-15,year=2024/month=07/15.csv
5,2024-07-22,year=2024/month=07/22.csv
6,2024-07-29,year=2024/month=07/29.csv
7,2024-08-05,year=2024/month=08/5.csv
8,2024-08-12,year=2024/month=08/12.csv
9,2024-08-19,year=2024/month=08/19.csv


In [4]:
# For every Monday in `dates_pd`, download the seven daily dumps of the week
# that ended the day before, sum them per (intercom_uuid, key_hex) pair and
# upload the result as one parquet file.
for now in dates_pd['date']:
    yesterday = now - datetime.timedelta(days=1)
    six_day_ago = yesterday - datetime.timedelta(days=6)

    daily_frames = []
    for day in pd.date_range(start=six_day_ago, end=yesterday):
        # Source keys use a zero-padded month and an unpadded day. The fields
        # are formatted explicitly because the '%#d' strftime flag strips the
        # leading zero only on Windows.
        key = f'hex_metrics_asgard/year={day.year}/month={day.month:02d}/{day.day}.csv.lz4'
        filename = f'{day.day}.csv'
        try:
            s3.download_file(Bucket=bucket_name, Key=key, Filename=filename)
            with lz4.frame.open(filename, 'r') as file:
                all_data_txt = file.read().decode('utf-8')
            # Turn literal backslash-n sequences into real newlines, then drop
            # surrounding whitespace and double quotes.
            all_data_txt = all_data_txt.replace('\\n', '\n').strip().strip('"')

            # The decoded text overwrites the downloaded archive under the same
            # name. The `with` block guarantees the data is flushed and the
            # handle closed before pandas reads the file back.
            with open(filename, 'w', encoding='utf-8') as remade_file:
                remade_file.write(all_data_txt)

            # Read the identifier columns as text so that keys made only of
            # digits are not inferred as numbers (the columns are concatenated
            # as strings below).
            df_day = pd.read_csv(filename, dtype={'intercom_uuid': str, 'key_hex': str})
        finally:
            # Never leave the temporary file behind, even when a step fails.
            if os.path.exists(filename):
                os.remove(filename)
        daily_frames.append(df_day)

    # One concat over the collected frames instead of re-copying the
    # accumulated frame on every iteration.
    df_week = pd.concat(daily_frames)
    df_week['unique_uuid'] = df_week['intercom_uuid'] + '_' + df_week['key_hex']
    # Every row is stamped with the last day of the week, so the daily rows of
    # one pair fall into a single group; groupby orders the result by key.
    df_week['report_date'] = yesterday.strftime('%Y-%m-%d')
    df_week_grouped = (
        df_week
        .groupby(['intercom_uuid', 'key_hex', 'unique_uuid', 'report_date'])
        .sum()
        .reset_index()
    )

    # upload_dump_to_s3() is called without arguments and reads these three
    # module-level names. The output file name and key use a zero-padded day.
    TEMP_FILENAME = yesterday.strftime('%d.parquet')
    BUCKET_NAME = 'aggregated-data'
    key_for_uploading = yesterday.strftime('hex_metrics_asgard_grouped/year=%Y/month=%m/%d.parquet')
    try:
        df_week_grouped.to_parquet(TEMP_FILENAME)
        upload_dump_to_s3()
    finally:
        if os.path.exists(TEMP_FILENAME):
            os.remove(TEMP_FILENAME)

In [5]:
# Close the file handle currently bound to `remade_file` by the loop cell.
# Closing an already-closed file is a no-op, so this only has an effect when
# that cell stopped before closing the handle itself.
remade_file.close()

In [6]:
# Display the last daily frame read by the loop cell (pandas abbreviates long frames).
df_day

Unnamed: 0,report_date,intercom_uuid,key_hex,count
0,2024-09-01,089df4aa-5d92-4c4b-86a2-c2a9cec73887,7DB12856,3
1,2024-09-01,089df4aa-5d92-4c4b-86a2-c2a9cec73887,DDC73256,1
2,2024-09-01,089df4aa-5d92-4c4b-86a2-c2a9cec73887,0D48A757,1
3,2024-09-01,089df4aa-5d92-4c4b-86a2-c2a9cec73887,0DD1B757,3
4,2024-09-01,089df4aa-5d92-4c4b-86a2-c2a9cec73887,4D091C57,3
...,...,...,...,...
1091527,2024-09-01,fe44a9d8-54c4-4688-88a0-934903f84cd7,2A8BED6C,1
1091528,2024-09-01,fe44a9d8-54c4-4688-88a0-934903f84cd7,AA8AD46D,1
1091529,2024-09-01,fe44a9d8-54c4-4688-88a0-934903f84cd7,1A01EA6C,1
1091530,2024-09-01,fe44a9d8-54c4-4688-88a0-934903f84cd7,9A8ECE72,2
