In [1]:
import requests
import time
import json
import os
import datetime
import boto3
import pytz
import pandas as pd
from dateutil.relativedelta import relativedelta

In [2]:
from config import ACCESS_KEY,SECRET_KEY,TOKEN,FOLDER,FOLDER_ID,BUCKET_NAME,BUCKET_NAME

In [3]:
# Time zone name, overridable through the TIME_ZONE environment variable.
TIME_ZONE = os.getenv("TIME_ZONE", "Europe/Moscow")
# Local scratch file that query results are written to before upload.
TEMP_FILENAME = "temp_file"

# HTTP headers sent with every request to the query API.
headers = {
    'Authorization': TOKEN,
    'Accept': 'application/json',
}

def get_now_datetime_str():
    """Return formatted date/time strings derived from the current moment.

    The current time is taken in the zone named by the ``TIME_ZONE``
    environment variable (default ``Europe/Moscow``). Most values describe
    yesterday, because the processed data covers the previous day.

    Returns:
        dict: string values under these keys:
            key_parquet: ``year=YYYY/month=MM/DD.parquet`` for yesterday.
            key: ``year=YYYY/month=MM/DD.csv`` for yesterday.
            key_month: ``year=YYYY/month=MM.csv`` for yesterday.
            now: current timestamp, ``YYYY-MM-DD HH:MM:SS``.
            yesterday_data: yesterday's date, ``YYYY-MM-DD``.
            yesterday: yesterday's timestamp, ``YYYY-MM-DD HH:MM:SS``.
            year, month, day: zero-padded components of yesterday's date.
            last_month_data: the date one calendar month before now,
                ``YYYY-MM-DD``.
    """
    time_zone = os.getenv("TIME_ZONE", "Europe/Moscow")
    now = datetime.datetime.now(pytz.timezone(time_zone))
    # Yesterday's date is needed because the data is for the previous day.
    yesterday = now - datetime.timedelta(days=1)
    # ``months=1`` (plural) shifts back one calendar month; the singular
    # ``month=1`` is an absolute field that would force the month to January.
    last_month_data = now - relativedelta(months=1)
    return {'key_parquet': yesterday.strftime('year=%Y/month=%m/%d.parquet'),
            'key': yesterday.strftime('year=%Y/month=%m/%d.csv'),
            'key_month': yesterday.strftime('year=%Y/month=%m.csv'),
            'now': now.strftime('%Y-%m-%d %H:%M:%S'),
            'yesterday_data': yesterday.strftime('%Y-%m-%d'),
            'yesterday': yesterday.strftime('%Y-%m-%d %H:%M:%S'),
            'year': yesterday.strftime('%Y'),
            'month': yesterday.strftime('%m'),
            'day': yesterday.strftime('%d'),
            'last_month_data': last_month_data.strftime('%Y-%m-%d')
            }

def create_query():
    """Create a new query through the query API and return its id.

    The query name, text and description are read from the module-level
    ``query_name``, ``query_text`` and ``query_description``; the project is
    taken from ``FOLDER_ID``.

    Returns:
        The ``id`` field of the JSON response when the status code is 200.
        For any other status a diagnostic string is returned instead of an
        exception being raised, so callers must check the result themselves.
    """
    payload = {
        "name": query_name,
        "TYPE": "ANALYTICS",
        "text": query_text,
        "description": query_description,
    }
    endpoint = f'https://api.yandex-query.cloud.yandex.net/api/fq/v1/queries?project={FOLDER_ID}'
    response = requests.post(endpoint, headers=headers, json=payload)
    if response.status_code != 200:
        return f' Code: {response},  text: {response.text}'
    return response.json()["id"]


def get_request(offset):
    """Fetch one page of query results starting at ``offset``.

    The query is identified by the module-level ``request_id``. Each call
    asks for at most 1000 rows (``limit=1000``).

    Args:
        offset: index of the first row to return.

    Returns:
        The raw response object from ``requests.get``; the status code is
        not checked here.
    """
    results_url = (
        f'https://api.yandex-query.cloud.yandex.net/api/fq/v1/queries/{request_id}'
        f'/results/0?project={FOLDER_ID}&offset={str(offset)}&limit=1000'
    )
    return requests.get(results_url, headers=headers)

def if_cell_is_list(cell):
    """Collapse a list-valued cell to a single value.

    Args:
        cell: any value taken from a result row.

    Returns:
        ``cell`` unchanged when it is not a list; otherwise the first
        element of the list, or an empty string for an empty list.
    """
    if not isinstance(cell, list):
        return cell
    return cell[0] if len(cell) != 0 else ''

def write_temp_file():
    """Dump every result row of the current query into ``TEMP_FILENAME``.

    Pages are fetched through ``get_request`` 1000 rows at a time until an
    empty page comes back. The first line written is the comma-separated
    list of column names. Each following line is one row: list-valued cells
    are collapsed by ``if_cell_is_list``, string values are wrapped in
    single quotes with embedded single quotes removed, and every other value
    is passed through ``str``.

    The file is opened as UTF-8 and closed before the function returns, so
    a later upload sees the complete contents.

    Raises:
        requests.HTTPError: if a page request returns a 4xx/5xx status, so
            that a truncated dump is never mistaken for a complete one.
    """
    page_size = 1000  # must match the ``limit`` hard-coded in get_request
    offset = 0
    response = get_request(offset)
    response.raise_for_status()
    payload = response.json()
    header = ','.join(str(column['name']) for column in payload['columns'])

    # ``with`` flushes and closes the file even if a later page fails.
    with open(TEMP_FILENAME, 'w', encoding='utf-8') as temp_file:
        temp_file.write(header + '\n')
        # The first page is reused here rather than requested a second time.
        while payload['rows']:
            for row in payload['rows']:
                cells = [if_cell_is_list(cell) for cell in row]
                line = ','.join(
                    "'{0}'".format(cell.replace("'", "")) if isinstance(cell, str) else str(cell)
                    for cell in cells
                )
                temp_file.write(line + '\n')
            offset += page_size
            response = get_request(offset)
            response.raise_for_status()
            payload = response.json()

def get_s3_instance():
    """Build an S3 client for the ``storage.yandexcloud.net`` endpoint.

    Credentials come from the module-level ``ACCESS_KEY`` and ``SECRET_KEY``.

    Returns:
        A boto3 client for the ``s3`` service created from a fresh session.
    """
    return boto3.session.Session().client(
        service_name='s3',
        endpoint_url='https://storage.yandexcloud.net',
        aws_access_key_id=ACCESS_KEY,
        aws_secret_access_key=SECRET_KEY,
    )

def upload_dump_to_s3():
    """Upload ``TEMP_FILENAME`` to S3.

    The destination is the module-level ``key`` inside ``BUCKET_NAME``; both
    are read at call time. A new client is created for each call.
    """
    s3_client = get_s3_instance()
    s3_client.upload_file(Filename=TEMP_FILENAME, Bucket=BUCKET_NAME, Key=key)

def remove_temp_files():
    """Delete the temporary dump file named by ``TEMP_FILENAME``."""
    os.unlink(TEMP_FILENAME)

In [4]:
# Shared S3 client used by the download and upload cells below.
s3 = get_s3_instance()

### flats_st_partner

In [5]:
# Download yesterday's flats_st_partner snapshot from S3 unless a local copy exists.
BUCKET_NAME = "dwh-asgard"
FOLDER = 'flats_st_partner'
# Take the date parts from one call so year/month/day cannot disagree
# if midnight passes between separate calls.
date_parts = get_now_datetime_str()
# int() strips the zero padding, e.g. month "05" becomes "5" in the key.
key = f"year={int(date_parts['year'])}/month={int(date_parts['month'])}/{int(date_parts['day'])}.csv"
s3_file_name = f"{FOLDER}/{key}"
# Folder part of the key: <FOLDER>/year=.../month=...
s3_file_name_folder = '/'.join(s3_file_name.split(sep='/')[0:3])

path_on_pc = 'D:/s3'
local_download_folder = f'{path_on_pc}/{s3_file_name_folder}'
local_download_path = f'{path_on_pc}/{s3_file_name}'

if not os.path.isfile(local_download_path):
    # Create the target folder if it does not exist yet.
    os.makedirs(local_download_folder, exist_ok=True)
    s3.download_file(Bucket=BUCKET_NAME,Key=s3_file_name,Filename=local_download_path)

In [6]:
# Load the downloaded flats_st_partner snapshot.
flats_st_partner_df = pd.read_csv(local_download_path)

In [7]:
# Preview the loaded flats_st_partner frame.
flats_st_partner_df

Unnamed: 0,report_date,flat_uuid,call_blocked,blocked,deleted
0,2024-10-27,f6c107ef-e972-4c14-ac9c-40b82b5f8395,0,0,0
1,2024-10-27,a498f0b2-b749-4255-9cf4-bf86663dfb60,0,0,0
2,2024-10-27,efc20b04-6a06-4ff1-9feb-27fce2279281,0,0,0
3,2024-10-27,c5ca5697-fae8-47f8-8ab1-34f1b11a9830,0,0,0
4,2024-10-27,4429f3bd-f810-4ef0-97dc-3ca2b7f7955f,0,0,0
...,...,...,...,...,...
808839,2024-10-27,5f703b65-d849-4219-acc1-328afcdc9e60,0,0,0
808840,2024-10-27,4de3f607-4be5-42e5-a586-2ffcfd51bb5b,0,0,0
808841,2024-10-27,63ac45d7-5595-4e3c-9123-fe3076f04071,0,0,0
808842,2024-10-27,6091a658-c9a2-4ebd-9217-0b150b01fd31,0,0,0


In [12]:
# Fetch the flats_dir_partner reference file from S3 unless a local copy exists.
BUCKET_NAME = "dwh-asgard"
FOLDER = 'flats_dir_partner'
s3_file_name = f"{FOLDER}/flats_dir_partner.csv"
# Only the first path component (the folder) is kept.
s3_file_name_folder = s3_file_name.split(sep='/')[0]

path_on_pc = 'D:/s3'
local_download_folder = path_on_pc + '/' + s3_file_name_folder
local_download_path = path_on_pc + '/' + s3_file_name

if not os.path.isfile(local_download_path):
    # Create the target folder if it does not exist yet.
    os.makedirs(local_download_folder, exist_ok=True)
    s3.download_file(
        Filename=local_download_path,
        Bucket=BUCKET_NAME,
        Key=s3_file_name,
    )

In [13]:
# Load the downloaded flats_dir_partner reference file.
flats_dir_partner_df = pd.read_csv(local_download_path)

In [14]:
# Preview the loaded flats_dir_partner frame.
flats_dir_partner_df

Unnamed: 0,created_at,number,address_uuid,installation_point_id,flat_uuid
0,2018-05-25 08:07:14,23,f59b9539-3cba-4818-af1c-b6b14e910db4,1665,f6c107ef-e972-4c14-ac9c-40b82b5f8395
1,2018-05-25 08:11:16,65,2c8ae56a-8938-4c9c-b3ac-a43caef5786e,1682,a498f0b2-b749-4255-9cf4-bf86663dfb60
2,2018-05-25 08:36:56,15,c711c9ff-c0ac-44c2-98e7-8c39ead87453,325,efc20b04-6a06-4ff1-9feb-27fce2279281
3,2018-05-25 08:39:02,2,501b7f66-eb3d-410f-96d0-da75fc9fdc0b,1667,c5ca5697-fae8-47f8-8ab1-34f1b11a9830
4,2018-05-25 08:40:27,25,501b7f66-eb3d-410f-96d0-da75fc9fdc0b,1670,4429f3bd-f810-4ef0-97dc-3ca2b7f7955f
...,...,...,...,...,...
724665,2024-05-07 01:49:22,40,db50d44a-be51-487c-9d9a-819fed1a6a9a,759751,0cb1a829-ec6b-421b-9365-46d2f0075fa0
724666,2024-05-07 01:53:57,139,17c34221-4694-4d79-881d-01b213f26757,759752,7f7d7ea4-2fec-4da3-b98d-55f612ec78cd
724667,2024-05-07 01:55:16,23,edf61306-e604-4846-ac49-2cdca59befe4,759753,f0909c1a-80dc-44de-987b-8615c9855519
724668,2024-05-07 02:19:41,2,e3cc71e4-96f5-4190-b422-ae04b6379fd6,759754,7816cb6c-3f0f-4134-b091-a5ef85b6d4a5


In [15]:
# Attach address_uuid to every flat via flat_uuid.
# Any address_uuid column left by a previous run of this cell is dropped first,
# so re-running does not produce address_uuid_x / address_uuid_y columns.
flats_st_partner_df = flats_st_partner_df.drop(
    columns=['address_uuid'], errors='ignore'
).merge(
    flats_dir_partner_df[['flat_uuid','address_uuid']],
    on='flat_uuid',
    how='left'
)

In [16]:
# Count the distinct address_uuid values attached to flats after the merge.
flats_st_partner_df['address_uuid'].nunique()

25617

### installation_point_st_partner

In [18]:
# Download yesterday's installation_point_st_partner snapshot from S3 unless a local copy exists.
BUCKET_NAME = "dwh-asgard"
FOLDER = 'installation_point_st_partner'
# Take the date parts from one call so year/month/day cannot disagree
# if midnight passes between separate calls.
date_parts = get_now_datetime_str()
# int() strips the zero padding, e.g. month "05" becomes "5" in the key.
key = f"year={int(date_parts['year'])}/month={int(date_parts['month'])}/{int(date_parts['day'])}.csv"
s3_file_name = f"{FOLDER}/{key}"
# Folder part of the key: <FOLDER>/year=.../month=...
s3_file_name_folder = '/'.join(s3_file_name.split(sep='/')[0:3])

path_on_pc = 'D:/s3'
local_download_folder = f'{path_on_pc}/{s3_file_name_folder}'
local_download_path = f'{path_on_pc}/{s3_file_name}'

if not os.path.isfile(local_download_path):
    # Create the target folder if it does not exist yet.
    os.makedirs(local_download_folder, exist_ok=True)
    s3.download_file(Bucket=BUCKET_NAME,Key=s3_file_name,Filename=local_download_path)

In [19]:
# Load the downloaded installation_point_st_partner snapshot.
installation_point_st_partner_df = pd.read_csv(local_download_path)

In [20]:
# Count the distinct installation_point_id values in the snapshot.
installation_point_st_partner_df['installation_point_id'].nunique()

33715

In [22]:
# Fetch the entries_installation_points_dir_partner reference file from S3
# unless a local copy exists.
BUCKET_NAME = "dwh-asgard"
FOLDER = 'entries_installation_points_dir_partner'
s3_file_name = f"{FOLDER}/entries_installation_points_dir_partner.csv"
# Only the first path component (the folder) is kept.
s3_file_name_folder = s3_file_name.split(sep='/')[0]

path_on_pc = 'D:/s3'
local_download_folder = path_on_pc + '/' + s3_file_name_folder
local_download_path = path_on_pc + '/' + s3_file_name

if not os.path.isfile(local_download_path):
    # Create the target folder if it does not exist yet.
    os.makedirs(local_download_folder, exist_ok=True)
    s3.download_file(
        Filename=local_download_path,
        Bucket=BUCKET_NAME,
        Key=s3_file_name,
    )

In [23]:
# Load the downloaded entries_installation_points_dir_partner reference file.
entries_installation_points_dir_partner_df = pd.read_csv(local_download_path)

In [24]:
# Enrich each installation point with its address and location columns
# from the reference table, matching on installation_point_id.
installation_point_st_partner_df = pd.merge(
    installation_point_st_partner_df,
    entries_installation_points_dir_partner_df[
        ['installation_point_id','address_uuid','city','country','region','parent_uuid']
    ],
    how='left',
    on='installation_point_id',
)

In [25]:
# Count the distinct address_uuid values attached to installation points after the merge.
installation_point_st_partner_df['address_uuid'].nunique()

30156

### merge

In [None]:
# Attach installation-point attributes to every flat through the shared address_uuid.
# pandas matches null join keys against each other, so installation points
# without an address_uuid are removed from the right-hand side first; otherwise
# every flat lacking an address would be joined to every such installation point.
# Flats without an address keep their row and get empty installation-point columns.
# NOTE(review): an address with several installation points still yields
# several rows per flat - confirm this fan-out is intended.
flats_st_partner_df_merged = flats_st_partner_df.merge(
    installation_point_st_partner_df[
        ['installation_point_id','address_uuid','city','country','region','parent_uuid']
    ].dropna(subset=['address_uuid']),
    on='address_uuid',
    how='left'
)

In [10]:
# Destination bucket and folder for the merged result.
BUCKET_NAME = "aggregated-data"
FOLDER = 'flats_research_dashboard'

local_download_folder= f"{path_on_pc}/{FOLDER}"
local_download_path = f"{path_on_pc}/{FOLDER}/{get_now_datetime_str()['key_parquet']}"

# The parquet key contains year=/month= subfolders, so create the file's own
# parent directory (this also creates local_download_folder along the way).
os.makedirs(os.path.dirname(local_download_path), exist_ok=True)

OSError: [WinError 123] Синтаксическая ошибка в имени файла, имени папки или метке тома: ':'

In [53]:
# Write the merged frame to the local parquet file for yesterday's date.
flats_st_partner_df_merged.to_parquet(
    path=f"{path_on_pc}/{FOLDER}/{get_now_datetime_str()['key_parquet']}",
    index=False,
    compression='snappy',
)

In [55]:
# Resolve yesterday's parquet key once so the local path and the S3 key
# always refer to the same day, even if midnight passes while the cell runs.
parquet_key = get_now_datetime_str()['key_parquet']
s3_file_path = f"{FOLDER}/{parquet_key}"
pc_file_path = f"{path_on_pc}/{FOLDER}/{parquet_key}"

# Upload the local parquet file to the destination bucket.
s3.upload_file(pc_file_path, BUCKET_NAME, s3_file_path)