## Import libs

In [30]:

import logging

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler("app.log"),          # File output
        logging.StreamHandler()                  # Console output
    ],
    force=True  # This overrides any prior logging config
)

import clickhouse_connect
import requests
import pandas as pd
import dateutil

import sys
import os

from pydantic import BaseModel, Field

from typing import List
import os
import uuid

from dotenv import load_dotenv
load_dotenv()


# ClickHouse connection settings are read from the environment.
# Every value is a string, or None when the variable is not set.
CLICKHOUSE_USER = os.getenv("CLICKHOUSE_USER")
CLICKHOUSE_PASSWORD = os.getenv("CLICKHOUSE_PASSWORD")
CH_IP = os.getenv("CH_IP")
CH_PORT = os.getenv("CH_PORT")
CLICKHOUSE_DB = os.getenv("CLICKHOUSE_DB")

# Initialize ClickHouse client.
# os.getenv returns the port as text, while get_client documents `port` as an
# int, so it is converted at the call site (CH_PORT itself keeps its original
# type for any other cell that reads it). A falsy port is passed as 0 so the
# library falls back to its own default port.
client_ch = clickhouse_connect.get_client(
    host=CH_IP,
    port=int(CH_PORT) if CH_PORT else 0,
    username=CLICKHOUSE_USER,
    password=CLICKHOUSE_PASSWORD,
    database=CLICKHOUSE_DB
)


# Make the project-local `tools` package importable.
working_dir = os.getcwd()

# Preferred location: two levels above the current working directory.
parent_dir = os.path.abspath(os.path.join(working_dir, "../.."))

# When no `tools` folder exists there, fall back to the airflow data directory
# (the original note on this branch reads "for run in spark").
if not os.path.isdir(os.path.join(parent_dir, 'tools')):
    parent_dir = os.path.abspath(os.path.join(working_dir, "../airflow/airflow_data"))

# Whichever directory was chosen is appended to the module search path.
sys.path.append(parent_dir)


from tools import pd_tools
from tools.paths import Paths
from tools.db_tools import DbTools




Вам предоставлена таблица air_quality с данными о качестве воздуха в различных районах города. База содержит информацию об измерениях различных загрязнителей воздуха (PM2.5, NO2, O3 и др.) и связанных с ними показателях.

Структура таблицы air_quality
    - 1. Name - название показателя (PM2.5, NO2, O3 и др.)
    - 2. Geo_Place_Name - место измерения
    - 3. Start_Date - дата измерения
    - 4. Data_Value - измеренное значение

In [7]:
# Load the raw measurements and preview the first rows.
raw_csv_path = 'data/Air_Quality.csv'
air_quality = pd.read_csv(raw_csv_path)
air_quality.head()

Unnamed: 0,Unique ID,Indicator ID,name,Measure,Measure Info,Geo Type Name,Geo Join ID,Geo Place Name,Time Period,Start_Date,Unnamed: 10,Data Value
0,419355,365,Fine particles (PM 2.5),Estimated annual rate,per square mile,Citywide,208.0,St. George and Stapleton (CD1),Summer 2010,12/31/2022,,0
1,542128,365,Fine particles (PM 2.5),Estimated annual rate (age 30+),"per 100,000",Borough,404406.0,Northern SI,Summer 2010,12/31/2022,,47
2,419346,640,Nitrogen dioxide (NO2),Number per km2,"per 100,000",CD,307.0,Fresh Meadows,Summer 2011,31.12.2022,,0
3,419347,365,Boiler Emissions- Total PM2.5 Emissions,Estimated annual rate (age 30+),"per 100,000 adults",UHF42,305.0,Southeast Queens,Annual Average 2009,31.12.2022,,0
4,419348,367,Ozone (O3),Estimated annual rate (age 30+),ppb,UHF42,105.0,Greenpoint and Williamsburg (CD1),Annual Average 2011,31.12.2022,,0


## Data analysis

In [9]:
# Print column dtypes, non-null counts and memory usage of the freshly loaded frame.
air_quality.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134788 entries, 0 to 134787
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Unique ID       134788 non-null  int64  
 1   Indicator ID    134788 non-null  int64  
 2   name            134788 non-null  object 
 3   Measure         134788 non-null  object 
 4   Measure Info    134788 non-null  object 
 5   Geo Type Name   134788 non-null  object 
 6   Geo Join ID     134779 non-null  float64
 7   Geo Place Name  134779 non-null  object 
 8   Time Period     134788 non-null  object 
 9   Start_Date      134788 non-null  object 
 10  Unnamed: 10     0 non-null       float64
 11  Data Value      134788 non-null  object 
dtypes: float64(2), int64(2), object(8)
memory usage: 12.3+ MB


In [10]:
# 'Unnamed: 10' holds no values at all (0 non-null in the info() summary), so drop it.
# errors='ignore' keeps the cell re-runnable: without it a second execution raises
# KeyError because the column is already gone.
air_quality = air_quality.drop(columns='Unnamed: 10', errors='ignore')

In [11]:
# Project helper from tools.pd_tools; judging by its output it reports info(),
# the first rows, per-column value types, duplicates and null/placeholder counts.
pd_tools.df_info(air_quality)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134788 entries, 0 to 134787
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Unique ID       134788 non-null  int64  
 1   Indicator ID    134788 non-null  int64  
 2   name            134788 non-null  object 
 3   Measure         134788 non-null  object 
 4   Measure Info    134788 non-null  object 
 5   Geo Type Name   134788 non-null  object 
 6   Geo Join ID     134779 non-null  float64
 7   Geo Place Name  134779 non-null  object 
 8   Time Period     134788 non-null  object 
 9   Start_Date      134788 non-null  object 
 10  Data Value      134788 non-null  object 
dtypes: float64(1), int64(2), object(8)
memory usage: 11.3+ MB
None


'First 5 rows in df'

Unnamed: 0,Unique ID,Indicator ID,name,Measure,Measure Info,Geo Type Name,Geo Join ID,Geo Place Name,Time Period,Start_Date,Data Value
0,419355,365,Fine particles (PM 2.5),Estimated annual rate,per square mile,Citywide,208.0,St. George and Stapleton (CD1),Summer 2010,12/31/2022,0
1,542128,365,Fine particles (PM 2.5),Estimated annual rate (age 30+),"per 100,000",Borough,404406.0,Northern SI,Summer 2010,12/31/2022,47
2,419346,640,Nitrogen dioxide (NO2),Number per km2,"per 100,000",CD,307.0,Fresh Meadows,Summer 2011,31.12.2022,0
3,419347,365,Boiler Emissions- Total PM2.5 Emissions,Estimated annual rate (age 30+),"per 100,000 adults",UHF42,305.0,Southeast Queens,Annual Average 2009,31.12.2022,0
4,419348,367,Ozone (O3),Estimated annual rate (age 30+),ppb,UHF42,105.0,Greenpoint and Williamsburg (CD1),Annual Average 2011,31.12.2022,0


Column Unique ID has only one type: <class 'int'>
Column Indicator ID has only one type: <class 'int'>
Column name has only one type: <class 'str'>
Column Measure has only one type: <class 'str'>
Column Measure Info has only one type: <class 'str'>
Column Geo Type Name has only one type: <class 'str'>
Column Geo Join ID has only one type: <class 'float'>
Column Geo Place Name has only one type: <class 'str'>
Column Time Period has only one type: <class 'str'>
Column Start_Date has only one type: <class 'str'>
Column Data Value has only one type: <class 'str'>


Unnamed: 0,duplicates_full
duplicates_full,0


Unnamed: 0,Unique ID,Indicator ID,name,Measure,Measure Info,Geo Type Name,Geo Join ID,Geo Place Name,Time Period,Start_Date,Data Value
duplicates_by_cols,0,134765,134770,134780,134779,134783,134707,134664,134732,128817,134424


zeroes


minus_ones


Unnamed: 0,Geo Join ID,Geo Place Name
nulls,9,9


Unnamed: 0,Geo Join ID,Geo Place Name
nans,9,9


nones


NA placeholder


null placeholder


N/A placeholder


In [12]:
# List the original column labels before they are renamed.
air_quality.columns

Index(['Unique ID', 'Indicator ID', 'name', 'Measure', 'Measure Info',
       'Geo Type Name', 'Geo Join ID', 'Geo Place Name', 'Time Period',
       'Start_Date', 'Data Value'],
      dtype='object')

## Preprocess data

In [15]:
# Rename the source headers to snake_case identifiers.
# `name` is already in the target form and is therefore not listed.
column_renames = dict([
    ('Unique ID', 'unique_id'),
    ('Indicator ID', 'indicator_id'),
    ('Measure', 'measure'),
    ('Measure Info', 'measure_info'),
    ('Geo Type Name', 'geo_type_name'),
    ('Geo Join ID', 'geo_join_id'),
    ('Geo Place Name', 'geo_place_name'),
    ('Time Period', 'time_period'),
    ('Start_Date', 'start_date'),
    ('Data Value', 'data_value'),
])
air_quality = air_quality.rename(columns=column_renames)


In [16]:
# Check the dtypes after renaming; data_value and start_date are still plain objects here.
air_quality.dtypes

unique_id           int64
indicator_id        int64
name               object
measure            object
measure_info       object
geo_type_name      object
geo_join_id       float64
geo_place_name     object
time_period        object
start_date         object
data_value         object
dtype: object

In [None]:
# Cast the measurement column from text to float.
# NOTE(review): the column was loaded as strings only, which suggests some values
# are not plain numbers; if so this cast raises ValueError. Inspect before relying on it.
air_quality = air_quality.astype({'data_value': float})

In [29]:
# Inspect the raw date strings: both "12/31/2022" and "31.12.2022" layouts occur.
air_quality['start_date']

0         12/31/2022
1         12/31/2022
2         31.12.2022
3         31.12.2022
4         31.12.2022
             ...    
134783     2/18/2023
134784     1/25/2023
134785     1/16/2023
134786     1/15/2023
134787     1/10/2023
Name: start_date, Length: 134788, dtype: object

In [32]:
# Convert `start_date` from text to datetime64.
# The raw column mixes two layouts: slash-separated month/day/year ("12/31/2022")
# and dot-separated day.month.year ("31.12.2022"). A generic
# dateutil.parser.parse() call reads ambiguous values month-first, so a dotted
# "01.02.2023" would silently become 2 January instead of 1 February.
# Each layout is parsed with its own explicit format; a value in any other
# layout raises instead of being guessed.
# The dtype guard keeps the cell re-runnable once the column is already datetime.
if not pd.api.types.is_datetime64_any_dtype(air_quality['start_date']):
    raw_dates = air_quality['start_date'].astype(str).str.strip()
    is_dotted = raw_dates.str.contains('.', regex=False)
    parsed_dates = pd.concat([
        pd.to_datetime(raw_dates[is_dotted], format='%d.%m.%Y'),
        pd.to_datetime(raw_dates[~is_dotted], format='%m/%d/%Y'),
    ])
    # Restore the original row order before writing the column back.
    air_quality['start_date'] = parsed_dates.reindex(raw_dates.index)
air_quality['start_date'].head()

0   2022-12-31
1   2022-12-31
2   2022-12-31
3   2022-12-31
4   2022-12-31
Name: start_date, dtype: datetime64[ns]

In [33]:
# Persist the cleaned frame (without the index column) for later steps.
normalized_csv_path = "data/air_quality_norm.csv"
air_quality.to_csv(normalized_csv_path, index=False)

start_date
2023-12-30     23
2023-12-29     25
2023-12-28     17
2023-12-27     18
2023-12-26     16
             ... 
2005-01-13     22
2005-01-04     22
2005-01-03     23
2005-01-02     22
2005-01-01    919
Name: count, Length: 5963, dtype: int64

In [None]:
# NOTE(review): the converted series is only displayed, not assigned back,
# so this cell does not change `air_quality`.
start_dates = air_quality['start_date']
pd.to_datetime(start_dates, format='%Y-%m-%d')

    1. ▶️ Найдите все локации (Geo Place Name), где среднее значение озона (name = 'Ozone (O3)') превышает 65 и количество измерений не менее 255. Сколько таких локаций?
Для сохранения информации о загрязнителях в CRM Битрикс24 потребуется создать пользовательские поля для компаний. В таблицах базы данных Битрикс24 эта информация будет храниться в: