##### 1. Установим новые библиотеки для получения данных и работы с ними

In [1]:
%pip install psycopg==3.1.12
%pip install "psycopg[binary,pool]"
%pip install pandas==2.1.1

##### 2. Определим глобальные переменные

In [2]:
# --- MLflow bookkeeping -------------------------------------------------------
# Names of the experiment and of the run that the data check is logged under.
EXPERIMENT_NAME = "churn_nikolaistepanov"
RUN_NAME = "data_check"

# --- MLflow tracking server ---------------------------------------------------
# Host and port that the tracking URI is built from.
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

# --- Data source --------------------------------------------------------------
# Database table that is read in full into the dataframe.
TABLE_NAME = "users_churn"

In [33]:
import os

import psycopg
import pandas as pd
import mlflow

##### 3. Заберем данные из базы данных и сформируем `dataframe`

In [4]:
# Credentials are read from the environment so nothing secret is stored in the
# notebook. os.getenv() yields None for any variable that is not set.
postgres_credentials = {
    option: os.getenv(env_var)
    for option, env_var in (
        ("host", "POSTGRES_HOST"),
        ("port", "POSTGRES_PORT"),
        ("dbname", "POSTGRES_DBNAME"),
        ("user", "POSTGRES_USER"),
        ("password", "POSTGRES_PASSWORD"),
    )
}

# Keyword arguments later passed to psycopg.connect(): the fixed SSL/session
# options first, followed by the credentials.
connection = {
    "sslmode": "verify-full",
    "target_session_attrs": "read-write",
    **postgres_credentials,
}

In [5]:
# Open the connection and a cursor in one `with` statement; both are released
# when the block exits, before the dataframe is built.
with psycopg.connect(**connection) as conn, conn.cursor() as cur:
    cur.execute(f"SELECT * FROM {TABLE_NAME}")
    # Column names are taken from the metadata of the query that was just executed,
    # so they line up with the fetched rows.
    columns = [column_meta[0] for column_meta in cur.description]
    data = cur.fetchall()

# ATTENTION: do not obtain the column names with a separate query such as
#   cur.execute("SELECT column_name FROM information_schema.columns WHERE table_name = 'your_table_name'")
# (presumably because its result is not guaranteed to match the SELECT * column
# order — TODO confirm).

df = pd.DataFrame(data, columns=columns)

##### 4. Базово посмотрим dataframe - залогируем метрики в базу данных и сам `dataframe` в `Object Storage`

##### 4.1. Поднимаем `MLFlow` и подключаемся к нему, как в предыдущем уроке

In [7]:
# Environment variables for the S3-compatible storage that holds MLflow artifacts.
# NOTE(review): the "..." values look like redacted placeholders — supply the real
# values before running, and keep them out of version control.
os.environ.update(
    {
        "MLFLOW_S3_ENDPOINT_URL": "...",
        "AWS_ACCESS_KEY_ID": "...",
        "AWS_SECRET_ACCESS_KEY": "...",
    }
)

In [10]:
# Point the MLflow client at the tracking server described by the host/port constants.
tracking_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
mlflow.set_tracking_uri(tracking_uri)

##### 4.2. Подготовим данные для записи:
- названия колонок
- метрики `dataframe`
- файл с `dataframe`

In [12]:
# Preview the first two rows to sanity-check the columns loaded from the database.
df.head(2)

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,489,7590-VHVEG,2020-01-01,NaT,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,...,No,No,No,No,Female,0,Yes,No,,0
1,490,5575-GNVDE,2017-04-01,NaT,One year,No,Mailed check,56.95,1889.5,DSL,...,Yes,No,No,No,Male,0,No,No,No,0


In [13]:
# Save the column names as one comma-separated line; the file is logged to MLflow
# as an artifact further down.
columns_line = ", ".join(df.columns)

with open("columns.txt", mode="w", encoding="utf-8") as fio:
    fio.write(columns_line)

In [14]:
# Columns whose distinct values are counted; each value produces its own
# "<column>_<value>" metric.
counts_columns = [
    "type",
    "paperless_billing",
    "internet_service",
    "online_security",
    "online_backup",
    "device_protection",
    "tech_support",
    "streaming_tv",
    "streaming_movies",
    "gender",
    "senior_citizen",
    "partner",
    "dependents",
    "multiple_lines",
    "target",
]

# Flat mapping "metric name -> number" that is sent to MLflow in a single call.
stats = {}

# Per-value frequencies; value_counts() skips missing values by default.
for column in counts_columns:
    value_counts = df[column].value_counts().to_dict()
    for value, count in value_counts.items():
        stats[f"{column}_{value}"] = count

# Number of rows in the dataset.
stats["data_length"] = len(df)

# The same four summary statistics for each of the charge columns.
for column in ("monthly_charges", "total_charges"):
    series = df[column]
    stats.update(
        {
            f"{column}_min": series.min(),
            f"{column}_max": series.max(),
            f"{column}_mean": series.mean(),
            f"{column}_median": series.median(),
        }
    )

# Count of distinct customer ids and of rows with no end date.
stats["unique_customers_number"] = len(df["customer_id"].unique())
stats["end_date_nan"] = df["end_date"].isna().sum()

In [15]:
# Dump the dataframe to a local CSV that is later uploaded to MLflow as an artifact.
# index=False: the index is only the default positional one created by
# pd.DataFrame(data, columns=columns), so writing it would add an unnamed column
# that carries no information.
df.to_csv("users_churn.csv", index=False)

In [16]:
# Reuse the experiment when it already exists: mlflow.create_experiment() raises
# if the name is taken, which would make every re-run of this notebook fail here.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # Keep the id so the run can be fetched back for verification once it has ended.
    run_id = run.info.run_id

    # All dataframe statistics are logged as metrics in one call.
    mlflow.log_metrics(stats)
    # Both local files are stored under the "dataframe" folder of the run's artifacts.
    mlflow.log_artifact("columns.txt", "dataframe")
    mlflow.log_artifact("users_churn.csv", "dataframe")

##### 5. Проверяем, что все успешно залогировалось, и удаляем файлы с локальной машины

In [17]:
# Look the experiment up by name on the tracking server.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
# Fetch the run again by id so that `run` holds what the tracking server returns
# now, rather than the object captured when the run was started.
run = mlflow.get_run(run_id)

In [32]:
# Every metric name we computed must be present on the stored run, and nothing extra.
logged_metric_names = run.data.metrics.keys()
assert stats.keys() == logged_metric_names

# The run must have an artifact location and must have finished.
assert run.info.artifact_uri
assert run.info.status == 'FINISHED'

# The checks passed, so the local copies of the uploaded files can be removed.
for local_file in ("columns.txt", "users_churn.csv"):
    os.remove(local_file)