In [1]:
from skt.gcp import (
    PROJECT_ID,
    bq_insert_overwrite,
    bq_to_df,
    bq_to_pandas,
    get_bigquery_client,
    bq_table_exists,
    get_max_part,
    load_query_result_to_table,
    pandas_to_bq,
    pandas_to_bq_table,
    load_bigquery_ipython_magic,
    get_bigquery_client,
    _print_query_job_results,
    load_query_result_to_partitions
    
)

from skt.ye import (
    get_hdfs_conn,
    get_spark,
    hive_execute,
    hive_to_pandas,
    pandas_to_parquet,
    slack_send
)
from skt.github_utils import GithubUtil
from skt.vault_utils import get_secrets


In [2]:
from datetime import date, datetime, timedelta

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from dateutil.relativedelta import relativedelta
from pyhive import hive

from copy import deepcopy
from joblib import Parallel, delayed
import os
import sys
from git import Repo
from contextlib import contextmanager
from tqdm.notebook import tqdm
import warnings

In [3]:
# Fetch the GitHub credentials bundle through skt.vault_utils.get_secrets.
secrets = get_secrets('github/sktaiflow')
token = secrets['token']

# Both URL schemes are mapped to the same proxy entry of the bundle.
proxies = {scheme: secrets['proxy'] for scheme in ('http', 'https')}

# Custom helper functions for cloning git modules

In [4]:
@contextmanager
def proxy(proxies):
    """Temporarily set ``HTTP_PROXY`` / ``HTTPS_PROXY`` for the ``with`` body.

    A snapshot of ``os.environ`` is taken on entry and the whole environment
    is reset to that snapshot on exit, whether the body finishes normally or
    raises.

    Args:
        proxies: Mapping with ``"http"`` and ``"https"`` keys holding the
            proxy URLs to export.

    Yields:
        None.

    Raises:
        KeyError: If ``proxies`` lacks the ``"http"`` or ``"https"`` key.
    """
    env_backup = dict(os.environ)
    try:
        os.environ["HTTP_PROXY"] = proxies["http"]
        os.environ["HTTPS_PROXY"] = proxies["https"]
        yield
    finally:
        # Restore even when the body (or the setup above) raised; otherwise
        # the proxy variables would leak into the rest of the session.
        os.environ.clear()
        os.environ.update(env_backup)

In [5]:
def slack_sending(channel_name:str, msg:str="test", is_adot:bool=True):
    """Send ``msg`` to a Slack channel via ``slack_send``.

    Args:
        channel_name: Target channel. If it contains no ``#``, one is
            prefixed (``"alerts"`` becomes ``"#alerts"``).
        msg: Message text, passed as ``slack_send``'s ``text`` argument.
        is_adot: Passed through unchanged as ``slack_send``'s ``adot`` argument.
    """
    if "#" not in channel_name:
        # Prefix the marker; the previous ``+=`` produced "name#name".
        channel_name = "#" + channel_name

    slack_send(
        text=msg,
        username="SKT",
        channel=channel_name,
        icon_emoji=":large_blue_circle:",
        blocks=None,
        dataframe=False,
        adot=is_adot
    )


In [6]:
class GithubUtil_custom(GithubUtil):
    """``GithubUtil`` subclass that adds proxy-aware repository cloning."""

    def __init__(self, token, proxies, **kwargs):
        """Initialise the parent helper.

        Args:
            token: Forwarded to ``GithubUtil.__init__``.
            proxies: Forwarded to ``GithubUtil.__init__``.
            **kwargs: Accepted so existing call sites keep working; not used.
        """
        super().__init__(token, proxies)

    def clone_from_repo(self, git_url, branch="main", git_save_path="/temp", channel_name=None):
        """Clone ``branch`` of ``git_url`` into ``git_save_path`` through the proxy.

        Args:
            git_url: Repository URL handed to ``Repo.clone_from``.
            branch: Branch name handed to ``Repo.clone_from``.
            git_save_path: Destination directory handed to ``Repo.clone_from``.
            channel_name: Slack channel for the failure alert. When omitted,
                a module-level ``channel_name`` is used if one exists;
                if neither is available no alert is sent.

        Returns:
            The object returned by ``Repo.clone_from``.

        Raises:
            Exception: If no proxy is configured or the clone fails. The
                message embeds the URL, branch and underlying error, and the
                original exception is chained as ``__cause__``.
        """
        try:
            # NOTE(review): assumes GithubUtil.__init__ stores the proxies on
            # ``self._proxies`` — confirm against the parent class.
            if not self._proxies:
                raise Exception("proxy must be passed")
            with proxy(self._proxies):
                return Repo.clone_from(git_url, git_save_path, branch=branch)
        except Exception as e:
            msg = f"cloning git repo:{git_url} branch:{branch} failed {e}"
            # The handler used to read an undefined free variable, turning
            # every clone failure into a NameError. Resolve the channel
            # explicitly and skip the alert when none is configured.
            alert_channel = channel_name or globals().get("channel_name")
            if alert_channel:
                slack_sending(msg=msg, channel_name=alert_channel, is_adot=True)
            raise Exception(msg) from e

In [9]:
# Repository URL, branch, and local destination directory used by the clone below.
git_url ='https://github.com/sktaiflow/onemodelV3-opensearch-engine.git'
branch = 'develop'
# NOTE(review): absolute, user-specific path — the notebook will not run
# unchanged for another user; consider making it configurable.
git_save_path = '/home/x1112436/shared/1112436/git'

In [12]:
# Build the GitHub helper from the token and proxy mapping defined above.
gitobj= GithubUtil_custom(token=token, proxies=proxies)

In [13]:
# Clone the configured branch into git_save_path; `response` holds whatever clone_from_repo returns.
# NOTE(review): presumably this fails on a re-run once git_save_path already
# contains a checkout — confirm, and guard or clean the directory if so.
response = gitobj.clone_from_repo(git_url=git_url, branch=branch, git_save_path=git_save_path)

In [14]:
# Make the "dags" directory of the cloned repository importable.
module_path = os.path.join(git_save_path, "dags")
# Only add the entry once so re-running this cell does not pile up duplicates.
if module_path not in sys.path:
    sys.path.append(module_path)

In [15]:
## import from module
from onemodelV3.opensearch_engine.indexing_engine.preprocessor import OpensearchPreprocessor
from onemodelV3.opensearch_engine.indexing_engine.func import *

ModuleNotFoundError: No module named 'datasets'

In [None]:
# get file list
def get_gzip_files(directory='./temp/indexing/input', suffix=".gzip"):
    """Recursively collect paths of files whose names end with ``suffix``.

    Args:
        directory: Root directory to walk with ``os.walk``.
        suffix: File-name ending to match; a tuple of endings is also
            accepted, as with ``str.endswith``. Defaults to ``".gzip"``.

    Returns:
        list[str]: ``os.path.join(root, name)`` for every matching file, in
        the order ``os.walk`` produces them. Empty when nothing matches or
        ``directory`` does not exist (``os.walk`` ignores listing errors by
        default).
    """
    return [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(directory)
        for name in names
        if name.endswith(suffix)
    ]

# Collect the matching files under get_gzip_files' default directory.
file_list = get_gzip_files()

In [None]:
# Load each collected file with OpensearchPreprocessor.load.
# NOTE(review): `dataset` is rebound on every iteration, so only the result for
# the last file survives the loop — confirm whether results should be collected.
for file in file_list:
    dataset = OpensearchPreprocessor.load(file)