In [393]:
import os
import yaml
import pandas as pd
import numpy as np
import azureml.core
from azureml.core import Workspace, Datastore
from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient
from azure.ai.ml import command
from azure.ai.ml import Input, Output
from azure.ai.ml import load_component
from azure.ai.ml.entities import Environment
from datetime import datetime, timedelta

In [394]:
# Parse the local api.yaml into a plain dict of per-service sections.
with open('./api.yaml', 'r') as yaml_file:
    data = yaml.safe_load(yaml_file)

# Weaviate endpoint and key
weaviate_section = data.get('weaviate')
weaviate_url = weaviate_section.get('url')
weaviate_api_key = weaviate_section.get('api_key')

# Keys for the remaining providers
cohere_api_key = data.get('cohere').get('api_key')
openai_api_key = data.get('openai').get('api_key')
serper_api_key = data.get('serper').get('api_key')

# Publish the OpenAI and Serper keys through the process environment.
os.environ.update({"OPENAI_API_KEY": openai_api_key, "SERPER_API_KEY": serper_api_key})

# Azure workspace coordinates consumed when the MLClient is created
azure_section = data.get('azure')
SUBSCRIPTION = azure_section.get('subscription_id')
RESOURCE_GROUP = azure_section.get('resource_group_name')
WS_NAME = azure_section.get('workspace_name')

In [395]:
# Workspace handle obtained through Workspace.from_config()
ws = Workspace.from_config()

# Credential object passed to the client below
credential = DefaultAzureCredential()

# Client handle bound to the subscription / resource group / workspace read from api.yaml
workspace_coordinates = {
    "subscription_id": SUBSCRIPTION,
    "resource_group_name": RESOURCE_GROUP,
    "workspace_name": WS_NAME,
}
ml_client = MLClient(credential=credential, **workspace_coordinates)

In [396]:
# Local folder for pipeline-level files (the conda spec is written here).
pipeline_dir = "./feeds_pipeline"
os.makedirs(name=pipeline_dir, exist_ok=True)

In [397]:
%%writefile {pipeline_dir}/conda.yaml
# Conda specification for the pipeline job environment.
name: model-env
channels:
    - conda-forge
    - anaconda
dependencies:
    - python=3.10
    - pip:
        - mlflow
        - crewai
        - langchain   #==0.2.5
        # interest_and_topic.py imports langchain_community directly; since
        # langchain 0.2 that package is no longer installed by `langchain`
        # itself, so it has to be requested explicitly.
        - langchain-community
        - langchain_openai   #==0.1.8
        - matplotlib
        - openai    #==1.35.13
        - plotly
        - PyYAML
        - numpy
        - scikit-learn
        - scipy
        - pandas
        - tiktoken
        - unstructured
        - weaviate-client
        - azureml-mlflow
        - inference-schema[numpy-support]

Overwriting ./feeds_pipeline/conda.yaml


In [398]:
# custom_env_name = "linkedin-learn"

# # pipeline_job_env=Environment.from_pip_requirements(custom_env_name, '../../../../test_script/requirements.txt')
# pipeline_job_env = Environment(
#     name=custom_env_name,
#     description="Custom environment for newsletter Defaults pipeline",
#     conda_file=os.path.join(pipeline_dir, "conda.yaml"),
#     image="mcr.microsoft.com/azureml/curated/sklearn-1.5:2",
# )
# pipeline_job_env = ml_client.environments.create_or_update(pipeline_job_env)

# print(
#     f"Environment with name {pipeline_job_env.name} is registered to workspace, the environment version is {pipeline_job_env.version}"
# )

In [399]:
# Local folder that receives the component scripts and YAML specs written below.
ml_models_src_dir = "./feeds_components/ml_models"
os.makedirs(name=ml_models_src_dir, exist_ok=True)

In [400]:
# import pandas as pd
# from azure.ai.ml.entities import Data
# from azure.ai.ml.constants import AssetTypes
# import ast
# import json
# data_asset = ml_client.data.get(name="linkedin_profile", version=v1)
# print(f"Data asset URI: {data_asset.path}")

# # read into pandas - note that you will see 2 headers in your data frame - that is ok, for now

# data = pd.read_csv(data_asset.path)
# def parse_literal(column_value):
#     try:
#         return ast.literal_eval(column_value)
#     except (ValueError, SyntaxError):
#         return column_value

# profile_dict = {}

# for index, row in data.iterrows():
#     row_dict = row.to_dict()
#     for key, value in row_dict.items():
#         if isinstance(value, str) and (value.startswith('{') or value.startswith('[')):
#             row_dict[key] = parse_literal(value)

#         # break
#     text_str = json.dumps(row_dict)
#     profile_dict[f'person_{index+1}'] = text_str
# list(profile_dict.values())

In [401]:
%%writefile {ml_models_src_dir}/interest_and_topic.py
import pandas as pd
import json
import os
import ast
import logging
import mlflow
import argparse

from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import OpenAI
from weaviate.classes.query import HybridFusion
from weaviate.classes.query import MetadataQuery


def parse_literal(column_value):
    """Evaluate ``column_value`` as a Python literal when possible.

    Args:
        column_value: Value to hand to ``ast.literal_eval``.
    Returns:
        The evaluated literal, or ``column_value`` unchanged when
        ``ast.literal_eval`` raises ``ValueError`` or ``SyntaxError``.
    """
    try:
        parsed = ast.literal_eval(column_value)
    except (SyntaxError, ValueError):
        parsed = column_value
    return parsed

def _numbered_section(lines, header, fallback):
    """Return up to five numbered lines that follow ``header`` in an LLM reply.

    Args:
        lines: The reply split on newlines.
        header: Section title to look for (matched case-insensitively as a line prefix).
        fallback: Positional slice used when the header is missing or has no
            numbered lines after it.
    Returns:
        A list of stripped item lines, or ``lines[fallback]``.
    """
    wanted = header.lower()
    for position, line in enumerate(lines):
        if not line.strip().lower().startswith(wanted):
            continue
        items = []
        for candidate in lines[position + 1:]:
            text = candidate.strip()
            if not text:
                continue
            if not text[0].isdigit():
                break  # reached the next section title
            items.append(text)
            if len(items) == 5:
                break
        if items:
            return items
        break
    return lines[fallback]


def main():
    """Summarize each profile row and extract interest keywords and topics.

    Command-line arguments:
        --profile: path to the tab-separated profile file.
        --data_folder: folder that receives ``interest_and_topic.csv``.

    Every row of the input file is summarized on its own, so the output CSV has
    exactly one row per profile with the columns ``profile``, ``summary``,
    ``interest`` and ``topics`` (the last two are ``#``-joined lists).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--profile", type=str, help="path to linkedin profile")
    parser.add_argument("--data_folder", type=str, help="path to data folder")
    args = parser.parse_args()

    # One Document per row of the TSV file.
    loader = CSVLoader(args.profile, csv_args={
        'delimiter': '\t'})
    profile_docs = loader.load()

    # NOTE(review): OpenAI() is built without an explicit key, so it presumably
    # relies on OPENAI_API_KEY being set in the job environment -- confirm.
    llm = OpenAI(max_tokens=1500)

    chain = load_summarize_chain(llm, chain_type="refine", verbose=True)

    records = []
    for doc in profile_docs:
        # The chain returns a single summary string. Running it per document
        # keeps summaries aligned with input rows; looping over the string
        # itself would iterate over its characters.
        summary = chain.invoke({"input_documents": [doc]})["output_text"]

        prompt = (f"Given the following profile information, summarize five key personal interests as keywords (which may involves this person's ) and five potential topics the person might want to learn more about.\n"
                f"profile: ###\n {summary}\n ###\n"
                f"Desired format:\n"
                f"Interest Keywords:\n"
                f"1. \n 2. \n 3. \n 4. \n 5. \n"
                f"Topics:\n"
                f"1. \n 2. \n 3. \n 4. \n 5. \n"
                f"For example:"
                f"profile:###\n {summary}\n ###\n"
                f"Interest Keywords:\n"
                f"1. \n 2. \n 3. \n 4. \n 5. \n"
                f"Topics:\n"
                f"1. \n 2. \n 3. \n 4. \n 5. \n"
                )

        result = llm.invoke(prompt)
        result_list = result.split('\n')

        # Locate the sections by title; the fixed slices are only a fallback
        # because the number of leading blank lines in the reply varies.
        records.append({
            "profile": doc.page_content,
            "summary": summary,
            "interest": "#".join(_numbered_section(result_list, "Interest Keywords", slice(2, 7))),
            "topics": "#".join(_numbered_section(result_list, "Topics", slice(8, 13))),
        })

    df = pd.DataFrame(records, columns=["profile", "summary", "interest", "topics"])

    os.makedirs(args.data_folder, exist_ok=True)
    df.to_csv(os.path.join(args.data_folder, "interest_and_topic.csv"), index=False)

if __name__ == "__main__":
    main()

Overwriting ./feeds_components/ml_models/interest_and_topic.py


In [402]:
%%writefile {ml_models_src_dir}/interest_and_topic.yml
# <component>
# Command component that runs interest_and_topic.py with one profile file as
# input and one output folder.
name: interest_and_topic
display_name: predict interest and topic based on profile
# version: 1 # Not specifying a version will automatically update the version
type: command
inputs:
  # File passed to the script as --profile.
  profile: 
    type: uri_file
outputs:
  # Folder passed to the script as --data_folder.
  data_folder: 
    type: uri_folder
# The component code is taken from the directory that contains this spec.
code: .
# NOTE(review): this spec references linkedin-learn version 2 while the other
# component specs in this notebook reference version 1 -- confirm that is intended.
environment:
  azureml:linkedin-learn:2
command: >-
  python interest_and_topic.py 
  --profile ${{inputs.profile}} 
  --data_folder ${{outputs.data_folder}}
# </component>

Overwriting ./feeds_components/ml_models/interest_and_topic.yml


In [403]:
# Build the component from its YAML spec and register it; the name ends up
# bound to the entity returned by the workspace.
component_yaml_path = os.path.join(ml_models_src_dir, "interest_and_topic.yml")
interest_and_topic_component = ml_client.create_or_update(
    load_component(source=component_yaml_path)
)

# Report the registered name and version
registration_message = (
    f"Component {interest_and_topic_component.name} with Version {interest_and_topic_component.version} is registered"
)
print(registration_message)

# dsl supplies the pipeline decorator used below
from azure.ai.ml import dsl, Input, Output

# Profile data asset that feeds the pipeline
data_asset = ml_client.data.get(name="linkedin_profile", version='1')
print(f"Data asset URI: {data_asset.path}")

# The pipeline output is bound to the path of the "newsletter_v1" data asset
newsletter_asset = ml_client.data.get("newsletter_v1", version="1")
url_folder = newsletter_asset.path
data_folder = Output(type="uri_folder", path=url_folder, mode="rw_mount")

# Pipeline with a single step: the registered interest_and_topic component.
# Input:  pipeline_job_profile -- forwarded to the component's `profile` input.
# Output: the step's data_folder, exposed under the key "JSON_FORMAT_curation".
@dsl.pipeline(
    compute="serverless",  # "serverless" value runs pipeline on serverless compute
    description="profile analysis",
)
def profile_analysis_pipeline(
    pipeline_job_profile,
):
    # Call the component like a Python function to create the step.
    # NOTE(review): the failed-node name in the run log matches this local
    # variable name, so renaming it presumably renames the node -- confirm.
    interest_and_topic_job = interest_and_topic_component(
        profile=pipeline_job_profile,
    )

    # Redirect the step output to the module-level `data_folder` Output binding.
    interest_and_topic_job.outputs.data_folder = data_folder

    # a pipeline returns a dictionary of outputs
    # keys will code for the pipeline output identifier
    return {
        "JSON_FORMAT_curation": interest_and_topic_job.outputs.data_folder,
    }

# Bind the profile data asset to the pipeline's only input.
profile_input = Input(type="uri_file", path=data_asset.path)
pipeline = profile_analysis_pipeline(pipeline_job_profile=profile_input)

# Submit the job under the project's experiment name, then stream its logs.
pipeline_job = ml_client.jobs.create_or_update(pipeline, experiment_name="linkedin-learn-exp-1")
ml_client.jobs.stream(pipeline_job.name)

Component interest_and_topic with Version 2024-09-03-10-55-19-2149333 is registered
Data asset URI: azureml://subscriptions/541beb67-718e-41c5-958e-8cc0ba95b210/resourcegroups/awesome_rag_dev/workspaces/rag_book_demo/datastores/workspaceblobstore/paths/UI/2024-09-03_083318_UTC/LinkedIn_Dataset.tsv
RunId: cyan_fly_nyppgm56gc
Web View: https://ml.azure.com/runs/cyan_fly_nyppgm56gc?wsid=/subscriptions/541beb67-718e-41c5-958e-8cc0ba95b210/resourcegroups/awesome_rag_dev/workspaces/rag_book_demo

Streaming logs/azureml/executionlogs.txt

[2024-09-03 10:55:24Z] Submitting 1 runs, first five are: 314c4a12:d63d72d2-12af-4110-9b42-e1e51ce7e9ad
[2024-09-03 11:01:49Z] Execution of experiment failed, update experiment status and cancel running nodes.

Execution Summary
RunId: cyan_fly_nyppgm56gc
Web View: https://ml.azure.com/runs/cyan_fly_nyppgm56gc?wsid=/subscriptions/541beb67-718e-41c5-958e-8cc0ba95b210/resourcegroups/awesome_rag_dev/workspaces/rag_book_demo


pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored


JobException: Exception : 
 {
    "error": {
        "code": "UserError",
        "message": "Pipeline has failed child jobs. Failed nodes: /interest_and_topic_job. For more details and logs, please go to the job detail page and check the child jobs.",
        "message_format": "Pipeline has failed child jobs. {0}",
        "message_parameters": {},
        "reference_code": "PipelineHasStepJobFailed",
        "details": []
    },
    "environment": "eastus",
    "location": "eastus",
    "time": "2024-09-03T11:01:49.500025Z",
    "component_name": ""
} 

In [None]:
################### pipeline of personal feed is to be continued################

# # Connect to your Weaviate instance
# import weaviate
# from weaviate.classes.init import Auth
# import os

# # Connect to your Weaviate instance

# # Check if your instance is live and ready
# # This should return `True`
# client.is_ready()

# collection = client.collections.get("Article")

# def hybrid_query_weaviate(query, collection_name, alpha_val):
    
#     collection = client.collections.get(collection_name)

#     response = collection.query.hybrid(
#         query=query, 
#         alpha=alpha_val, 
#         limit=5,
#         return_metadata=MetadataQuery(score=True),
#         fusion_type=HybridFusion.RANKED
#     )

#     return response.objects

# print("Personalized Feeds:\n")
# output_file_path = 'personalized_feeds.txt'
# with open(output_file_path, 'w') as file:
#     file.write("Personalized Feeds:\n\n")

#     for interest in interests_list[2:7]:
        
#         query_result = hybrid_query_weaviate(interest, "Article", 0.5)
#         print(f"'{interest}' feed:")
#         file.write(f"'{interest}' feed:\n")

#         for i, o in enumerate(query_result):
#             article = o.properties
#             score = o.metadata.score
#             print(f"({i+1}) {article['title']}:\n {article['content']}\n (Score: {score})")
#             file.write(f"({i+1}) {article['title']}:\n {article['content']}\n (Score: {score})\n")
#         file.write("----------\n\n")
#         print("----------")
#         print("\n")


In [None]:
%%writefile {ml_models_src_dir}/embedding.py
import os
import tiktoken
import textwrap as tr
from typing import List, Optional
import matplotlib.pyplot as plt
import plotly.express as px
from scipy import spatial
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import average_precision_score, precision_recall_curve
from openai import OpenAI
import numpy as np
import pandas as pd
import logging
import mlflow
import argparse


# Module-level OpenAI client shared by every embedding request.
client = OpenAI(max_retries=5)

def get_embedding(text: str, model="text-embedding-3-small", **kwargs) -> List[float]:
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: Text to embed; newlines are replaced by spaces before sending.
        model: Embedding model name.
        **kwargs: Extra keyword arguments forwarded to ``client.embeddings.create``.
    Returns:
        The embedding of the first (and only) item in the response.
    """
    # newlines can negatively affect performance, so flatten them first
    single_line = text.replace("\n", " ")

    api_response = client.embeddings.create(input=[single_line], model=model, **kwargs)

    first_item = api_response.data[0]
    return first_item.embedding

def main():
    """Embed the crawled news listed in the crawl index and write them to one TSV.

    Command-line arguments:
        --crawled_path: folder holding ``rss_crawled_data_index.tsv`` and the
            ``rss_crawled_data_<suffix>.csv`` files it lists.
        --max_tokens: maximum number of tokens kept per ``title_summary`` text.
        --n_top: accepted for interface compatibility; the row selection that
            used it is disabled, so it currently has no effect.
        --emb_data: output folder.
        --output_suffix: suffix of the output file name.

    Writes ``crawl_data_emb_<output_suffix>.csv`` (tab-separated) with the added
    columns ``title_summary``, ``n_tokens`` and ``embedding``.
    """

    # input and output arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--crawled_path", type=str, help="path to crawled data")
    parser.add_argument("--max_tokens", type=int, default=8191)
    parser.add_argument("--n_top", type=int, default=1000, help="number of selected news to be processed")
    parser.add_argument("--emb_data", type=str, help="path to embedding data")
    parser.add_argument("--output_suffix", type=str, help="suffix of output data")
    args = parser.parse_args()

    crawled_path = args.crawled_path
    output_suffix = args.output_suffix
    max_tokens = args.max_tokens  #8191  # the maximum for text-embedding-3-small is 8191

    embedding_model = "text-embedding-3-small"
    embedding_encoding = "cl100k_base"

    # The context manager closes the run even when a step below raises.
    with mlflow.start_run():
        index_path = os.path.join(crawled_path, "rss_crawled_data_index.tsv")
        crawled_index = pd.read_csv(index_path, sep='\t', encoding="latin_1")

        crawled_data = []
        for input_suffix in crawled_index['suffix']:
            print(input_suffix)
            # f-string formatting also works when pandas parsed the suffix as a non-string
            input_path = os.path.join(crawled_path, f"rss_crawled_data_{input_suffix}.csv")
            crawled_data.append(pd.read_csv(input_path, sep='\t', encoding="latin_1"))

        crawled_df = pd.concat(crawled_data, ignore_index=True)

        crawled_df['title_summary'] = crawled_df.apply(lambda row: f"{row['title']}; {row['summary']}", axis=1)

        encoding = tiktoken.get_encoding(embedding_encoding)

        def _truncate_to_token_limit(text):
            """Cut ``text`` so that it encodes to at most ``max_tokens`` tokens."""
            tokens = encoding.encode(text)
            if len(tokens) <= max_tokens:
                return text
            return encoding.decode(tokens[:max_tokens])

        # The limit is expressed in tokens, so truncate on token boundaries
        # rather than slicing the string by character count.
        crawled_df["title_summary"] = crawled_df["title_summary"].apply(_truncate_to_token_limit)

        crawled_df = crawled_df.drop_duplicates(subset=["title_summary"], ignore_index=False)
        #select the top n news from corpus
        # crawled_df = crawled_df.tail(n_top)

        #tokenize the news
        crawled_df["n_tokens"] = crawled_df.title_summary.apply(lambda x: len(encoding.encode(x)))

        # One embeddings request per row.
        crawled_df["embedding"] = crawled_df.title_summary.apply(lambda x: get_embedding(x, model=embedding_model))

        os.makedirs(args.emb_data, exist_ok=True)
        output_path = os.path.join(args.emb_data, "crawl_data_emb_" + output_suffix + ".csv")
        crawled_df.to_csv(output_path, sep='\t', index=False)

if __name__ == "__main__":
    main()

In [None]:
%%writefile {ml_models_src_dir}/embedding.yml
# <component>
# Command component that runs embedding.py on a folder of crawled data and
# writes the embedded rows into the emb_data output folder.
name: embedding_news_defaults_model
display_name: embedding news Defaults Model
# version: 1 # Not specifying a version will automatically update the version
type: command
inputs:
  crawled_path:
    type: uri_folder
  # embedding.py parses --max_tokens and --n_top with type=int, so both are
  # declared as integer rather than number (floating point).
  max_tokens:
    type: integer
  n_top:
    type: integer
  output_suffix:
    type: string
outputs:
  emb_data:
    type: uri_folder
code: .
environment:
  azureml:linkedin-learn:1
command: >-
  python embedding.py 
  --crawled_path ${{inputs.crawled_path}} 
  --max_tokens ${{inputs.max_tokens}} 
  --n_top ${{inputs.n_top}}
  --emb_data ${{outputs.emb_data}}
  --output_suffix ${{inputs.output_suffix}}
# </component>

In [None]:
# Build the embedding component from its YAML spec and register it; the name
# ends up bound to the entity returned by the workspace.
embedding_yaml_path = os.path.join(ml_models_src_dir, "embedding.yml")
embedding_component = ml_client.create_or_update(load_component(source=embedding_yaml_path))

# Report the registered name and version
embedding_registration_message = (
    f"Component {embedding_component.name} with Version {embedding_component.version} is registered"
)
print(embedding_registration_message)

In [None]:
%%writefile {ml_models_src_dir}/cluster.py
import argparse
from sklearn.random_projection import GaussianRandomProjection
import os
import pandas as pd
import mlflow
from ast import literal_eval
import pandas.core.strings as pd_strings  # Import for string manipulation
from collections import defaultdict
import numpy as np

def join_vector_as_string(vector, separator=""):
    """Concatenate the elements of ``vector`` into a single string.

    Args:
        vector: Iterable of values; each element is converted with ``str``.
        separator: Text inserted between consecutive elements (default is the
            empty string).
    Returns:
        The joined string.
    """
    return separator.join(map(str, vector))

def select_first_file(path):
    """Return the full path of the first entry that ``os.listdir`` reports for ``path``.

    Intended for folders that contain exactly one file.

    Args:
        path (str): directory to look into
    Returns:
        str: ``path`` joined with the first listed entry
    """
    first_entry = os.listdir(path)[0]
    return os.path.join(path, first_entry)

# Start an MLflow run at module level, i.e. as soon as this script is loaded
# and before main() is called; main() closes it with mlflow.end_run().
mlflow.start_run()

# Turn on MLflow's scikit-learn autologging for this process.
mlflow.sklearn.autolog()

def main():
    """Assign a sign-pattern cluster id to every embedded news row.

    Command-line arguments:
        --emb_data: folder containing ``crawl_data_emb_<input_suffix>.csv``.
        --input_suffix: suffix shared by the input and output file names.
        --n_components: number of random-projection dimensions (default 25).
        --random_state: seed of the random projection (default 42).
        --cluster_data: output folder.

    Each embedding is projected with a Gaussian random projection; the sign of
    every projected component (1 if > 0, else 0) is concatenated into the
    ``cluster_id`` string. The result is written, tab-separated, to
    ``cluster_data_<input_suffix>.csv``.
    """

    # input and output arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--emb_data", type=str, help="path to embedding data")
    parser.add_argument("--input_suffix", type=str, help="suffix of the input data file")
    parser.add_argument("--n_components", required=False, default=25, type=int)
    parser.add_argument("--random_state", required=False, default=42, type=int)
    parser.add_argument("--cluster_data", type=str, help="path to group cluster")
    args = parser.parse_args()

    input_suffix = args.input_suffix

    input_path = os.path.join(args.emb_data, "crawl_data_emb_" + input_suffix + ".csv")
    emb_df = pd.read_csv(input_path, sep='\t')

    # Embeddings are stored as list literals in the CSV; turn them back into arrays.
    emb_df["embedding"] = emb_df.embedding.apply(literal_eval).apply(np.array)
    emb = np.stack(emb_df["embedding"])

    # Honour the command-line hyper-parameters instead of hard-coded values.
    transformer = GaussianRandomProjection(
        n_components=args.n_components,
        random_state=args.random_state,
    )
    projected_embeddings = pd.DataFrame(transformer.fit_transform(emb))
    df_transformed = projected_embeddings.gt(0).astype(int)
    df_transformed['cluster_id'] = df_transformed.apply(join_vector_as_string, axis=1)

    # NOTE(review): cluster_id is a string of 0/1 digits; a reader that parses
    # the column as a number would presumably drop leading zeros -- confirm
    # that consumers read it as text.
    emb_df['cluster_id'] = df_transformed['cluster_id']

    os.makedirs(args.cluster_data, exist_ok=True)
    output_path = os.path.join(args.cluster_data, "cluster_data_" + input_suffix + ".csv")
    emb_df.to_csv(output_path, sep='\t', index=False)

    # Stop Logging
    mlflow.end_run()

if __name__ == "__main__":
    main()

In [None]:
%%writefile {ml_models_src_dir}/cluster.yml
# <component>
# Command component that runs cluster.py on the embedding folder and writes
# the clustered rows into the cluster_data output folder.
name: cluster_news_defaults_model
display_name: cluster news Defaults Model
type: command
inputs:
  emb_data: 
    type: uri_folder
  # cluster.py parses --n_components and --random_state with type=int, so both
  # are declared as integer rather than number (floating point).
  n_components:
    type: integer
  random_state:
    type: integer
  input_suffix:
    type: string
outputs:
  cluster_data:
    type: uri_folder
code: .
environment:
  azureml:linkedin-learn:1
command: >-
  python cluster.py 
  --emb_data ${{inputs.emb_data}}
  --n_components ${{inputs.n_components}}
  --random_state ${{inputs.random_state}}
  --input_suffix ${{inputs.input_suffix}}
  --cluster_data ${{outputs.cluster_data}}
# </component>

In [None]:
# Build the cluster component from its YAML spec and register it; the name
# ends up bound to the entity returned by the workspace.
cluster_yaml_path = os.path.join(ml_models_src_dir, "cluster.yml")
cluster_component = ml_client.create_or_update(load_component(source=cluster_yaml_path))

# Report the registered name and version
cluster_registration_message = (
    f"Component {cluster_component.name} with Version {cluster_component.version} is registered"
)
print(cluster_registration_message)

In [None]:
# %%writefile {ml_models_src_dir}/curator.py
# import argparse
# import os
# import pandas as pd
# from ast import literal_eval
# import mlflow
# import numpy as np
# from langchain.agents import Tool
# from crewai import Agent, Task, Process, Crew
# from langchain_community.utilities import GoogleSerperAPIWrapper
# from langchain_openai import OpenAI
# import json
# import smtplib
# from email.mime.text import MIMEText
# from email.mime.multipart import MIMEMultipart


# def send_email(recipient_email,subject = "Test Email from Python", message = "This is a test email sent using Python's smtplib library."):

#     # Create a secure connection with SMTP server (replace with your SMTP server details)
#     with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:

#         # Example usage (replace placeholders with your details)
#         server.login(sender_email, sender_password)
#         # Create a MIMEMultipart message for better formatting (optional, but recommended)
#         msg = MIMEMultipart()
#         msg['From'] = sender_email
#         msg['To'] = recipient_email
#         msg['Subject'] = subject

#         # Set message content as plain text
#         msg.attach(MIMEText(message, 'plain'))  # You can also use 'html' for HTML content

#         # Send the email
#         server.sendmail(sender_email, recipient_email, msg.as_string())
#         print('Email sent successfully!')

# def join_vector_as_string(vector, separator=""):
#   """Joins a vector of integers as a string using the provided separator.
#   Args:
#     vector: A list or numpy array of integers.
#     separator: The string to use between elements (default is comma).
#   Returns:
#     A string containing the joined elements.
#   """
#   string_vector = [str(v) for v in vector]  # Convert elements to strings using list comprehension
#   joined_string = separator.join(string_vector)
#   return joined_string

# def select_first_file(path):
#     """Selects first file in folder, use under assumption there is only one file in folder
#     Args:
#         path (str): path to directory or file to choose
#     Returns:
#         str: full path of selected file
#     """
#     files = os.listdir(path)
#     return os.path.join(path, files[0])


# def main():
#     """Main function of the script."""

#     # input and output arguments
#     parser = argparse.ArgumentParser()
#     parser.add_argument("--clean_data", type=str, help="path to clean data")
#     parser.add_argument("--cluster_id", type=str, help="path to group cluster")
#     parser.add_argument("--curation", type=str, help="path to curated newsletters")
#     args = parser.parse_args()

#     search = GoogleSerperAPIWrapper()

#     search_tool = Tool(
#         name="Scrape google searches",
#         func=search.run,
#         description="useful for when you need to ask the agent to search the internet",
#     )

#     # To Load GPT-4 api
#     api = os.environ.get("OPENAI_API_KEY")

#     explorer = Agent(
#         role="web search",
#         goal="Find and explore the top news from internet when provided one or several new sources",
#         backstory="""You are and Expert strategist that knows how to spot other new sources having the same event or same topic with the provided new source. The provided new source will be given as a list of news by task description
#         You're great at finding same news with different or exclusive opinions, aspects or details.
#         """,
#         verbose=True,
#         allow_delegation=False,
#         tools=[search_tool],
#     )

#     writer = Agent(
#         role="Senior Technical Writer",
#         goal="Write a newsletter about the provided new source and searched other news",
#         backstory="""You are an Expert Writer on newsletter for the main topic or event of the provided new sources. Content of different news are selected or merged and takes advantage of all collected new sources in order to provide a broader view and more complete description and detail of the event or topic in most new sources. You know how to present complicated technical terms to general audience in a 
#         fun way by using layman words. If one of the new sources contains a topic or event unrelavent to the main topic or event in all new sources, do not include the unrelavent content in the generated newsletter. ONLY use other new sources from the internet and provided new source for the newsletter.""",
#         verbose=True,
#         allow_delegation=True,
#     )

#     criticizer = Agent(
#         role="Senior Writing criticizer",
#         goal="criticise the generated newsletter and see if any facts or stats of generated newsletter is incorrect",
#         backstory="""You are the expert of criticising generated newsletter to make sure the fact is true and logical based on the original provided news and search expansion. If any fact or statistics are not consistent with the original provided news and search expansion, correct these content in newsleter and output a corrected version. Comparing the newsletter and the original provided news and search expansion, see if there's any logical or statistical error and correct these fact error in newsletter to make sure the content consistency between newsletter and provided news.""",
#         verbose=True,
#         allow_delegation=True,    
#     )

#     clean_data = pd.read_csv(os.path.join(args.clean_data, "clean_data.csv"))
#     cluster_id = pd.read_csv(os.path.join(args.cluster_id, "cluster_id.csv"))
#     try:
#         cluster_id = cluster_id.cluster_id.apply(literal_eval).apply(np.array)
#     except:
#         cluster_id = cluster_id.cluster_id.apply(lambda x: [x])

#     ans = []
#     for i in cluster_id:
#         if len(i)>=1:
#             new = ""
#             for j in i:
#                 new+= "url of news: "+clean_data.iloc[j]["Current_URL"]+", "
#             task_search = Task(
#                 description="""search internet for the top news with the provided new source as follows"""+new,
#                 agent=explorer,
#                 expected_output="""
#                 For your Outputs use the following markdown format:
#                 ## [rank][Title of the new](link to new)
#                 - news content
#                 """,
#             )

#             task_newsletter = Task(
#                 description="""Write a newsletter with text only and with a short but impactful headline and at least 20 paragraphs.
#                 """,
#                 agent=writer,
#                 expected_output="""
#                 For your Outputs use the following markdown format:
#                 ## [Title of newsletter](link to project)
#                 - Interesting facts
#                 """,
#             )

#             task_revise = Task(
#                 description="""compare the generated newsletter and provided news, correct the corresponding content of newsletter and make sure there's no statistical, logical error and fact. 
#                 """,
#                 agent=criticizer,
#                 expected_output="""For your Outputs use the following markdown format:
#                 ## [Title of newsletter]
#                 - Interesting facts
#                 """
#             )

#             # instantiate crew of agents
#             crew = Crew(
#                 agents=[explorer, writer, criticizer],
#                 tasks=[task_search, task_newsletter, task_revise],
#                 verbose=True,
#                 process=Process.sequential,  # Sequential process will have tasks executed one after the other and the outcome of the previous one is passed as extra content into this next.
#                 planning=True,
#                 planning_llm=OpenAI(model="gpt-4o")
#             )
            
#             # Get your crew to work!
#             result = crew.kickoff()
#             send_email("wuyifu2f@gmail.com", result.raw.split("\n")[0] , result.raw)
#             ans.append(json.dumps(result.json_dict))

#     ans = pd.DataFrame(ans, columns=['curation'],dtype='string')
#     ans.to_csv(os.path.join(args.curation, "curation.csv"), index=False)

# if __name__ == "__main__":
#     main()

In [None]:
# %%writefile {ml_models_src_dir}/curator.yml
# # <component>
# name: curate_newsletter_defaults_model
# display_name: curate newsletter Defaults Model
# # version: 1 # Not specifying a version will automatically update the version
# type: command
# inputs:
#   clean_data: 
#     type: uri_folder
#   cluster_id:
#     type: uri_folder
# outputs:
#   curation: 
#     type: uri_folder
# code: .
# environment:
#   # for this step, we'll use an AzureML curate environment
#   # azureml://registries/azureml/environments/sklearn-1.0/labels/latest
#   # azureml://locations/eastus/workspaces/ca94b34d-7f1c-4e88-bc05-341b8e447df2/environments/newsletter-learn/versions/29
#   azureml:newsletter-learn:30
# command: >-
#   python curator.py 
#   --clean_data ${{inputs.clean_data}}
#   --cluster_id ${{inputs.cluster_id}} 
#   --curation ${{outputs.curation}}
# # </component>

In [None]:
# # Loading the component from the yml file
# curator_component = load_component(source=os.path.join(ml_models_src_dir, "curator.yml"))

# # Now we register the component to the workspace
# curator_component = ml_client.create_or_update(curator_component)

# # Create (register) the component in your workspace
# print(
#     f"Component {curator_component.name} with Version {curator_component.version} is registered"
# )

In [None]:
# dsl supplies the pipeline decorator used below
from azure.ai.ml import dsl, Input, Output
# url_folder = "azureml://subscriptions/541beb67-718e-41c5-958e-8cc0ba95b210/resourcegroups/awesome_rag_dev/workspaces/rag_book_demo/datastores/workspaceblobstore/paths/UI/2024-07-27_231028_UTC/"
newsletter_asset = ml_client.data.get("newsletter_v1", version="1")
url_folder = newsletter_asset.path

# Both step outputs are bound to the same folder with identical settings.
shared_output_settings = {"type": "uri_folder", "path": url_folder, "mode": "rw_mount"}
emb_data = Output(**shared_output_settings)
cluster_data = Output(**shared_output_settings)


# Two-step pipeline: embed the crawled news, then cluster the embeddings.
# Inputs:
#   pipeline_job_data_input   -> embedding step `crawled_path`
#   pipeline_job_max_tokens   -> embedding step `max_tokens`
#   pipeline_job_n_top        -> embedding step `n_top`
#   pipeline_job_n_components -> cluster step `n_components`
#   pipeline_job_random_state -> cluster step `random_state`
# Output: the cluster step's cluster_data, exposed as "JSON_FORMAT_curation".
@dsl.pipeline(
    compute="serverless",  # "serverless" value runs pipeline on serverless compute
    description="newsletter curation pipeline",
)

def newsletter_defaults_pipeline(
    pipeline_job_data_input,
    pipeline_job_max_tokens,
    pipeline_job_n_top,
    pipeline_job_n_components,
    pipeline_job_random_state,
):
    # Hour-resolution timestamp shared by both steps so that the cluster step
    # reads the file name the embedding step wrote.
    # NOTE(review): evaluated when this function body runs, presumably at
    # pipeline build time rather than at job run time -- confirm.
    output_suffix = datetime.now().strftime("%Y-%m-%dT%H")

    # Call the embedding component like a Python function to create the first step.
    embedding_job = embedding_component(
        crawled_path=pipeline_job_data_input,
        max_tokens=pipeline_job_max_tokens,
        n_top=pipeline_job_n_top,
        output_suffix = output_suffix,
    )

    # Second step: consumes the embedding step's output folder.
    cluster_job = cluster_component(
        emb_data=embedding_job.outputs.emb_data,  # note: using outputs from previous step
        n_components=pipeline_job_n_components,
        random_state=pipeline_job_random_state,
        input_suffix=output_suffix,
    )

    # Redirect both step outputs to the module-level Output bindings.
    embedding_job.outputs.emb_data = emb_data
    cluster_job.outputs.cluster_data = cluster_data

    # a pipeline returns a dictionary of outputs
    # keys will code for the pipeline output identifier
    return {
        "JSON_FORMAT_curation": cluster_job.outputs.cluster_data,
    }

In [None]:
registered_model_name = "newsletter_defaults_model"

# Look up the crawled-data asset and show where it lives
crawled_data = ml_client.data.get(name="rss_crawl", version="3")
print(f"Data asset URI: {crawled_data.path}")

# Parameters chosen for this pipeline instance
pipeline_parameters = {
    "pipeline_job_data_input": Input(type="uri_folder", path=crawled_data.path),
    "pipeline_job_max_tokens": 8191,
    "pipeline_job_n_top": 1000,
    "pipeline_job_n_components": 25,
    "pipeline_job_random_state": 42,
}
pipeline = newsletter_defaults_pipeline(**pipeline_parameters)

In [None]:
# # submit the pipeline job
# pipeline_job = ml_client.jobs.create_or_update(
#     pipeline,
#     # Project's name
#     experiment_name="linkedin-learn-exp-1",
# )
# ml_client.jobs.stream(pipeline_job.name)