# 使用情境

你現在是一個作文批改老師，你想要將 Practice_1 中所建立的作文評分模型與文章讀取系統做結合，已達到自動化批改作文的效果，並且希望藉由此系統，來檢測同學的作文有無互相抄襲

input file: './article.txt'

輸出格式：

    1. 作文的分數
    2. 是否有互相抄襲的文章？ 若有，請列出哪些互為抄襲的文章

你應該要

    1. Create loader to load articles
    2. Use splitter to seperate articles
    3. Define schema
    4. Build output parser
    5. Define input templetes
    6. Feed your agent with articles
    7. Use vector store to check similarity

In [2]:
# ======================================================
# Create loader to load articles
# ======================================================

import langchain_setup
from langchain.document_loaders import TextLoader

markdown_document = TextLoader('./article.txt').load()

In [3]:
# ======================================================
# Use splitter to seperate articles
# ======================================================

from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size = 550,
    chunk_overlap  = 0,
    length_function = len,
    is_separator_regex = False,
)
articles = text_splitter.split_text(markdown_document[0].page_content)

In [4]:
from langchain.docstore.document import Document

all_docs = []
for article in articles:
    all_docs.append(Document(page_content=article))

In [16]:
from typing import Literal
from langchain.prompts import ChatPromptTemplate
from langchain.schema import SystemMessage, AIMessage, HumanMessage
from langchain_setup import ChatOpenAI
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.vectorstores import Qdrant
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from langchain_setup import pprint_documents

scores = []
threshold = 0.95

for i, article in enumerate(articles):        
    # ======================================================
    # Define schema,validation and postprocessing
    # ======================================================
    response_schemas = [
        ResponseSchema(name="切題", description="文章內容與題目的相關程度。介於 1 到 10。", type='integer'),
        ResponseSchema(name="文筆", description="文字遣詞是否精練，以及其內容深度。介於 1 到 10。", type='integer'),
        ResponseSchema(name="創意", description="文章發想內容的創意程度。介於 1 到 10。", type='integer'),
        ResponseSchema(name="通順", description="文章的通順程度。介於 1 到 10。", type='integer'),
    ]
    
    # ======================================================
    # Build output parser
    # ======================================================
    structure_parser = StructuredOutputParser.from_response_schemas(response_schemas)
    
    # ======================================================
    # Define input templetes
    # ======================================================
    prompt_template = ChatPromptTemplate.from_messages(
        [
            ("system", "你是一名國文教師，你現在要批改學生的作文，題目為朋友，評分標準需為以下四項：切題、文筆、創意、通順。"),
            ("human", "以下為作文：{article}"),
            ("human", "請為上述作文評分請依照以下指示回答\n{format_instructions}\n。"),
        ],
    )

    prompt_template2 = prompt_template.partial(format_instructions=structure_parser.get_format_instructions())

    # ======================================================
    # Feeds article
    # ======================================================
    messages = prompt_template2.format_messages(article = article)
    model = ChatOpenAI(temperature=0)
    response_message = model(messages)
    print(structure_parser.parse(response_message.content))
    
    # ======================================================
    # Use vector store to check similarity
    # ======================================================
    docs = all_docs[:i] + all_docs[i+1:]
    vectorstore = Qdrant.from_documents(
        docs,
        embedding=OpenAIEmbeddings(),
        location=":memory:",
    )
    result = vectorstore.similarity_search_with_relevance_scores(article, k=100)
    for document, score in result:
        scores.append(score)
# 檢查列表中是否有元素大於3
check = any(score > threshold for score in scores)

print("有抄襲" if check else "沒有抄襲")

{'切題': 8, '文筆': 7, '創意': 6, '通順': 9}
{'切題': 10, '文筆': 8, '創意': 7, '通順': 9}
{'切題': 8, '文筆': 7, '創意': 6, '通順': 9}
有抄襲
