In [13]:
# imports
import re
import time

from bs4 import NavigableString, Tag
from bs4 import BeautifulSoup
import markdown2
import hashlib
import shutil
import os

In [14]:
# constants
# NOTE(review): absolute, machine-specific path -- anyone else running this
# notebook has to edit it before the first run.
INPUT_FILEPATH = r"C:\Users\tianfy\Desktop\毕设\markdown\毕设.md"
# Swap only the final extension. A plain str.replace(".md", ".txt") would also
# rewrite ".md" occurring earlier in the path and, for an input without ".md",
# would leave the output path equal to the input path (overwriting the source).
OUTPUT_FILEPATH = os.path.splitext(INPUT_FILEPATH)[0] + ".txt"
# Copied figures are placed in a "figures" folder next to the markdown file.
PIC_OUTPUT_DIR = os.path.join(os.path.dirname(INPUT_FILEPATH), "figures")


def MD5(x):
    """Return the hexadecimal MD5 digest of the UTF-8 encoding of ``x``."""
    return hashlib.md5(x.encode("utf-8")).hexdigest()


# exist_ok avoids the check-then-create race and makes the cell re-runnable.
os.makedirs(PIC_OUTPUT_DIR, exist_ok=True)

In [15]:
# Module-level state filled in while the document is converted.
pic_list = []                # (new file name, original src) pairs appended by parse()
pre_render_code_blocks = {}  # placeholder key -> rendered code block, consumed by post_process()
pre_render_inline_math = {}  # not referenced by the cells visible here
char_statistic = 0           # not referenced by the cells visible here
li_count = 0                 # list-item counter used by parse(): positive in <ol>, negative in <ul>

# Read the whole markdown source into memory.
with open(INPUT_FILEPATH, "r", encoding="utf-8") as f:
    file = f.read()
# Tags rendered as a single LaTeX command wrapped around their children.
_WRAPPING_COMMANDS = {
    "strong": "textbf",
    "h1": "chapter",
    "h2": "section",
    "h3": "subsection",
}


def _parse_children(tag) -> str:
    """Render every direct child of ``tag`` with ``parse`` and concatenate the results."""
    return "".join(parse(child) for child in tag)


def _render_code(tag) -> str:
    """Render a ``<code>`` element.

    Multi-line code is interpreted as: first line = language, optional second
    line ``[caption]`` or ``[caption|style]``, remaining lines = content.
    A ``tex`` block is passed through verbatim; any other language becomes an
    ``lstlisting`` inside a figure.  The rendered block is stored in
    ``pre_render_code_blocks`` under a ``[md5]`` placeholder and only the
    placeholder is returned, so later text processing does not touch it.

    Code with fewer than two lines (a genuine inline span) has neither a
    language nor a caption line; it is emitted directly as ``\\texttt{...}``
    instead of raising ``IndexError``.
    """
    # get_text() equals .string for a plain text child but never returns None.
    lines = tag.get_text().splitlines()
    if len(lines) < 2:
        inline = lines[0] if lines else ""
        return "\\texttt{" + inline + "}"

    language, caption = lines[0], lines[1]
    style = None
    if caption.startswith("[") and caption.endswith("]"):
        caption = caption[1:-1]
        if "|" in caption:
            caption, style = caption.split("|")
        content = "\n".join(lines[2:])
    else:
        # No caption line: the second line is already code (and doubles as caption).
        content = "\n".join(lines[1:])

    if language == "tex":
        rendered = content
    else:
        options = []
        if style is not None:
            options.append(f"style=style{style}")
        if language and language != "txt":
            options.append(f"language={language}")
        option_text = ",".join(options)
        rendered = "\n".join([
            "",
            "\\begin{figure}[!h]",
            "\\centering",
            f"\\begin{{lstlisting}}[{option_text}]",
            content,
            "\\end{lstlisting}",
            f"\\caption{{{caption}}} % QAQ",
            f"\\label{{{caption}}}",
            "\\end{figure}",
        ])

    key = f"[{MD5(rendered)}]"
    pre_render_code_blocks[key] = rendered
    return key


def _render_image(tag) -> str:
    """Render an ``<img>`` as a figure and queue the image file for copying.

    The figure is captioned/labelled with a generated key; ``post_process``
    later swaps in a following ``[caption]`` line when one is present.
    """
    name = tag["alt"].split(".")[0]
    ext = tag["src"].split(".")[-1]
    key = f"{len(pic_list)}_{MD5(name)}"

    new_name = f"{key}.{ext}"
    pic_list.append((new_name, tag["src"]))
    # Optional "lw" attribute: factor placed in front of \linewidth.
    lw = "" if "lw" not in tag.attrs else tag["lw"]
    command = "includesvg" if ext == "svg" else "includegraphics"
    # "%%" is turned into a single "%" (LaTeX comment) by post_process.
    template = rf"""
    %% autogen by script
    \begin{{figure}}[!htbp]
    \centering
    \{command}[width={lw}\linewidth]{{figures/{key}.{ext}}}
    \caption{{{key}}}
    \label{{{key}}}
    \end{{figure}}"""
    return "\n".join(line.strip() for line in template.splitlines())


def parse(tag) -> str:
    """Recursively convert a BeautifulSoup node into LaTeX source.

    Text nodes are returned unchanged.  Supported tags: ``p``, ``strong``,
    ``h1``-``h3``, ``ol``, ``ul``, ``li``, ``code``, ``blockquote`` (dropped),
    ``img``, ``em`` and ``pre``.

    Side effects: updates the module-level ``li_count``, ``pic_list`` and
    ``pre_render_code_blocks``.

    Raises:
        AssertionError: for any other tag (the tag is printed first).
        TypeError: if ``tag`` is neither a ``NavigableString`` nor a ``Tag``.
    """
    global li_count
    if isinstance(tag, NavigableString):
        return tag.string
    if not isinstance(tag, Tag):
        # Previously this fell through and returned None, which only surfaced
        # later as a confusing string-concatenation error.
        raise TypeError(f"cannot parse object of type {type(tag).__name__}")

    name = tag.name
    if name == "p":
        try:
            body = _parse_children(tag)
        except Exception:
            # Show the offending paragraph to make the failure easy to locate.
            print(tag)
            raise
        return f"{body}\n"

    if name in _WRAPPING_COMMANDS:
        return f"\\{_WRAPPING_COMMANDS[name]}{{{_parse_children(tag)}}}"

    if name in ("ol", "ul"):
        # Save and restore the counter so a list nested inside an item does not
        # clobber the numbering/bullet mode of the enclosing list.
        outer_count = li_count
        li_count = 1 if name == "ol" else -1
        try:
            body = _parse_children(tag)
        finally:
            li_count = outer_count
        if name == "ol":
            # Ordered lists are emitted as plain "N. text" lines, not as an
            # enumerate environment.
            return f"{body}"
        return f"\\begin{{itemize}}[itemsep=0em,itemindent=3em, leftmargin=0em, topsep=0em]\n{body}\\end{{itemize}}\n"

    if name == "li":
        body = _parse_children(tag)
        if li_count < 0:
            return f"\\item {body}\n"
        numbered = f"{li_count}. {body}\n"
        li_count += 1
        return numbered

    if name == "code":
        return _render_code(tag)

    if name == "blockquote":
        return ""

    if name == "img":
        return _render_image(tag)

    if name == "em":
        # WARNING: this is not a good way to handle em -- the emphasis is
        # written back as literal underscores.  Children are rendered so nested
        # markup no longer produces the text "None".
        return f"_{_parse_children(tag)}_"

    if name == "pre":
        return _parse_children(tag)

    message = f"unknown tag: {tag.name} {tag}"
    print(message)
    # Explicit raise: a bare ``assert False`` disappears under ``python -O``.
    raise AssertionError(message)

def pre_process(string: str) -> str:
    """Hook applied to the raw markdown before it is converted to HTML.

    Currently a no-op: the input is returned unchanged.
    """
    return string

def post_process(string: str) -> str:
    """Finalize the LaTeX text produced by ``parse``.

    Steps, in order:
      1. Drop a leading ``\\newline`` line.
      2. For each ``\\caption{..}``/``\\label{..}`` pair, replace both with the
         text of the next line of the form ``[text]`` and remove that line.
      3. Escape ``%`` as ``\\%``; a doubled ``%%`` becomes a literal ``%``.
      4. Substitute the code-block placeholders stored in
         ``pre_render_code_blocks`` (done after step 3, so block contents are
         not escaped) and expand ``\\codespace`` to ``\\vspace{3pt}``.
      5. Prepend a ``% generate time: ...`` comment line.

    Args:
        string: LaTeX text with code-block placeholders.

    Returns:
        The processed LaTeX text.
    """
    # remove first line \newline
    lines = string.split("\n")
    if lines[0] == "\\newline":
        lines = lines[1:]
    string = "\n".join(lines)

    # process figure
    string = re.sub(
        r"\\caption{(.*)}\n\\label{(.*)}([\s\S]*?)\n\[(.*)]",
        r"\\caption{\4}\n\\label{\4}\3",
        string,
        flags=re.MULTILINE)

    # process %: one left-to-right pass, so no sentinel string is needed and
    # text that happens to contain such a sentinel can no longer be corrupted.
    string = re.sub(
        r"%%|%",
        lambda match: "%" if match.group(0) == "%%" else "\\%",
        string)

    # process codeblock
    for key, value in pre_render_code_blocks.items():
        string = string.replace(key, value)
    string = string.replace(r"\codespace", r"\vspace{3pt}")

    # process ext
    string = f"% generate time: {time.strftime('%Y-%m-%d %H:%M:%S')}\n" + string

    return string


# markdown -> HTML -> LaTeX
file = pre_process(file)
html = markdown2.markdown(file)
soup = BeautifulSoup(html, "html.parser")
string = post_process("".join(parse(child) for child in soup))

# Copy every referenced picture into the figures folder, unless a file of the
# same size is already there.
for name, src in pic_list:
    target = os.path.join(PIC_OUTPUT_DIR, name)
    already_copied = os.path.exists(target) and os.path.getsize(target) == os.path.getsize(src)
    if not already_copied:
        shutil.copyfile(src, target)

# Write the result next to the input file.
with open(OUTPUT_FILEPATH, "w", encoding="utf-8") as f:
    f.write(string)

def is_chinese(char):
    """Return True when ``char`` lies in the range U+4E00..U+9FFF (inclusive)."""
    return '\u4e00' <= char <= '\u9fff'
# Number of characters in the generated text for which is_chinese() is true.
print(sum(1 for x in string if is_chinese(x)))

45069


In [16]:
# Scratch check: convert a sample paragraph followed by an ordered list with
# markdown2 and display the resulting HTML string.
# Note: this rebinds the global ``html`` assigned by the conversion cell.
html = markdown2.markdown(
    r"""
通过手动检查了Evosuite未能成功覆盖的分支（图中上方焦点方法）以及由两种方法生成的测试用例（图中下方测试用例），可以看到该焦点方法的主要功能是从一个完全限定的类名字符串中提取简短的类名，涉及到的情况有空字符处理、数组类型处理、基本类型缩写的处理等。Evosuite未覆盖到的分支已在图中标出，如果要进入这个分支，需要参数满足以下几个条件：

1. className 必须以字符“[”开始，这表示它是一个数组类型。这是进入外层的if条件所必须的。
2. 在移除了开头的“[”字符后，如果元素类型是引用数据类型，那么它的内部表示会以字符“L”开始，并以字符“;”结尾。这是 JVM 的内部表示方式，其中“L”表示后续字符是一个类的全限定名，“;”是这个名字的结尾。

所以，要满足进入这个if分支的条件，原始的className字符串应该是一个表示引用数据类型数组的内部表示。例如 “[Ljava.lang.String;”对应于String[]类型。

    """
)
html

'<p>通过手动检查了Evosuite未能成功覆盖的分支（图中上方焦点方法）以及由两种方法生成的测试用例（图中下方测试用例），可以看到该焦点方法的主要功能是从一个完全限定的类名字符串中提取简短的类名，涉及到的情况有空字符处理、数组类型处理、基本类型缩写的处理等。Evosuite未覆盖到的分支已在图中标出，如果要进入这个分支，需要参数满足以下几个条件：</p>\n\n<ol>\n<li>className 必须以字符“[”开始，这表示它是一个数组类型。这是进入外层的if条件所必须的。</li>\n<li>在移除了开头的“[”字符后，如果元素类型是引用数据类型，那么它的内部表示会以字符“L”开始，并以字符“;”结尾。这是 JVM 的内部表示方式，其中“L”表示后续字符是一个类的全限定名，“;”是这个名字的结尾。</li>\n</ol>\n\n<p>所以，要满足进入这个if分支的条件，原始的className字符串应该是一个表示引用数据类型数组的内部表示。例如 “[Ljava.lang.String;”对应于String[]类型。</p>\n'

In [17]:
# Scratch check: tokenize a small Java class with javalang and display the
# token list.
# The import stays in this cell; none of the other cells shown use javalang.
import javalang
g = javalang.tokenizer.tokenize(r"""
public class ASD{
    public static void main(String[] args){
        System.out.println('{'); // awd 
    }
}

""")
# tokenize() result is materialised with list() so the tokens are displayed.
list(g)

[Modifier "public" line 2, position 1,
 Keyword "class" line 2, position 8,
 Identifier "ASD" line 2, position 14,
 Separator "{" line 2, position 17,
 Modifier "public" line 3, position 5,
 Modifier "static" line 3, position 12,
 Keyword "void" line 3, position 19,
 Identifier "main" line 3, position 24,
 Separator "(" line 3, position 28,
 Identifier "String" line 3, position 29,
 Separator "[" line 3, position 35,
 Separator "]" line 3, position 36,
 Identifier "args" line 3, position 38,
 Separator ")" line 3, position 42,
 Separator "{" line 3, position 43,
 Identifier "System" line 4, position 9,
 Separator "." line 4, position 15,
 Identifier "out" line 4, position 16,
 Separator "." line 4, position 19,
 Identifier "println" line 4, position 20,
 Separator "(" line 4, position 27,
 String "'{'" line 4, position 28,
 Separator ")" line 4, position 31,
 Separator ";" line 4, position 32,
 Separator "}" line 5, position 5,
 Separator "}" line 6, position 1]

In [18]:
def find_quick_rises(prices, window=3, min_rise=5):
    """Flag the minutes in which a price jumped above the recent minimum.

    For every minute >= 1 that has at least one price, a line
    ``"<minute> Y"`` or ``"<minute> N"`` is produced.  A minute is flagged
    ``Y`` when one of its prices is at least ``min_rise`` above the lowest
    price seen in the ``window`` minutes ending at (and including) that
    minute.  Minutes earlier than ``window`` are always ``N`` because a full
    window is not available yet.

    The window is selected by minute number, so minutes without data simply
    contribute no prices; they neither shift the window nor leave stale
    prices behind.

    Args:
        prices: iterable of ``(minute, price)`` pairs; minutes are integers
            counted from 1 and may repeat.
        window: window length in minutes (default 3).
        min_rise: minimum rise over the window minimum (default 5).

    Returns:
        List of ``"<minute> <Y|N>"`` strings in ascending minute order;
        empty when ``prices`` is empty.

    Raises:
        ValueError: if ``window`` is smaller than 1.
    """
    from collections import defaultdict

    if window < 1:
        raise ValueError("window must be at least 1 minute")

    # Group the prices by minute.
    price_by_minute = defaultdict(list)
    for minute, price in prices:
        price_by_minute[minute].append(price)

    outputs = []
    # Only minutes from 1 upwards are reported; sorted() also yields nothing
    # for empty input instead of failing on max() of an empty sequence.
    for minute in sorted(m for m in price_by_minute if m >= 1):
        current_prices = price_by_minute[minute]

        rise_detected = 'N'
        if minute >= window:  # a full window exists only from minute `window` on
            # Lowest price in minutes (minute - window + 1) .. minute.
            window_min = min(
                price
                for m in range(minute - window + 1, minute + 1)
                for price in price_by_minute.get(m, ())
            )
            if any(price >= window_min + min_rise for price in current_prices):
                rise_detected = 'Y'

        # Record the result for this minute.
        outputs.append(f"{minute} {rise_detected}")

    return outputs

# 输入数据
prices = [
    (1, 1),
    (2, 5),
    (2, 4),
    (2, 5),
    (3, 7),
    (3, 5),
    (3, 4),
    (4, 4),
    (5, 7),
    (5, 4)
]

# 执行函数
results = find_quick_rises(prices)
for result in results:
    print(result)


1 N
2 N
3 Y
4 N
5 N
