In [1]:
import pandas as pd

df_news_all = pd.read_parquet(
    "datas/要分析的新聞/HF_ashraq_Reuters_dowjones_S&P_wash.parquet"
)#.iloc[:20]
len(df_news_all) #25344

2541

In [None]:
from huggingface_hub import InferenceClient

def check_first_line_contains_hallucinated(text):
    # Split the string into lines and get the first line
    try:
        first_line = text.splitlines()[0] if text else ""
    except:
        first_line = "Hallucinated"
    # Check if "Hallucinated" is in the first line (case-sensitive)
    return "Hallucinated" in first_line


def analyze_hallucination(text, token):
    client = InferenceClient(api_key=token)  # "hf_wp")

    messages = [
        {
            "role": "system",
            "content": """**Prompt for Recognizing Hallucination in Text:**

            You are a fact-checking expert tasked with identifying whether the following paragraph contains fabricated or unverifiable information. Analyze the paragraph systematically and follow these steps:

            1. **Check for Hallucination**:
            - Determine if the claims are verifiable based on general knowledge and logical consistency.
            - Identify any fabricated data, implausible claims, or contradictions within the text.

            2. **Provide Evidence**:
            - Highlight specific portions of the text that appear fabricated or unverifiable.
            - Briefly explain why these portions are questionable.

            3. **Classify the Text**:
            - Label the text as 'Hallucinated,' 'Partially correct,' or 'Factually Accurate.'

            4. **Confidence Score**:
            - Assign a confidence score (0–100%) indicating your certainty in the classification.

            **Input Paragraph**:
            "{{Insert Paragraph Here}}"

            **Output Format**:
            - **Classification**: [Hallucinated/Partially correct/Factually Accurate]
            - **Evidence**: [Explanation of the assessment with specific details.]
            - **Confidence Score**: [e.g., 85%]
            """,
        },
        {"role": "user", "content": text},
    ]

    completion = client.chat.completions.create(
        model="meta-llama/Llama-3.3-70B-Instruct",
        messages=messages,
        max_tokens=600,
        temperature=0.3,  # low temperature可降低隨機性
        top_p=0.8,  # 選擇較高的top_p（如 0.9）可以平衡內容的多樣性
        # frequency_penalty=0.3,  # 低frequency_penalty（如 0）：適合總體經濟分析，避免過度懲罰重複用詞，因為某些關鍵詞可能頻繁出現（如「GDP」、「利率」、「股價」等）
        # presence_penalty=0.1,  # 中低presence_penalty（如 0.2）：可以鼓勵模型在經濟分析中嘗試引入新的觀點或上下文，而不會完全脫離主題
    )

    response = completion.choices[0].message.content
    # response = completion.choices[0].message.content.split("```")[0]
    # 處理模型回應
    try:
        hallucinate_check = check_first_line_contains_hallucinated(response)
        # print(f"是否幻覺：{hallucinate_check}")
        return (hallucinate_check, response)
    except ValueError as e:
        print(f"處理錯誤：{e}")
        # 只要有異常就重來
        return (True, response)

In [None]:
from huggingface_hub import InferenceClient
import re


# 從回應中提取最後一個浮點數
def extract_float_from_response(response_text):
    # 使用正則表達式提取所有數字
    matches = re.findall(r"-?\d+\.?\d*", response_text)
    if matches:
        # 轉換最後一個匹配為浮點數
        value = float(matches[-1])
        # 驗證是否在範圍內
        if -1 <= value <= 1:
            return value
        else:
            # raise ValueError(f"數值超出範圍：{value}")
            print("數值超出範圍")
            raise ValueError(f"數值超出範圍：{value}")
            # return None
    else:
        # raise ValueError("無法解析出浮點數")
        print("無法解析出浮點數")
        raise ValueError("無法解析出浮點數")
        # return None


def analyze_dow_jones_sentiment(text, token, recursion_count=0):
    if recursion_count>3:
        return (0, "幻覺太多")
    else:
        client = InferenceClient(api_key=token)#"hf_wpR")

        messages = [
            {
                "role": "system",
                "content": """You are an expert of Financial Analyzing. Analyze the input news article to assess its potential impact or recent trend on the Dow Jones Index and S&P500 over the next few days. Consider economic and market factors such as:  
                1. rising interest rates  
                2. capital inflows into safe-haven assets  
                3. investor risk aversion  
                4. market uncertainty  
                5. any other reasons you think may let stock index fall

                Your analysis should include:  
                1. Identification of key events or statements in the news that are directly related to the stock market or broader economic trends.  
                2. An objective assessment of how these events are likely to influence investor behavior and market dynamics.  
                3. A sentiment score ranging from -1 to 1:  
                - `-1`: Strong pessimism about the Dow Jones Index trend.  
                - `1`: Strong optimism about the Dow Jones Index trend.  
                - `0`: The news is unrelated to or has no direct impact or trend on the overall economic trend.  
                4. If the score is outside the range of `-0.05` to `0.05`, indicates the event is highly correlated with the stock market trend.

                Format your response as follows:  
                - Provide a detailed analysis of the news, highlighting relevant points and reasoning.  
                - strictly adhere to conclude with the sentiment score on the last line in this format: **Sentiment Score: [value]**
                """,
            },
            {"role": "user", "content": text},
        ]

        completion = client.chat.completions.create(
            model="meta-llama/Llama-3.3-70B-Instruct",
            messages=messages,
            max_tokens=600,
            temperature=0.3,  # low temperature可降低隨機性
            top_p=0.8,  # 選擇較高的top_p（如 0.9）可以平衡內容的多樣性
            # frequency_penalty=0.3,  # 低frequency_penalty（如 0）：適合總體經濟分析，避免過度懲罰重複用詞，因為某些關鍵詞可能頻繁出現（如「GDP」、「利率」、「股價」等）
            # presence_penalty=0.1,  # 中低presence_penalty（如 0.2）：可以鼓勵模型在經濟分析中嘗試引入新的觀點或上下文，而不會完全脫離主題
        )

        response_sentiment = completion.choices[0].message.content
        # response_sentiment = completion.choices[0].message.content.split("```")[0]
        (hallucinate_check, response_hallucination) = analyze_hallucination(response_sentiment,token)
        if hallucinate_check:
            print(f"有幻覺就重來,recursion_count={recursion_count+1}")
            analyze_dow_jones_sentiment(text[recursion_count+2:],token, recursion_count + 1)
        else:
            # print(response)
            # 沒有幻覺就處理模型回應
            try:
                result = extract_float_from_response(response_sentiment)
                # print(f"模型輸出的情緒分數：{result}")
                # print("成功")
                return (result, response_sentiment)
            except:
                try:
                    print(f"格式錯誤就重來,recursion_count={recursion_count+1}")
                    analyze_dow_jones_sentiment(
                        text[recursion_count+1:], token,recursion_count + 1
                    )
                except ValueError as e:
                    # print(f"處理錯誤：{e}")
                    print(f"重試依然錯誤：{e}")
                    return (0, f"格式錯誤:{e}")
                    # return (0, f"格式錯誤:{e}")

In [4]:
# from tqdm import tqdm
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import time


def extract_sentiment_data_safe(text, token, recursion_count=0):
    if recursion_count>3:
        print("回應錯誤太多次 跳過")
        return {
            "llama_score": np.nan,
            "llama_reason": np.nan,
        }
    else:
        try:
            (result, response) = analyze_dow_jones_sentiment(text,token)
            return {
                "llama_score": result,
                "llama_reason": response,
            }
        except Exception as e:
            # 捕捉錯誤並回傳 NaN
            try:
                print(e)
                print("回應異常 等待10秒再來一次")
                time.sleep(10)
                print(f"回應異常就重來,recursion_count={recursion_count+1}")
                extract_sentiment_data_safe(
                    text[: -(recursion_count+1)], token, recursion_count + 1
                )
            except Exception as e:
                print("thread llama出現錯誤 跳過")
                print(e)
                return {
                    "llama_score": np.nan,
                    "llama_reason": np.nan,
                }


def thread_llama(row_start, row_end, token,file_name):
    # 初始化一個 DataFrame 用來存放結果

    df_news_5K = df_news_all.iloc[row_start:row_end]
    # 初始化一個 DataFrame 用來存放結果
    results = {
        "llama_score": [],
        "llama_reason": [],
    }

    # 使用 tqdm 包裝 DataFrame 的列迭代器
    for text in tqdm(df_news_5K["title"], desc="Processing rows"):
        sentiment_data = extract_sentiment_data_safe(text,token)
        try:
            results["llama_score"].append(sentiment_data["llama_score"])
            results["llama_reason"].append(sentiment_data["llama_reason"])
        except Exception as e:
            print("頂層for loop異常")
            print(e)
            results["llama_score"].append(np.nan)
            results["llama_reason"].append(np.nan)

    # 將結果轉換為 DataFrame 並合併到原始 DataFrame
    df_results = pd.DataFrame(results)
    df_news_5K = pd.concat(
        [
            df_news_5K[["title", "text", "url", "date"]].reset_index(drop=True),
            df_results,
        ],
        axis=1,
    )
    df_news_5K.to_parquet(file_name)
    print("5K完成")
    return 

In [5]:
list(range(0, len(df_news_all), len(df_news_all) // 5))


[0, 508, 1016, 1524, 2032, 2540]

In [None]:
import threading
import time

# 定義線程
row_start_list = list(range(0, len(df_news_all), len(df_news_all)//5))
token_list = [
    "hf_wp",
    "hf_czV",
    "hf_xm",
    "hf_DgH",
    "hf_WH",
]
file_name_list = [
    "datas/分析完成的新聞/HF_ashraq_Reuters_headline_V4_tk2_0-500.parquet",
    "datas/分析完成的新聞/HF_ashraq_Reuters_headline_V4_tk3_500-1000.parquet",
    "datas/分析完成的新聞/HF_ashraq_Reuters_headline_V4_tk4_1000-1500.parquet",
    "datas/分析完成的新聞/HF_ashraq_Reuters_headline_V4_tk5_1500-2000.parquet",
    "datas/分析完成的新聞/HF_ashraq_Reuters_headline_V4_tk6_2000-2500.parquet",
]
t_list = []

t1 = threading.Thread(
    target=thread_llama,
    args=(row_start_list[0], row_start_list[1], token_list[0], file_name_list[0]),
)
t_list.append(t1)
t2 = threading.Thread(
    target=thread_llama,
    args=(row_start_list[1], row_start_list[2], token_list[1], file_name_list[1]),
)
t_list.append(t2)
t3 = threading.Thread(
    target=thread_llama,
    args=(row_start_list[2], row_start_list[3], token_list[2], file_name_list[2]),
)
t_list.append(t3)
t4 = threading.Thread(
    target=thread_llama,
    args=(row_start_list[3], row_start_list[4], token_list[3], file_name_list[3]),
)
t_list.append(t4)
t5 = threading.Thread(
    target=thread_llama,
    args=(row_start_list[4], len(df_news_all), token_list[4], file_name_list[4]),
)
t_list.append(t5)
# 開始工作
for t in t_list:
    t.start()

# 調整多程順序
for t in t_list:
    t.join()

Processing rows:   0%|          | 0/508 [00:00<?, ?it/s]

Processing rows:   0%|          | 0/508 [00:00<?, ?it/s]

Processing rows:   0%|          | 0/508 [00:00<?, ?it/s]

Processing rows:   0%|          | 0/508 [00:00<?, ?it/s]

Processing rows:   0%|          | 0/509 [00:00<?, ?it/s]

有幻覺就重來,recursion_count=1
cannot unpack non-iterable NoneType object
回應異常 等待10秒再來一次
回應異常就重來,recursion_count=1
頂層for loop異常
'NoneType' object is not subscriptable
有幻覺就重來,recursion_count=1
有幻覺就重來,recursion_count=1
cannot unpack non-iterable NoneType object
回應異常 等待10秒再來一次
回應異常就重來,recursion_count=1
頂層for loop異常
'NoneType' object is not subscriptable
cannot unpack non-iterable NoneType object
回應異常 等待10秒再來一次
回應異常就重來,recursion_count=1
數值超出範圍
格式錯誤就重來,recursion_count=1
有幻覺就重來,recursion_count=1
cannot unpack non-iterable NoneType object
回應異常 等待10秒再來一次
回應異常就重來,recursion_count=2
cannot unpack non-iterable NoneType object
回應異常 等待10秒再來一次
回應異常就重來,recursion_count=1
頂層for loop異常
'NoneType' object is not subscriptable
頂層for loop異常
'NoneType' object is not subscriptable
有幻覺就重來,recursion_count=1
cannot unpack non-iterable NoneType object
回應異常 等待10秒再來一次
回應異常就重來,recursion_count=1
有幻覺就重來,recursion_count=1
cannot unpack non-iterable NoneType object
回應異常 等待10秒再來一次
回應異常就重來,recursion_count=2
頂層for loop異常
'NoneTyp

对于 llama-3–70b-instruct 模型，您每 5 秒最多可以发出 20 个请求，每分钟最多可以发出 60 个请求，每小时最多可以发出 600 个请求

令牌速率限制为每分钟 40,000 个令牌，每 10 分钟最多可以发出 160,000 个令牌。 llama-3–8b-instruct 模型具有类似的限制，令牌速率限制为每 10 秒 16,000 个令牌，每分钟 160,000 个令牌，每 10 分钟 512,000 个令牌。