In [1]:
import time
import warnings
warnings.simplefilter("ignore")

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
from sentence_transformers import SentenceTransformer
import faiss

from sklearn.neighbors import NearestNeighbors

from newspaper import Article
from sklearn.cluster import KMeans
import numpy as np
#from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [4]:
import re
from html import escape

In [5]:
#import markdown

In [6]:
from collections import defaultdict

# sentence-transformersのモデルを選択

In [7]:
 
model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')

### これより上のセルは事前に読み込んでおく

In [8]:
def news_fetcher(arg_url):
    """Fetch the page at ``arg_url`` and return the text extracted from it.

    Builds an ``Article`` for the URL, downloads and parses it, and returns
    its ``text`` attribute.
    """
    page = Article(arg_url)
    page.download()
    page.parse()
    body_text = page.text
    return body_text

In [9]:
url_list000 = ['https://www.federalreserve.gov/newsevents/pressreleases/monetary20231101a.htm',
            'https://www.federalreserve.gov/newsevents/pressreleases/monetary20231213a.htm',
            'https://www.federalreserve.gov/newsevents/pressreleases/monetary20240131a.htm',
            'https://www.federalreserve.gov/newsevents/pressreleases/monetary20240320a.htm',
            'https://www.federalreserve.gov/newsevents/pressreleases/monetary20240501a.htm',
            'https://www.federalreserve.gov/newsevents/pressreleases/monetary20240612a.htm',
            'https://www.federalreserve.gov/newsevents/pressreleases/monetary20240731a.htm',
            'https://www.federalreserve.gov/newsevents/pressreleases/monetary20240918a.htm',
            'https://www.federalreserve.gov/newsevents/pressreleases/monetary20241107a.htm',
            'https://www.federalreserve.gov/newsevents/pressreleases/monetary20241218a.htm',
            'https://www.federalreserve.gov/newsevents/pressreleases/monetary20250129a.htm',
            'https://www.federalreserve.gov/newsevents/pressreleases/monetary20250319a.htm',
            'https://www.federalreserve.gov/newsevents/pressreleases/monetary20250507a.htm',
            'https://www.federalreserve.gov/newsevents/pressreleases/monetary20250618a.htm'
            ]

#url_list = ['https://www.federalreserve.gov/newsevents/pressreleases/monetary20190130a.htm',
#            'https://www.federalreserve.gov/newsevents/pressreleases/monetary20190320a.htm']


In [10]:
url_list=url_list000[-2:]
url_list

['https://www.federalreserve.gov/newsevents/pressreleases/monetary20250507a.htm',
 'https://www.federalreserve.gov/newsevents/pressreleases/monetary20250618a.htm']

### FOMC当日　14:00

In [11]:
start = time.time()

In [12]:
# Build `notes`: one list of sentences per URL in `url_list`.
notes = []
for url in url_list:
    txt = news_fetcher(url)

    # Everything from the first 'Voting for' onward has its periods blanked so the
    # sentence splitter below does not cut that part into fragments.
    # partition() copes with a statement that has no such marker (the tail is then
    # empty) and keeps the whole tail even if the marker occurs more than once.
    body, marker, roster = txt.partition('Voting for')
    txt = body + (marker + roster).replace('.', ' ')

    # Collapse runs of newlines, then split into sentences. A sentence ends at
    # ". ", at ".\n" or at a bare newline; 'U.S.' becomes 'US' first so that it
    # does not trigger a split.
    txt = re.sub(r'\n+', '\n', txt)
    txt = (
        txt.replace('U.S.', 'US')
        .replace('.\n', '.****')
        .replace('\n', '****')
        .replace('. ', '.****')
        .split('****')
    )
    txt = [s for s in txt if s != '']
    notes.append(txt)

In [13]:
old = notes[0]
new = notes[1]

In [14]:
old

['Although swings in net exports have affected the data, recent indicators suggest that economic activity has continued to expand at a solid pace.',
 'The unemployment rate has stabilized at a low level in recent months, and labor market conditions remain solid.',
 'Inflation remains somewhat elevated.',
 'The Committee seeks to achieve maximum employment and inflation at the rate of 2 percent over the longer run.',
 'Uncertainty about the economic outlook has increased further.',
 'The Committee is attentive to the risks to both sides of its dual mandate and judges that the risks of higher unemployment and higher inflation have risen.',
 'In support of its goals, the Committee decided to maintain the target range for the federal funds rate at 4-1/4 to 4-1/2 percent.',
 'In considering the extent and timing of additional adjustments to the target range for the federal funds rate, the Committee will carefully assess incoming data, the evolving outlook, and the balance of risks.',
 'The 

In [15]:
new

['Although swings in net exports have affected the data, recent indicators suggest that economic activity has continued to expand at a solid pace.',
 'The unemployment rate remains low, and labor market conditions remain solid.',
 'Inflation remains somewhat elevated.',
 'The Committee seeks to achieve maximum employment and inflation at the rate of 2 percent over the longer run.',
 'Uncertainty about the economic outlook has diminished but remains elevated.',
 'The Committee is attentive to the risks to both sides of its dual mandate.',
 'In support of its goals, the Committee decided to maintain the target range for the federal funds rate at 4-1/4 to 4-1/2 percent.',
 'In considering the extent and timing of additional adjustments to the target range for the federal funds rate, the Committee will carefully assess incoming data, the evolving outlook, and the balance of risks.',
 'The Committee will continue reducing its holdings of Treasury securities and agency debt and agency mortga

# sentence-transformersを使用したベクトル化:
sentence-transformersを使用して各文をベクトルに変換します。これにより、文の意味を考慮した表現が得られます。

In [16]:
# Embed the sentences of the previous statement as a float32 array.
vectors = np.array(model.encode(old), dtype='float32')

# Index the old sentences. Asking for K = len(vectors) neighbours means every
# query gets a full ranking of all old sentences.
K = len(vectors)
nn = NearestNeighbors(n_neighbors=K, metric='euclidean')  # or 'cosine'
nn.fit(vectors)

# Embed the sentences of the new statement the same way.
ref_vector = np.array(model.encode(new), dtype='float32')
if ref_vector.ndim == 1:
    # A lone vector is 1-D; turn it into a single-row 2-D query.
    ref_vector = ref_vector.reshape(1, -1)

# Search: rows are new sentences, columns are ranked old sentences
# (D holds the distances, I the matching old-sentence indices).
D, I = nn.kneighbors(ref_vector)

In [17]:
D.shape

(16, 16)

# Faissによる検索（現在は無効化：初回の検索は上のセルの scikit-learn NearestNeighbors で実施）

In [18]:
# 前回の声明文
#vectors = model.encode(old)
#dim = len(vectors[0])

In [19]:
# インデックス化
#index = faiss.IndexFlatL2(dim)
#index.add(vectors)
#K = len(vectors)
#K

In [20]:
#今回の声明文
#ref_vector = model.encode(new)

# 必ずfloat32に変換
#ref_vector = np.array(ref_vector, dtype='float32')


#if ref_vector.ndim == 1:
#    ref_vector = ref_vector.reshape(1, -1)

#len(ref_vector)

In [21]:
#print("ref_vector.shape:", ref_vector.shape)
#print("ref_vector.dtype:", ref_vector.dtype)
#print("index.ntotal:", index.ntotal)
#print("K:", K)

In [22]:
# Faissによる検索
#D, I = index.search(ref_vector,K) # 行が新、列が旧
#D.shape

### 最短近傍の重複がないことを確認

In [23]:
def find_duplicate_pairs(lst):
    """Map every value that occurs more than once in ``lst`` to its positions.

    Returns a ``defaultdict(list)`` whose keys are the repeated values (in
    order of first appearance) and whose values are the ascending lists of
    indices at which each value occurs. Values that occur once are omitted.
    """
    # Record where each value appears.
    positions = defaultdict(list)
    for position, value in enumerate(lst):
        positions[value].append(position)

    # Keep only the values seen at two or more positions.
    duplicate_indexes = defaultdict(list)
    duplicate_indexes.update(
        (value, where) for value, where in positions.items() if len(where) > 1
    )
    return duplicate_indexes

In [24]:

# Nearest old-sentence index for every new sentence.
idx = I[:, 0].tolist()

chk = find_duplicate_pairs(idx)
print(chk)

# k: index of an old sentence claimed by several new sentences
# v: indices of the new sentences claiming it
# Only the claimant with the smallest distance keeps k; the others are blanked to ''.
for k, v in chk.items():
    win = np.argmin(D[v, 0])
    for i in v:
        idx[i] = ''
    idx[v[win]] = k

defaultdict(<class 'list'>, {})


### ペアになれなかった旧声明文の文番号

In [25]:
# Old-sentence indices that survived in idx (blanked entries are '').
chk = [s for s in idx if s != '']
already_paired = set(chk)

# Every old sentence index that no new sentence ended up paired with.
txt_remain = [s for s in range(len(old)) if s not in already_paired]
txt_remain

[]

In [26]:
def group_consecutive_numbers(lst):
    """Split ``lst`` into runs in which each number is its predecessor plus one.

    Example: ``[1, 2, 3, 5, 6, 9]`` -> ``[[1, 2, 3], [5, 6], [9]]``.
    An empty input gives an empty list.
    """
    result = []
    for number in lst:
        # Extend the latest run when this number directly follows its last member,
        # otherwise open a new run.
        if result and number == result[-1][-1] + 1:
            result[-1].append(number)
        else:
            result.append([number])
    return result

In [27]:
len(old)

16

In [28]:
pairs = group_consecutive_numbers(txt_remain)
pairs

[]

In [29]:
if pairs != []:pairs[0][0]

In [30]:
# Key each run of unpaired old sentences by the index just before the run starts,
# i.e. the old sentence that immediately precedes it.
pairs_dic = {run[0] - 1: run for run in pairs}
pairs_dic

{}

In [31]:
# One (new_index, distance, old_index) triple per new sentence, taken from its
# nearest old sentence.
lookup = list(zip(range(D.shape[0]), D[:, 0].tolist(), I[:, 0].tolist()))

# Matches at distance exactly 0.0 are locked first, and the old sentences they
# point to are recorded as used.
# NOTE(review): this is an exact float comparison on a computed distance; confirm
# that identical sentences really come back as 0.0 with the chosen search backend.
locked = [(i, s1, s2) for i, s1, s2 in lookup if s1 == 0.0]
used = [s2 for i, s1, s2 in lookup if s1 == 0.0]

# Then add every remaining match whose old sentence is not used by an exact match.
used_lookup = set(used)
locked = locked + [(i, s1, s2) for i, s1, s2 in lookup if s2 not in used_lookup]
locked = sorted(locked, key=lambda x: x[0])

# chk1 entries are (new_index, [old_index, ...]). The run of unpaired old sentences
# registered in pairs_dic under this old index is appended only when the next locked
# entry is the directly following new sentence. The last entry, or an old index with
# no registered run, keeps just its own old index.
chk1 = []
for pos, (new_i, _dist, old_i) in enumerate(locked):
    next_is_adjacent = pos + 1 < len(locked) and locked[pos + 1][0] == new_i + 1
    merged_run = pairs_dic.get(old_i, []) if next_is_adjacent else []
    chk1.append((new_i, [old_i] + merged_run))
[print(s) for s in chk1]

(0, [0])
(1, [1])
(2, [2])
(3, [3])
(4, [4])
(5, [5])
(6, [6])
(7, [7])
(8, [8])
(9, [9])
(10, [10])
(11, [11])
(12, [12])
(13, [13])
(14, [14])
(15, [15])


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [32]:
def flatten_list(nested_list):
    """Return the items of an arbitrarily nested list as one flat list.

    Only ``list`` instances are descended into; any other item (tuples
    included) is kept as a single element. Order is preserved.
    """
    def _walk(items):
        for item in items:
            if isinstance(item, list):
                # Descend into the sub-list and emit its leaves in place.
                yield from _walk(item)
            else:
                yield item

    return list(_walk(nested_list))

In [33]:
# Indices already present in chk1: new sentences (first element of each entry)
# and old sentences (all elements of each entry's list).
paired_new = {entry[0] for entry in chk1}
paired_old = set(flatten_list([entry[1] for entry in chk1]))

# New sentences (rows of D) that have no entry in chk1 yet.
unused_new = [i for i in range(D.shape[0]) if i not in paired_new]
print(unused_new)
# Old sentences (columns of D) that no chk1 entry refers to.
unused_old = [i for i in range(D.shape[1]) if i not in paired_old]
print(unused_old)

[]
[]


In [34]:
# Second pass: pair the still-unmatched new sentences with the still-unmatched
# old sentences, and make sure every new sentence ends up with an entry in chk1.
if unused_old and unused_new:
    # Previous statement: embed and index only the leftover old sentences.
    old1 = [old[i] for i in unused_old]
    vectors1 = np.array(model.encode(old1), dtype='float32')
    dim = len(vectors1[0])
    index = faiss.IndexFlatL2(dim)
    index.add(vectors1)
    K = len(vectors1)

    # Current statement: embed the leftover new sentences and search.
    new1 = [new[i] for i in unused_new]
    ref_vector1 = np.array(model.encode(new1), dtype='float32')
    D1, I1 = index.search(ref_vector1, K)  # rows: new, columns: old

    # (new_index, distance, old_index), with positions mapped back to original indices.
    new_pairs = [(s0, s1, unused_old[s2]) for s0, s1, s2 in zip(unused_new, D1[:, 0].tolist(), I1[:, 0].tolist())]
    for s0, s1, s2 in new_pairs:
        print(s0, s1, s2)

    # Greedy assignment, closest pair first: an old sentence goes to the first new
    # sentence that claims it; later claimants get '' (no counterpart).
    new_pairs = sorted(new_pairs, key=lambda x: x[1])
    temp = []
    for s in new_pairs:
        if s[-1] not in temp:
            chk1.append((s[0], [s[-1]]))
            temp.append(s[-1])
        else:
            chk1.append((s[0], ['']))
elif unused_new:
    # No old sentence is left to pair with. These new sentences still need a chk1
    # entry, otherwise the list built from chk1 is shorter than `new` and the later
    # positional pairing of new and old text shifts out of alignment.
    chk1 += [(i, ['']) for i in unused_new]

In [35]:

chk1 = sorted(chk1,key=lambda x: x[0])
chk1

[(0, [0]),
 (1, [1]),
 (2, [2]),
 (3, [3]),
 (4, [4]),
 (5, [5]),
 (6, [6]),
 (7, [7]),
 (8, [8]),
 (9, [9]),
 (10, [10]),
 (11, [11]),
 (12, [12]),
 (13, [13]),
 (14, [14]),
 (15, [15])]

In [36]:
chk1

[(0, [0]),
 (1, [1]),
 (2, [2]),
 (3, [3]),
 (4, [4]),
 (5, [5]),
 (6, [6]),
 (7, [7]),
 (8, [8]),
 (9, [9]),
 (10, [10]),
 (11, [11]),
 (12, [12]),
 (13, [13]),
 (14, [14]),
 (15, [15])]

In [37]:
# Build, for each chk1 entry in order, the old text that corresponds to it.
# '' marks "no counterpart in the previous statement".
old1 = []
for _, old_ids in chk1:
    if old_ids[0] == '':
        old1.append('')
    else:
        # Several old sentences may be merged into one cell. Join them with a space:
        # gluing them together directly fuses the last word of one sentence with the
        # first word of the next, which the space-based word comparison then reports
        # as a changed word.
        old1.append(' '.join(old[i] for i in old_ids))
len(old1)

16

In [38]:
old1

['Although swings in net exports have affected the data, recent indicators suggest that economic activity has continued to expand at a solid pace.',
 'The unemployment rate has stabilized at a low level in recent months, and labor market conditions remain solid.',
 'Inflation remains somewhat elevated.',
 'The Committee seeks to achieve maximum employment and inflation at the rate of 2 percent over the longer run.',
 'Uncertainty about the economic outlook has increased further.',
 'The Committee is attentive to the risks to both sides of its dual mandate and judges that the risks of higher unemployment and higher inflation have risen.',
 'In support of its goals, the Committee decided to maintain the target range for the federal funds rate at 4-1/4 to 4-1/2 percent.',
 'In considering the extent and timing of additional adjustments to the target range for the federal funds rate, the Committee will carefully assess incoming data, the evolving outlook, and the balance of risks.',
 'The 

### メモリの解放

In [39]:
#del model, vectors, ref_vector

### 新旧の声明文を文単位で比較し、単語の相違を抽出

In [40]:

# Opening tags used to highlight differing words (each closed with "</span>").
red_ita = '<span style="color:red;font-style:italic;">'
red_del = '<span style="color:red; text-decoration:line-through;">'

def matching(txt1,txt2):
    """Highlight the words that two sentences do not share.

    Both sentences are split on single spaces. A word of ``txt1`` that does
    not occur anywhere in ``txt2`` is wrapped in the ``red_ita`` span; a word
    of ``txt2`` that does not occur in ``txt1`` is wrapped in the ``red_del``
    span. Neighbouring highlighted words are merged into a single span.

    Returns the tuple ``(marked_txt1, marked_txt2)`` of HTML strings.
    """
    words1 = txt1.split(' ')
    words2 = txt2.split(' ')

    # Mark txt1 against the untouched words of txt2 ...
    marked1 = [w if w in words2 else red_ita + w + "</span>" for w in words1]
    # ... then txt2 against the already-marked txt1 (shared words are unmarked there).
    marked2 = [w if w in marked1 else red_del + w + "</span>" for w in words2]

    def _merge_adjacent(tokens, open_tag):
        # "</span> <open_tag>" between two highlighted words collapses to a space.
        return ' '.join(tokens).replace('</span> ' + open_tag, ' ')

    return _merge_adjacent(marked1, red_ita), _merge_adjacent(marked2, red_del)

In [41]:
len(new)

16

In [42]:
len(old)

16

In [43]:
len(old1)

16

In [44]:
len(notes)

2

In [45]:
    
# Build one table row per new sentence: latest text on the left, prior text on the right.
pairs = []
for s1,s2 in zip(new,old1):
    # The sentences are scraped web text, so escape &, < and > before putting them
    # into markup. quote=False: the text only ever lands in element content.
    s1 = escape(s1, quote=False)
    s2 = escape(s2, quote=False)
    if not s2 == '':
        if s1 == s2:
            pairs += ['<tr><td>'+s1+'</td><td>'+'UNCHANGED'+'</td></tr>']
        else:
            # Highlight the words that differ between the two versions.
            s1,s2 = matching(s1,s2)
            pairs += ['<tr><td>'+s1+'</td><td>'+s2+'</td></tr>']
    else:
        # No counterpart in the prior statement: show the whole sentence as new.
        pairs += ['<tr><td>'+red_ita+s1+'</span>'+'</td><td>'+'NEW TEXT'+'</td></tr>'] 

#if not txt_remain == []:
#    for i in txt_remain:
#        s2 = old[i]
#        pairs += ['<tr><td></td><td>'+s2+'</td></tr>']

html = ''.join(pairs)

# The 8-digit date embedded in each press-release URL labels the columns and files.
new_date = re.findall(r'[0-9]{8}',url_list[-1])[0]
old_date = re.findall(r'[0-9]{8}',url_list[-2])[0]

html = '<tr><th>Latest Statement<br>'+new_date+'</th><th>Prior Statement<br>'+old_date+'</th></tr>'+html

html = '''
<head>
    <style>.valign td {vertical-align: top;} table, td, th { border: 2px #2b2b2b solid; } 
    </style>
</head>
    <body>
    <h1 style="font-size: 1.6rem;font-family: Arial, Helvetica, sans-serif;">
    <table class="valign">
        <colgroup>
            <col style="width: 60%;">
            <col style="width: 40%;">
        </colgroup>
    '''+html+'</table></h1></body>'

# Create the output folders if needed so the writes below work on a fresh checkout.
os.makedirs('html', exist_ok=True)
os.makedirs('txt', exist_ok=True)

with open('html/'+new_date+'_statement.html',encoding='utf-8',mode='w') as f:
    f.write(html)

# Plain-text copies of both versions (unescaped), sentences separated by '###'.
with open('txt/'+new_date+'_statement.txt',encoding='utf-8',mode='w') as f:
    f.write('###'.join(new))
with open('txt/'+new_date+'_statement_prior.txt',encoding='utf-8',mode='w') as f:
    f.write('###'.join(old1))

In [46]:
print(time.time()-start)

1.5389950275421143


In [47]:
data = ''
