In [5]:
"""
This is a simple application for sentence embeddings: semantic search

We have a corpus with various sentences. Then, for a given query sentence,
we want to find the most similar sentence in this corpus.

This script outputs for various queries the top 5 most similar sentences in the corpus.
"""
from sentence_transformers import SentenceTransformer, util
import torch

embedder = SentenceTransformer('paraphrase-distilroberta-base-v1')

# Corpus with example sentences
corpus = ['A man is eating food.',
          'A man is eating a piece of bread.',
          'The girl is carrying a baby.',
          'A man is riding a horse.',
          'A woman is playing violin.',
          'Two men pushed carts through the woods.',
          'A man is riding a white horse on an enclosed ground.',
          'A monkey is playing drums.',
          'A cheetah is running behind its prey.'
          ]
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

# Query sentences:
queries = ['A man is eating pasta.', 'Someone in a gorilla costume is playing a set of drums.', 'A cheetah chases prey on across a field.']
queries = ['A man is eating pasta.']

# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
top_k = 5
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)
    print(util.pytorch_cos_sim(query_embedding, corpus_embeddings))
    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    cos_scores = cos_scores.cpu()

    #We use torch.topk to find the highest 5 scores
    top_results = torch.topk(cos_scores, k=top_k)

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:")

    for score, idx in zip(top_results[0], top_results[1]):
        print(corpus[idx], "(Score: %.4f)" % (score))

tensor([[0.7096, 0.6074, 0.0180, 0.3360, 0.2378, 0.0270, 0.3069, 0.1378, 0.1920]])




Query: A man is eating pasta.

Top 5 most similar sentences in corpus:
A man is eating food. (Score: 0.7096)
A man is eating a piece of bread. (Score: 0.6074)
A man is riding a horse. (Score: 0.3360)
A man is riding a white horse on an enclosed ground. (Score: 0.3069)
A woman is playing violin. (Score: 0.2378)


In [1]:
import requests
from bs4 import BeautifulSoup
import re

url_list = ['https://mojim.com/twh105703.htm',
'https://mojim.com/twh100163.htm',
'https://mojim.com/twh100090.htm',
'https://mojim.com/twh100095.htm',
'https://mojim.com/twh104613.htm',
'https://mojim.com/twh109122.htm',
'https://mojim.com/twh116318.htm',
'https://mojim.com/twh100663.htm',
'https://mojim.com/twh100098.htm',
'https://mojim.com/twh100312.htm',
'https://mojim.com/twh104821.htm',
'https://mojim.com/twh105599.htm',
'https://mojim.com/twh104279.htm',
'https://mojim.com/twh100166.htm',
'https://mojim.com/twh105079.htm',
'https://mojim.com/twh102453.htm',
'https://mojim.com/twh108440.htm',
'https://mojim.com/twh118727.htm',
'https://mojim.com/twh100158.htm',
'https://mojim.com/twh129810.htm',
'https://mojim.com/twh122960.htm',
'https://mojim.com/twh100115.htm',
'https://mojim.com/twh222189.htm',
'https://mojim.com/twh100287.htm',
'https://mojim.com/twh119630.htm',
'https://mojim.com/twh218008.htm',
'https://mojim.com/twh105272.htm',
'https://mojim.com/twh100593.htm',
'https://mojim.com/twh100173.htm',
'https://mojim.com/twh100232.htm',
'https://mojim.com/twh202390.htm',
'https://mojim.com/twh104870.htm',
'https://mojim.com/twh100187.htm',
'https://mojim.com/twh137200.htm',
'https://mojim.com/twh110993.htm',
'https://mojim.com/twh213714.htm',
'https://mojim.com/twh100954.htm',
'https://mojim.com/twh109805.htm',
'https://mojim.com/twh100074.htm',
'https://mojim.com/twh100102.htm',
'https://mojim.com/twh109601.htm',
'https://mojim.com/twh104942.htm',
'https://mojim.com/twh136088.htm',
'https://mojim.com/twh100118.htm',
'https://mojim.com/twh135268.htm',
'https://mojim.com/twh100078.htm',
'https://mojim.com/twh100340.htm',
'https://mojim.com/twh100053.htm',
'https://mojim.com/twh100122.htm',
'https://mojim.com/twh169207.htm']

def writeFile(i):
        j = i.get('href')
        sub_resp = requests.get("https://mojim.com" + j)
        sub_soup = BeautifulSoup(sub_resp.text, 'html.parser')
        sub_name = sub_soup.find('dd','fsZx3')
        
        s = str(sub_name)    #轉換成字符串
        s_replace = s.replace('<br/>',"\n")    #用換行符替換'<br/>'
        while True:                      #用換行符替換所有的'<br/>'
            index_begin = s_replace.find("<")
            index_end = s_replace.find(">",index_begin + 1)
            if index_begin == -1:
                break
            s_replace = s_replace.replace(s_replace[index_begin:index_end+1],"")

        path = "女歌手/" + i.get('title').split(' 歌')[0].split('/')[0] + ".txt"
        f = open(path, 'w')
        f.write(s_replace)

for url in url_list:

    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, 'html.parser')
    block = soup.find_all('dd', 'hb2')
    block2 = soup.find_all('dd','hb3')

    try:
        for i in block: 
            song_left = i.find_all('span','hc3')
            for j in song_left:
                sl = j.find_all('a')
                for k in sl:
                    writeFile(k)
                        
            song_right = i.find_all('span','hc4')
            for j in song_right:
                sr = j.find_all('a')
                for k in sr:
                    writeFile(k)
                
        for i in block2: 
            song_left = i.find_all('span','hc3')
            for j in song_left:
                sl = j.find_all('a')
                for k in sl:
                    writeFile(k)
                        
            song_right = i.find_all('span','hc4')
            for j in song_right:
                sr = j.find_all('a')
                for k in sr:
                    writeFile(k)
    except:
        print('skip!')

skip!


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-1-ed06b4626a10>", line 77, in <module>
    resp = requests.get(url)
  File "/usr/local/lib/python3.7/site-packages/requests/api.py", line 75, in get
    return request('get', url, params=params, **kwargs)
  File "/usr/local/lib/python3.7/site-packages/requests/api.py", line 60, in request
    return session.request(method=method, url=url, **kwargs)
  File "/usr/local/lib/python3.7/site-packages/requests/sessions.py", line 533, in request
    resp = self.send(prep, **send_kwargs)
  File "/usr/local/lib/python3.7/site-packages/requests/sessions.py", line 646, in send
    r = adapter.send(request, **kwargs)
  File "/usr/local/lib/python3.7/site-packages/requests/adapters.py", line 449, in send
    timeout=timeout
  File "/usr/local/lib/python3.7/site-packages/urllib3/connecti

KeyboardInterrupt: 