### 采用预训练模型做词嵌入处理

1. 从数据库导出数据到CSV文件

In [None]:
import pandas as pd
from sqlalchemy import create_engine


from urllib.parse import quote

# Database connection parameters (fill these in before running the cell).
server = ''  # server name or IP address
database = ''  # database name
username = ''  # login user
password = ''  # login password
driver = 'ODBC Driver 17 for SQL Server'

# Percent-encode the credentials and the driver name so that characters such
# as '@', ':', '/' or spaces cannot break the structure of the connection URL.
connection_string = (
    f'mssql+pyodbc://{quote(username, safe="")}:{quote(password, safe="")}'
    f'@{server}/{database}?driver={quote(driver, safe="")}'
)
engin = create_engine(connection_string)

# Export the selected lead rows to data.csv; the engine's connection pool is
# released even when the query or the CSV write fails.
try:
    RowData = pd.read_sql('''SELECT LeadGUID, LeadNo, FirstName, LastName, CompanyName, MobilePhone, 
                      TellPhone, Email, TypedCountry, Description, ConvertToLead FROM LeadFromSHNWebSite 
                      WHERE SysCountryID <> 7 AND RequestDate IS NOT NULL AND Description IS NOT NULL AND CreateUser = \'System\'''', engin)
    RowData.to_csv('data.csv', index=False)
finally:
    engin.dispose()

2. 读取CSV文件，并通过tokenizer进行分词统计，以找到最大token长度

In [None]:
import nltk
import pandas as pd

# Fetch only the tokenizer data that word_tokenize needs instead of opening
# the interactive downloader. 'punkt_tab' is what newer NLTK releases look
# for; on older releases that request simply reports failure and 'punkt'
# is used.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)

df = pd.read_csv('data.csv')

# Skip missing descriptions and coerce the rest to str so that a value pandas
# parsed as a number cannot crash the tokenizer.
descriptions = df['Description'].dropna().astype(str)

# Longest description measured in NLTK word tokens; 0 when there are no rows.
maxLength = max(
    (len(nltk.word_tokenize(text)) for text in descriptions),
    default=0,
)
print(maxLength)

3. 采用SentenceTransformer对文本进行嵌入，示例如下

In [None]:
from sentence_transformers import SentenceTransformer

# Two sample sentences to embed.
sentences = [
    "This is the first sentence",
    "This is the second sentence",
]

# Load the pretrained MiniLM sentence-embedding model and encode the samples.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(sentences)

print(embeddings)

注意：该模型最大输入长度为256个token，超出部分会被截断；短文本由模型自动做Padding，长文本则需要采用滑窗（分块）算法处理，以准确获得384维向量。

4. 采用BGE-M3算法做词嵌入，示例如下

In [None]:
from FlagEmbedding import BGEM3FlagModel

# Load BGE-M3; use_fp16=True speeds up model inference.
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)

# Queries (sentences_1) and the passages they are compared against (sentences_2).
sentences_1 = [
    "What is BGE M3?",
    "Defination of BM25",
]
sentences_2 = [
    "BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.",
    "BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document",
]

# Encode both groups and keep only the dense vectors from each result.
encoded_1 = model.encode(sentences_1, batch_size=12, max_length=8192)
encoded_2 = model.encode(sentences_2)
embeddings_1 = encoded_1['dense_vecs']
embeddings_2 = encoded_2['dense_vecs']

# Pairwise inner products between the two groups of dense vectors.
similarity = embeddings_1 @ embeddings_2.T
print(similarity)

5. 数据基本清理

In [None]:
import re

def clean_url(text):
    """Return ``text`` with each URL replaced by the token ``website``.

    A URL is any run of non-whitespace characters that starts with
    ``http://``, ``https://`` or ``www.``.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'website', text)

def remove_emails(text):
    """Return ``text`` with each e-mail address replaced by the token ``email``.

    An address is matched as ``local@domain.tld`` where the TLD consists of
    at least two ASCII letters.
    """
    # The TLD class is [A-Za-z]; writing it as [A-Z|a-z] would also accept a
    # literal '|' because '|' is not an alternation inside a character class.
    email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
    return email_pattern.sub('email', text)

def remove_ids(text):
    """Return ``text`` with every ``[...]`` bracketed segment deleted.

    Matching is non-greedy, so adjacent bracket pairs are removed one by one,
    and a pair split across a line break is left untouched.
    """
    return re.sub(r'\[.*?\]', '', text)

def remove_non_ascii(text):
    """Return ``text`` with every character outside the ASCII range removed."""
    return re.sub(r'[^\x00-\x7F]+', '', text)

def clean(text):
    """Run the full cleaning pipeline over ``text`` and return the result.

    The steps are applied in this order: URL replacement, e-mail
    replacement, removal of bracketed segments, removal of non-ASCII
    characters.
    """
    pipeline = (clean_url, remove_emails, remove_ids, remove_non_ascii)
    for step in pipeline:
        text = step(text)
    return text