In [None]:
import pandas as pd


# Path of the raw document export to be cleaned.
csv_file_path = 'documents.csv'

# Read with the pure-Python parser: comma-separated fields, double-quote
# quoting, and backslash as the escape character.
df = pd.read_csv(
    csv_file_path,
    sep=',',
    quotechar='"',
    escapechar='\\',
    encoding='utf-8',
    engine='python',
)


# Translation table that deletes ASCII control characters (0x00-0x1F, 0x7F).
# Tab, newline and carriage return are kept here; strip() removes them when
# they sit at either end of the text.
_CONTROL_CHAR_TABLE = {
    code: None
    for code in list(range(32)) + [127]
    if chr(code) not in '\t\n\r'
}


def clean_text(text):
    """Return a cleaned version of a single text cell.

    The previous implementation called ``text.replace('', '')``, which replaces
    the empty string with itself and therefore changes nothing. The literal
    presumably held a non-printable character that was lost (TODO confirm
    against the raw data). Control characters are now removed explicitly.

    Parameters
    ----------
    text : object
        Cell value. Missing values (``None``/``NaN``) are accepted, and
        non-string values are converted with ``str()``.

    Returns
    -------
    str
        ``""`` for missing values; otherwise the text with ASCII control
        characters removed and leading/trailing whitespace stripped.
    """
    if pd.isna(text):
        return ""
    if not isinstance(text, str):
        # Numeric-looking cells may be parsed as numbers; clean them as text.
        text = str(text)
    return text.translate(_CONTROL_CHAR_TABLE).strip()


# Run the same cleaning routine over both free-text columns.
for text_column in ('Title', 'Body'):
    df[text_column] = df[text_column].apply(clean_text)

# Quick look at the first rows after cleaning.
print(df.head())

# Write the cleaned frame to disk without the row index.
output_cleaned_csv = 'cleaned_documents.csv'
df.to_csv(output_cleaned_csv, encoding='utf-8', index=False)

print(f"Cleaned data saved to: {output_cleaned_csv}")


   New_ID  Old_ID Train_Test  \
0       1    5544      TRAIN   
1       2    5545      TRAIN   
2       3    5546      TRAIN   
3       4    5547      TRAIN   
4       5    5548      TRAIN   

                                              Title  \
0                                BAHIA COCOA REVIEW   
1         STANDARD OIL <SRD> TO FORM FINANCIAL UNIT   
2        TEXAS COMMERCE BANCSHARES <TCB> FILES PLAN   
3      TALKING POINT/BANKAMERICA <BAC> EQUITY OFFER   
4  NATIONAL AVERAGE PRICES FOR FARMER-OWNED RESERVE   

                                                Body  
0  Showers continued throughout the week in the B...  
1  Standard Oil Co and BP North America Inc said ...  
2  Texas Commerce Bancshares Inc's Texas Commerce...  
3  BankAmerica Corp is not under pressure to act ...  
4  The U.S. Agriculture Department reported the f...  
Cleaned data saved to: cleaned_documents.csv


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer


# Load the cleaned documents from disk with pandas' default CSV settings.
csv_file_path = 'cleaned_documents.csv'
df = pd.read_csv(filepath_or_buffer=csv_file_path)


def generate_shingles(text, k=3):
    """Split ``text`` into overlapping word-level shingles of ``k`` words.

    Parameters
    ----------
    text : str or missing value
        Input text; a missing value (as judged by ``pd.isna``) yields ``[]``.
    k : int, default 3
        Number of consecutive words per shingle.

    Returns
    -------
    list of str
        Each shingle is ``k`` consecutive words joined by one space. The list
        is empty when the text has fewer than ``k`` words.
    """
    if pd.isna(text):
        return []

    # str.split() without arguments already discards empty/whitespace tokens.
    tokens = text.split()

    # Slide a window of k words across the token list.
    window_count = len(tokens) - k + 1
    result = []
    for start in range(window_count):
        result.append(' '.join(tokens[start:start + k]))
    return result

# One list of word shingles per document body.
df['Shingles'] = df['Body'].apply(generate_shingles)

# Serialise each shingle list with a newline delimiter. Shingles are built by
# generate_shingles from whitespace-split words, so they cannot contain '\n'.
# Joining with a plain space (the previous behaviour) let the whitespace
# tokenizer split every shingle back into single words, so the resulting
# matrix was word-level rather than shingle-level.
SHINGLE_DELIMITER = '\n'
df['Shingle_Text'] = df['Shingles'].apply(SHINGLE_DELIMITER.join)


def _split_shingles(shingle_text):
    """Recover the individual shingles from a delimiter-joined string."""
    return [shingle for shingle in shingle_text.split(SHINGLE_DELIMITER) if shingle]


# binary=True records presence/absence rather than counts. token_pattern=None
# because a custom tokenizer is supplied and the pattern would be unused.
# CountVectorizer still lowercases the text before tokenizing (its default).
vectorizer = CountVectorizer(tokenizer=_split_shingles, token_pattern=None, binary=True)
shingle_matrix = vectorizer.fit_transform(df['Shingle_Text'])

# Transpose so rows are shingles and columns are documents (labelled by New_ID).
# NOTE(review): toarray() densifies the sparse matrix; with true 3-shingles the
# vocabulary is much larger than with single words, so memory use may grow
# substantially on big corpora.
shingle_matrix_df = pd.DataFrame(
    shingle_matrix.T.toarray(),
    index=vectorizer.get_feature_names_out(),
    columns=df['New_ID']
)


output_csv_path = 'shingle_binary_matrix.csv'
shingle_matrix_df.to_csv(output_csv_path)


print(f"3-Shingle binary matrix has been saved to: {output_csv_path}")




3-Shingle binary matrix has been saved to: shingle_binary_matrix.csv


In [11]:
import pandas as pd
import numpy as np


# Load the shingle x document binary matrix; the first CSV column holds the
# shingle labels and becomes the index.
shingle_matrix_path = 'shingle_binary_matrix.csv'
shingle_matrix_df = pd.read_csv(shingle_matrix_path, index_col=0)


binary_matrix = shingle_matrix_df.to_numpy()


H = 100  # number of hash functions, i.e. rows of the signature matrix
N = binary_matrix.shape[1]  # number of documents (columns)


# Signatures start at +inf so that any real hash value replaces them.
minhash_matrix = np.full((H, N), np.inf)

np.random.seed(42)
# Integer bounds and an explicit 64-bit dtype: the default integer dtype of
# randint is platform dependent (32-bit on some platforms), where the product
# a * x computed in hash_func can silently wrap around.
a = np.random.randint(1, 1_000_000, H, dtype=np.int64)
b = np.random.randint(0, 1_000_000, H, dtype=np.int64)
p = 2**31 - 1  # modulus used by hash_func
num_shingles = binary_matrix.shape[0]


def hash_func(x, a, b, p, num_shingles):
    """Compute ``((a * x + b) mod p) mod num_shingles``.

    Parameters
    ----------
    x : int
        Value to hash.
    a, b : int or array-like
        Multiplier and offset; arrays give one result per coefficient via
        element-wise arithmetic.
    p : int
        First modulus.
    num_shingles : int
        Second modulus; the result lies in ``[0, num_shingles)`` for
        non-negative inputs.

    Returns
    -------
    int or numpy.ndarray
        Hash value(s), with the broadcast shape of ``a`` and ``b``.
    """
    linear = a * x + b
    reduced = linear % p
    return reduced % num_shingles


# Column-vector views of the coefficients so that one call to hash_func
# returns an (H, 1) array: one hash value per hash function.
coeff_a = a[:, None]
coeff_b = b[:, None]

# Visit every shingle row; for the documents that contain the shingle, keep
# the element-wise minimum of the stored signature and the row's hash values.
for row_id, shingle_vector in enumerate(binary_matrix):
    present = shingle_vector == 1
    hash_values = hash_func(row_id, coeff_a, coeff_b, p, num_shingles)
    minhash_matrix[:, present] = np.minimum(minhash_matrix[:, present], hash_values)


# Write the signature matrix to disk without the row index.
output_minhash_path = 'minhash_signature_matrix.csv'
minhash_matrix_df = pd.DataFrame(minhash_matrix)
minhash_matrix_df.to_csv(output_minhash_path, index=False)


print(f"MinHash signature matrix has been saved to: {output_minhash_path}")


MinHash signature matrix has been saved to: minhash_signature_matrix.csv


In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict


# Load the signature matrix: one row per hash function, one column per document.
minhash_matrix_path = 'minhash_signature_matrix.csv'
minhash_matrix = pd.read_csv(minhash_matrix_path).to_numpy()


H, N = minhash_matrix.shape
b = 20  # number of bands
r = H // b  # rows per band

# The bands must tile the signature exactly.
# (Message text means: "H must be divisible by b".)
assert H % b == 0, "H 要被b整除"


candidate_pairs = set()


for band in range(b):
    start_row = band * r
    end_row = start_row + r
    band_matrix = minhash_matrix[start_row:end_row, :]

    # A non-finite entry means the signature was never updated from its
    # initial +inf value (the document had no shingles). Such columns are all
    # identical, so bucketing them would pair every such document with every
    # other one in every band.
    has_signature = np.isfinite(band_matrix).all(axis=0)

    # Bucket documents by their exact band signature.
    buckets = defaultdict(list)
    for doc_id in range(N):
        if not has_signature[doc_id]:
            continue
        band_signature = tuple(band_matrix[:, doc_id])
        buckets[band_signature].append(doc_id)

    # Documents sharing a bucket are candidates. doc_ids were appended in
    # increasing order, so every pair is stored as (smaller, larger).
    for bucket in buckets.values():
        candidate_pairs.update(
            (bucket[i], bucket[j])
            for i in range(len(bucket))
            for j in range(i + 1, len(bucket))
        )


# Sort so the saved file has a stable row order. The ids are 0-based column
# positions in the signature matrix.
candidate_pairs_df = pd.DataFrame(sorted(candidate_pairs), columns=["Document1", "Document2"])


candidate_pairs_path = 'candidate_pairs.csv'
candidate_pairs_df.to_csv(candidate_pairs_path, index=False)

print(f"Candidate pairs have been saved to: {candidate_pairs_path}")


Candidate pairs have been saved to: candidate_pairs.csv
