# Method 3: Hashing Vectorizer Based Similarity

In [5]:
import pandas as pd
from google.colab import drive

# Attach Google Drive to the Colab runtime so files under it can be read.
drive.mount('/content/drive')

# Read the training CSV from Drive into a DataFrame.
train_data_path = '/content/drive/My Drive/data-train.csv'
train_df = pd.read_csv(train_data_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Sampling 10,000 essays and finding the top 10 pairs of similar essays using HashingVectorizer

In [9]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import time

# Start the wall-clock timer; the elapsed time is reported later in this cell.
start_time = time.time()

# Keep only usable essays:
#   * rows with a missing 'full_text' are dropped explicitly -- negating
#     `str.contains(..., na=False)` alone would keep them, and
#     HashingVectorizer raises on NaN documents;
#   * essays whose text contains the literal "PROPER_NAME" are removed;
#   * exact duplicate texts are collapsed to a single row.
filtered_df = (
    train_df
    .dropna(subset=['full_text'])
    .loc[lambda df: ~df['full_text'].str.contains("PROPER_NAME", na=False)]
    .drop_duplicates(subset='full_text')
)

# Count the number of essays after filtering
num_essays_after_filtering = len(filtered_df)

# Output the results
print(f"Number of essays after filtering: {num_essays_after_filtering}")

# Sample a subset of the essays (at most 10,000). The fixed random_state makes
# the sample reproducible; resetting the index makes row positions in the
# similarity matrix line up with `sampled_df.loc[...]` labels.
sample_size = min(10000, num_essays_after_filtering)
sampled_df = filtered_df.sample(n=sample_size, random_state=42).reset_index(drop=True)

# Extract the 'full_text' column from the sampled dataset
essays = sampled_df['full_text']

# Create vectors using HashingVectorizer (10,000 hashed features, no sign
# alternation).
vectorizer = HashingVectorizer(n_features=10000, alternate_sign=False)
essay_vectors = vectorizer.fit_transform(essays)

# Calculate cosine similarity for every pair of sampled essays. The result has
# sample_size x sample_size entries, so memory grows quadratically with the
# sample size.
print("Calculating pairwise cosine similarity for the sampled essays using Hashing Vectorizer...")
similarity_matrix = cosine_similarity(essay_vectors)

# Find the top 10 similar pairs (excluding self-pairs and exact duplicates).
#
# Instead of visiting every (i, j) cell in Python and sorting all pairs, each
# row's upper-triangle slice is scanned with numpy and only that row's best
# `num_top_pairs` scores (plus any ties at the cutoff) are kept as candidates.
# Any pair in the global top-k is necessarily within its own row's top-k, so
# sorting just the candidates yields the same result as sorting everything.
num_top_pairs = 10
candidate_pairs = []

for i in range(similarity_matrix.shape[0] - 1):
    # Only columns j > i: skips the diagonal (self-pairs) and mirrored pairs.
    row_scores = similarity_matrix[i, i + 1:]

    # Exclude exact duplicates (score of 1.0); NaN scores also fail this test.
    keep = np.flatnonzero(row_scores < 1.0)

    if keep.size > num_top_pairs:
        kept_scores = row_scores[keep]
        # k-th largest score in this row; `>=` keeps ties at the cutoff so the
        # final ordering below decides between them deterministically.
        cutoff = np.partition(kept_scores, -num_top_pairs)[-num_top_pairs]
        keep = keep[kept_scores >= cutoff]

    candidate_pairs.extend(
        (i, i + 1 + int(offset), float(row_scores[offset])) for offset in keep
    )

# Sort by similarity score in descending order; ties are broken by (i, j),
# which matches a stable sort over pairs generated in row-major order.
top_similar_pairs = sorted(
    candidate_pairs, key=lambda pair: (-pair[2], pair[0], pair[1])
)[:num_top_pairs]

# Stop measuring time
end_time = time.time()
execution_time = end_time - start_time

# Print the execution time
print(f"Execution time: {execution_time:.2f} seconds")

# Print the top similar pairs with their essay IDs and similarity scores.
# The heading reports the actual number of sampled essays, which can be smaller
# than 10,000 when fewer essays survive filtering.
print(f"\nTop Similar Pairs of Essays among {len(sampled_df)} samples (Hashing Vectorizer):")
for rank, (i, j, similarity) in enumerate(top_similar_pairs, start=1):
    # i and j are row positions in the similarity matrix; sampled_df has a
    # fresh 0..n-1 index, so they are also valid .loc labels.
    essay_id_1, essay_id_2 = sampled_df.loc[i, 'essay_id'], sampled_df.loc[j, 'essay_id']
    print(f"Pair {rank}: Essay IDs {essay_id_1} and {essay_id_2} with Similarity Score: {similarity:.4f}")


Number of essays after filtering: 17028
Calculating pairwise cosine similarity for the sampled essays using Hashing Vectorizer...
Execution time: 175.95 seconds

Top Similar Pairs of Essays among 10000 samples (Hashing Vectorizer):
Pair 1: Essay IDs 29aa983 and 6d25307 with Similarity Score: 0.9547
Pair 2: Essay IDs 84a1b1a and 287ed5e with Similarity Score: 0.9540
Pair 3: Essay IDs 77b1295 and 84a1b1a with Similarity Score: 0.9533
Pair 4: Essay IDs 84a1b1a and 66ee32e with Similarity Score: 0.9523
Pair 5: Essay IDs 5bcf9b0 and 84a1b1a with Similarity Score: 0.9510
Pair 6: Essay IDs ef95422 and ea57a9c with Similarity Score: 0.9507
Pair 7: Essay IDs e026924 and ebe2ce0 with Similarity Score: 0.9499
Pair 8: Essay IDs 99e37ba and ebe2ce0 with Similarity Score: 0.9498
Pair 9: Essay IDs 77b1295 and 66ee32e with Similarity Score: 0.9496
Pair 10: Essay IDs 77b1295 and ebe2ce0 with Similarity Score: 0.9488


In [7]:
# Print each top pair again, this time with a short preview of both essays.
# These pairs come from the HashingVectorizer similarity computed above, so the
# heading names that method (it previously said "TF-IDF").
preview_length = 150  # number of leading characters shown per essay

print("\nTop Similar Pairs of Essays (Hashing Vectorizer Cosine Similarity):")
for rank, (i, j, similarity) in enumerate(top_similar_pairs, start=1):
    essay_id_1, essay_id_2 = sampled_df.loc[i, 'essay_id'], sampled_df.loc[j, 'essay_id']
    preview_1 = sampled_df.loc[i, 'full_text'][:preview_length]
    preview_2 = sampled_df.loc[j, 'full_text'][:preview_length]
    print(f"Pair {rank}: Essay IDs {essay_id_1} and {essay_id_2} with Similarity Score: {similarity:.4f}")
    print(f"Essay {essay_id_1} (Preview): {preview_1}...")
    print(f"Essay {essay_id_2} (Preview): {preview_2}...\n")


Top Similar Pairs of Essays (TF-IDF Cosine Similarity):
Pair 1: Essay IDs 29aa983 and 6d25307 with Similarity Score: 0.9547
Essay 29aa983 (Preview): A new hom

whould you send someone to explore venus with the technology that we haave

right now? the author of the article

the challege of exploring...
Essay 6d25307 (Preview): Benefits of Researching a New planet

Whould you send someone to explore venus with even if we don't have the technology necessary?The author of the a...

Pair 2: Essay IDs 84a1b1a and 287ed5e with Similarity Score: 0.9540
Essay 84a1b1a (Preview): Dear state senator,

Do you think that we should keep the Electoral College? We should keep the electoral college because the founding fathers establi...
Essay 287ed5e (Preview): I think the Electoral college is a good way to vote for the president or vice president because i think without the Electoral college, we wouldn't hav...

Pair 3: Essay IDs 77b1295 and 84a1b1a with Similarity Score: 0.9533
Essay 77b1295 (Previe