<a href="https://colab.research.google.com/github/sk-226/comparison-of-text-similarity/blob/main/comparsion_of_text_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GDSC Summer Hackathon Team-2

In [29]:
!pip install numpy scikit-learn nltk transformers torch sentence-transformers janome



In [30]:
import time
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
import torch
from transformers import BertTokenizer, BertModel

## 英語

In [31]:
# nltkの初期設定
nltk.download('punkt')  # トークン化用データセットをダウンロード
nltk.download('stopwords')  # 停止語のデータセットをダウンロード
stop_words = set(stopwords.words('english'))  # 英語の停止語を取得

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [32]:
# 題名の正解と回答を変数に格納
correct_title = "A beautiful sunset over the mountains"
proposed_title = "A gorgeous sunset in the hills"

In [33]:
# 1. Cosine Similarity
def cosine_similarity_example(correct, proposed):
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([correct, proposed])
    cosine_sim = cosine_similarity(vectors[0:1], vectors[1:2])
    return cosine_sim[0][0]

# 2. Jaccard Similarity
def jaccard_similarity(correct, proposed):
    correct_tokens = set(word_tokenize(correct.lower()))
    proposed_tokens = set(word_tokenize(proposed.lower()))

    correct_tokens = correct_tokens.difference(stop_words)
    proposed_tokens = proposed_tokens.difference(stop_words)

    intersection = correct_tokens.intersection(proposed_tokens)
    union = correct_tokens.union(proposed_tokens)

    return len(intersection) / len(union)

# 3. Word Embedding (using Sentence-BERT)
def word_embedding_similarity(correct, proposed):
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    correct_emb = model.encode(correct, convert_to_tensor=True)
    proposed_emb = model.encode(proposed, convert_to_tensor=True)

    cosine_sim = torch.nn.functional.cosine_similarity(correct_emb, proposed_emb, dim=0)
    return cosine_sim.item()

# 4. BERT (using HuggingFace transformers)
def bert_similarity(correct, proposed):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    inputs_correct = tokenizer(correct, return_tensors='pt', truncation=True, padding=True)
    inputs_proposed = tokenizer(proposed, return_tensors='pt', truncation=True, padding=True)

    with torch.no_grad():
        correct_emb = model(**inputs_correct).last_hidden_state.mean(dim=1)
        proposed_emb = model(**inputs_proposed).last_hidden_state.mean(dim=1)

    cosine_sim = torch.nn.functional.cosine_similarity(correct_emb, proposed_emb)
    return cosine_sim.item()


def measure_time(func, *args):
    start_time = time.time()
    result = func(*args)
    end_time = time.time()
    elapsed_time = end_time - start_time
    return result, elapsed_time

In [34]:
# 実行
cosine_sim, cosine_time = measure_time(cosine_similarity_example, correct_title, proposed_title)
jaccard_sim, jaccard_time = measure_time(jaccard_similarity, correct_title, proposed_title)
word_emb_sim, word_emb_time = measure_time(word_embedding_similarity, correct_title, proposed_title)
bert_sim, bert_time = measure_time(bert_similarity, correct_title, proposed_title)

In [35]:
# 結果を出力
print(f"Cosine Similarity: {cosine_sim:.4f} (Time: {cosine_time:.4f} seconds)")
print(f"Jaccard Similarity: {jaccard_sim:.4f} (Time: {jaccard_time:.4f} seconds)")
print(f"Word Embedding Similarity: {word_emb_sim:.4f} (Time: {word_emb_time:.4f} seconds)")
print(f"BERT Similarity: {bert_sim:.4f} (Time: {bert_time:.4f} seconds)")

Cosine Similarity: 0.2523 (Time: 0.0125 seconds)
Jaccard Similarity: 0.2000 (Time: 0.0009 seconds)
Word Embedding Similarity: 0.9188 (Time: 1.1134 seconds)
BERT Similarity: 0.9445 (Time: 1.7191 seconds)


## 日本語

In [36]:
import time
import numpy as np
from janome.tokenizer import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import torch
from transformers import BertTokenizer, BertModel

In [37]:
# Janomeの形態素解析器を初期化
tokenizer = Tokenizer()

In [38]:
# 日本語テキストの正解と回答を変数に格納
correct_title = "美しい夕日が山々の上に広がる"
proposed_title = "壮大な夕焼けが丘に映る"

In [39]:
# 日本語テキストを形態素解析して単語に分割
def tokenize(text):
    return " ".join([token.surface for token in tokenizer.tokenize(text)])

# 1. Cosine Similarity
def cosine_similarity_example(correct, proposed):
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([correct, proposed])
    cosine_sim = cosine_similarity(vectors[0:1], vectors[1:2])
    return cosine_sim[0][0]

# 2. Jaccard Similarity
def jaccard_similarity(correct, proposed):
    correct_tokens = set(tokenize(correct).split())
    proposed_tokens = set(tokenize(proposed).split())

    intersection = correct_tokens.intersection(proposed_tokens)
    union = correct_tokens.union(proposed_tokens)

    return len(intersection) / len(union)

# 3. Word Embedding (using Sentence-BERT)
def word_embedding_similarity(correct, proposed):
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    correct_emb = model.encode(correct, convert_to_tensor=True)
    proposed_emb = model.encode(proposed, convert_to_tensor=True)

    cosine_sim = torch.nn.functional.cosine_similarity(correct_emb, proposed_emb, dim=0)
    return cosine_sim.item()

# 4. BERT (using HuggingFace transformers, 日本語BERTモデルを使用)
def bert_similarity(correct, proposed):
    tokenizer = BertTokenizer.from_pretrained('cl-tohoku/bert-base-japanese')
    model = BertModel.from_pretrained('cl-tohoku/bert-base-japanese')

    inputs_correct = tokenizer(correct, return_tensors='pt', truncation=True, padding=True)
    inputs_proposed = tokenizer(proposed, return_tensors='pt', truncation=True, padding=True)

    with torch.no_grad():
        correct_emb = model(**inputs_correct).last_hidden_state.mean(dim=1)
        proposed_emb = model(**inputs_proposed).last_hidden_state.mean(dim=1)

    cosine_sim = torch.nn.functional.cosine_similarity(correct_emb, proposed_emb)
    return cosine_sim.item()

# 時間を計測する部分
def measure_time(func, *args):
    start_time = time.time()
    result = func(*args)
    end_time = time.time()
    elapsed_time = end_time - start_time
    return result, elapsed_time

In [40]:
# 日本語テキストを形態素解析してトークン化
correct_title_tokenized = tokenize(correct_title)
proposed_title_tokenized = tokenize(proposed_title)

# 実行
cosine_sim, cosine_time = measure_time(cosine_similarity_example, correct_title_tokenized, proposed_title_tokenized)
jaccard_sim, jaccard_time = measure_time(jaccard_similarity, correct_title_tokenized, proposed_title_tokenized)
word_emb_sim, word_emb_time = measure_time(word_embedding_similarity, correct_title, proposed_title)
bert_sim, bert_time = measure_time(bert_similarity, correct_title, proposed_title)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertJapaneseTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [41]:
# 結果を出力
print(f"Cosine Similarity: {cosine_sim:.4f} (Time: {cosine_time:.4f} seconds)")
print(f"Jaccard Similarity: {jaccard_sim:.4f} (Time: {jaccard_time:.4f} seconds)")
print(f"Word Embedding Similarity: {word_emb_sim:.4f} (Time: {word_emb_time:.4f} seconds)")
print(f"BERT Similarity: {bert_sim:.4f} (Time: {bert_time:.4f} seconds)")

Cosine Similarity: 0.0000 (Time: 0.0112 seconds)
Jaccard Similarity: 0.1538 (Time: 0.0036 seconds)
Word Embedding Similarity: 0.8893 (Time: 1.1631 seconds)
BERT Similarity: 0.9198 (Time: 3.7172 seconds)
