<a href="https://colab.research.google.com/github/sudoghut/chinese-book-classification/blob/main/chinese_book_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tqdm
!pip install -U sentence-transformers
!pip install -U xgboost

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=c4c77f605ba4c8da7ffe479a56ed050defef75f4c9bfd497fac9701daa25d4c7
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence-tra

In [2]:
import torch
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import os

In [3]:
with open('input.txt', 'r') as f:
  titles = [title.strip() for title in f.readlines()]
titles[:5]

['进修堂文集诗集',
 '马书奎稿',
 '游莱三体诗稿',
 '鲍觉生诗钞',
 '赋则',
 '觉生诗钞',
 '咏古诗钞',
 '咏史诗钞',
 '咏物诗钞',
 '咏物咏史感应诗',
 '补竹轩集',
 '澄怀园诗',
 '澄怀园诗集',
 '杂著',
 '听桐山馆集',
 '听桐山馆诗集',
 '四书题解',
 '守素堂诗集',
 '附日记',
 '味苏斋诗文集',
 '益坚斋诗文钞',
 '宛委山房稿',
 '咏典堂集',
 '崇祯五十辅臣传',
 '古林金石表',
 '国朝续征献录',
 '静惕堂词',
 '静惕堂诗集',
 '静惕堂诗文集',
 '静惕堂书目',
 '倦圃莳植记',
 '刘豫事迹',
 '明漕运志',
 '续献征录',
 '学海类编',
 '砚录',
 '白香山诗选',
 '带星草堂诗集',
 '诗钞',
 '石鼓砚斋文钞',
 '直庐集',
 '直庐集八集',
 '古诗唐诗约选',
 '古诗约选',
 '古雪斋诗集',
 '古雪斋诗文钞',
 '剑亭年谱',
 '唐诗约选',
 '翠微山房诗文集',
 '使蜀草',
 '周易集粹',
 '话云轩咏史诗',
 '纶阁延晖集',
 '宋四六选',
 '一罫轩诗钞',
 '名家词钞',
 '淸风堂集',
 '著井陉诗草',
 '天竹山房诗集',
 '杂著',
 '持身辑要',
 '梦吟仙馆诗草',
 '心经释注',
 '篆庐笔谈',
 '淸端集',
 '斗筑居稿',
 '活幼勲奇',
 '孔编',
 '山中集',
 '述仙草山中集',
 '述先草',
 '述先草山中集',
 '平政录学政纪略',
 '学政记略',
 '锦堂诗集',
 '百尺楼诗集',
 '楚帆集',
 '恩光集',
 '庆余堂吟稿',
 '生香书屋诗集',
 '生香书屋文集',
 '文集',
 '芝树堂诗草',
 '定庵漫兴集唐三十首',
 '画眉笔谈',
 '松籁阁诗集',
 '唐骈体文钞',
 '漫浪集',
 '淸峙集',
 '嘉惠堂集',
 '道山诗钞',
 '刊三辅采风录',
 '除豪集',
 '梦觉集',
 '淸照堂打包賸语',
 '嵊县志',
 '余庵杂录',
 '狷亭集',
 '微尘集',
 '芝峰诗钞',
 '太乙舟集',
 '太乙舟诗集',
 '葆元堂初集卷',
 '东冶集',
 '明史艺文志',
 '

In [4]:
if os.path.exists('/content/model.bin'):
  print('Model already exists. Skipping download.')
else:
  !wget https://raw.githubusercontent.com/sudoghut/chinese-book-classification/main/model.bin

if os.path.exists('/content/titles_all.txt'):
  print('titles_all already exists. Skipping download.')
else:
  !wget https://raw.githubusercontent.com/sudoghut/chinese-book-classification/main/titles_all.txt

if os.path.exists('/content/training.xlsx'):
  print('training.xlsx already exists. Skipping download.')
else:
  !wget https://raw.githubusercontent.com/sudoghut/chinese-book-classification/main/training.xlsx

if os.path.exists('/content/book_category_match.xlsx'):
  print('book_category_match.xlsx already exists. Skipping download.')
else:
  !wget https://raw.githubusercontent.com/sudoghut/chinese-book-classification/main/book_category_match.xlsx


--2024-01-20 23:14:20--  https://raw.githubusercontent.com/sudoghut/chinese-book-classification/main/model.bin
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3261903 (3.1M) [application/octet-stream]
Saving to: ‘model.bin’


2024-01-20 23:14:20 (45.3 MB/s) - ‘model.bin’ saved [3261903/3261903]

--2024-01-20 23:14:20--  https://raw.githubusercontent.com/sudoghut/chinese-book-classification/main/titles_all.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 130540 (127K) [text/plain]
Saving to: ‘titles_all.txt’


2024-01-20 23:14:2

In [5]:
prepared_title_list = [line.rstrip('\n') for line in open('/content/titles_all.txt')]
title_all = titles + prepared_title_list

In [6]:
def cosine_similarity(embedding1, embedding2):
    return torch.nn.functional.cosine_similarity(embedding1, embedding2, dim=0)

top_number_setting = 50
def similarity_calculation(model):
    top_similarities_list = []
    total = len(titles) * len(prepared_title_list)

    with tqdm(total=total, desc="Calculating Similarities") as pbar:
        embeddings = model.encode(title_all)
        for i in range(len(titles)):
            similarities = []
            for j in range(len(titles), len(title_all)):
                similarity = cosine_similarity(torch.tensor(embeddings[i]), torch.tensor(embeddings[j]))
                try:
                    similarities.append(((titles[i], prepared_title_list[j-len(titles)]), similarity.item()))
                except:
                    print(i)
                    print(j)
                    raise
                pbar.update(1)
            # print(len(similarities))
            sorted_similarities = sorted(similarities, key=lambda x: x[1], reverse=True)
            if len(sorted_similarities) < top_number_setting:
                top_similarities = sorted_similarities
            else:
                top_similarities = sorted(similarities, key=lambda x: x[1], reverse=True)[:top_number_setting]
            # print(top_similarities[:5])
            # for pair, sim in top_similarities:
            #     print(f"Cosine similarity between \"{pair[0]}\" and \"{pair[1]}\": {sim}")
            top_similarities_list.append(top_similarities)
    top_similarities_list = [[j[0][0], j[0][1], "{:.4f}".format(j[1])] for i in top_similarities_list for j in i]

    return top_similarities_list

In [None]:
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
top_similarities_list = similarity_calculation(model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


.gitattributes:   0%|          | 0.00/968 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.09k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/471M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

unigram.json:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Calculating Similarities:  75%|███████▍  | 9942641/13292352 [07:26<02:25, 23034.28it/s]

In [None]:
df = pd.DataFrame(top_similarities_list, columns=['title_1', 'title_2', 'similarity'])
print(df.shape)
df[:5]

In [None]:
model = xgb.XGBClassifier()
model.load_model("/content/model.bin")
df_train_for_vectorizer = pd.read_excel("/content/training.xlsx")

combined_data = pd.concat(
    [df_train_for_vectorizer["x_1"], df_train_for_vectorizer["x_2"]]
)
vectorizer = TfidfVectorizer(analyzer="char")
vectorizer.fit(combined_data)
X_1 = vectorizer.transform(df["title_1"])
X_2 = vectorizer.transform(df["title_2"])
X = np.hstack((X_1.toarray(), X_2.toarray()))
print(X.shape)
predicted_y = model.predict(X)
df["predicted_y"] = predicted_y
df_list = df.values.tolist()
compare_categories_summarized = []
count = 0
match_flag = False
previous_row = df_list[0]
for row in df_list:
    # First record got predicted_y = 1
    if row[3] == 1 and row[0] != previous_row[0]:
        compare_categories_summarized.append(row + ["first predict"])
        previous_row = row
        match_flag = True
        count += 1
    # Within the same title match, the first record got predicted_y = 1
    elif (
        row[3] == 1
        and row[0] == previous_row[0]
        and previous_row[3] == 0
        and match_flag == False
    ):
        compare_categories_summarized.append(row + ["inner predict"])
        previous_row = row
        if count == (top_number_setting - 1):
            count = 0
        else:
            count += 1
            match_flag = True
    # At the last record. Can't find any predicted_y = 1 within the same title match
    elif count == (top_number_setting - 1):
        if match_flag == False:
            compare_categories_summarized.append([row[0], "", "", "0", "no match"])
        previous_row = row
        count = 0
        match_flag = False
    else:
        count += 1
        previous_row = row
# export compare_categories_summarized to excel
df_summarized = pd.DataFrame(
    compare_categories_summarized,
    columns=["title_1", "title_2", "similarity", "predicted_y", "reason"],
)
df_summarized.to_excel("compare_categories_summarized.xlsx", index=False)

In [None]:
title_1_with_zero_prediction = df_summarized[df_summarized["predicted_y"] == "0"]["title_1"]
df_summarized_no_predicted_y = df[df["title_1"].isin(title_1_with_zero_prediction)]
df_summarized_no_predicted_y.head()

In [None]:
def calculate_similarity(title1, title2):
    title1 = "".join(set(title1))
    title2 = "".join(set(title2))
    similarity = 0

    for i in range(len(title1)):
        if title1[i] in title2:
            similarity += 1
    return similarity

# Apply the function to calculate similarity for each pair of titles
df_summarized_no_predicted_y["similarity"] = df_summarized_no_predicted_y.apply(
    lambda row: calculate_similarity(row["title_1"], row["title_2"]), axis=1
)

# # Function to select the top 'n' similar pairs for each title1 and export to a new CSV
def export_top_similar_pairs(df_summarized_no_predicted_y, n):
    # Sort by similarity in descending order
    df_summarized_no_predicted_y = df_summarized_no_predicted_y.sort_values(by="similarity", ascending=False)

    # Group by title_1 and select top 'n' for each title_1
    top_pairs = df_summarized_no_predicted_y.groupby("title_1").head(n)

    # Skip the records where similarity is 0
    top_pairs = top_pairs[top_pairs["similarity"] != 0]

    return top_pairs


simple_similarity_top_pairs = export_top_similar_pairs(df_summarized_no_predicted_y, 1)
simple_similarity_top_pairs.head()


In [None]:
# prompt: update df_summarized["predicted_y"] = "0"'s df_summarized["title_2"]. by using simple_similarity_top_pairs["title_2"] when matching df_summarized["title_1"] = simple_similarity_top_pairs["title_1"]

for i, row in df_summarized.iterrows():
    if row["predicted_y"] == "0":
        df_summarized["title_2"][i] = simple_similarity_top_pairs[simple_similarity_top_pairs["title_1"] == row["title_1"]]["title_2"].values[0]
        df_summarized["predicted_y"][i] = 1
        df_summarized["reason"][i] = "simple similarity"

df_summarized.head()


In [None]:
book_category_match = pd.read_excel("/content/book_category_match.xlsx")
df_summarized["category"] = ""

df_summarized.head()
for i, row in df_summarized.iterrows():
    df_summarized["category"][i] = book_category_match[book_category_match["title"] == row["title_2"]]["category"].values[0]
df_summarized.head()
df_summarized.to_excel("df_summarized.xlsx", index=False)

