<a href="https://colab.research.google.com/github/sudoghut/chinese-book-classification/blob/main/chinese_book_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [65]:
!pip install tqdm
!pip install -U sentence-transformers
!pip install -U xgboost



In [66]:
import torch
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import os
import datetime

In [67]:
with open('input.txt', 'r') as f:
  titles = [title.strip() for title in f.readlines()]
titles[:5]

['靑苔篇', '蜀游草', '温泉扈从恭纪诗', '消寒游艺全集', '读书纪数略']

In [68]:
if os.path.exists('/content/model.bin'):
  print('Model already exists. Skipping download.')
else:
  !wget https://raw.githubusercontent.com/sudoghut/chinese-book-classification/main/model.bin

if os.path.exists('/content/titles_all.txt'):
  print('titles_all already exists. Skipping download.')
else:
  !wget https://raw.githubusercontent.com/sudoghut/chinese-book-classification/main/titles_all.txt

if os.path.exists('/content/training.xlsx'):
  print('training.xlsx already exists. Skipping download.')
else:
  !wget https://raw.githubusercontent.com/sudoghut/chinese-book-classification/main/training.xlsx

if os.path.exists('/content/book_category_match.xlsx'):
  print('book_category_match.xlsx already exists. Skipping download.')
else:
  !wget https://raw.githubusercontent.com/sudoghut/chinese-book-classification/main/book_category_match.xlsx


Model already exists. Skipping download.
titles_all already exists. Skipping download.
training.xlsx already exists. Skipping download.
book_category_match.xlsx already exists. Skipping download.


In [69]:
prepared_title_list = [line.rstrip('\n') for line in open('/content/titles_all.txt')]
title_all = titles + prepared_title_list

In [70]:
def cosine_similarity(embedding1, embedding2):
    return torch.nn.functional.cosine_similarity(embedding1, embedding2, dim=0)

top_number_setting = 50
def similarity_calculation(model):
    top_similarities_list = []
    total = len(titles) * len(prepared_title_list)

    with tqdm(total=total, desc="Calculating Similarities") as pbar:
        embeddings = model.encode(title_all)
        for i in range(len(titles)):
            similarities = []
            for j in range(len(titles), len(title_all)):
                similarity = cosine_similarity(torch.tensor(embeddings[i]), torch.tensor(embeddings[j]))
                try:
                    similarities.append(((titles[i], prepared_title_list[j-len(titles)]), similarity.item()))
                except:
                    print(i)
                    print(j)
                    raise
                pbar.update(1)
            # print(len(similarities))
            sorted_similarities = sorted(similarities, key=lambda x: x[1], reverse=True)
            if len(sorted_similarities) < top_number_setting:
                top_similarities = sorted_similarities
            else:
                top_similarities = sorted(similarities, key=lambda x: x[1], reverse=True)[:top_number_setting]
            # print(top_similarities[:5])
            # for pair, sim in top_similarities:
            #     print(f"Cosine similarity between \"{pair[0]}\" and \"{pair[1]}\": {sim}")
            top_similarities_list.append(top_similarities)
    top_similarities_list = [[j[0][0], j[0][1], "{:.4f}".format(j[1])] for i in top_similarities_list for j in i]

    return top_similarities_list

In [71]:
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
top_similarities_list = similarity_calculation(model)

Calculating Similarities: 100%|██████████| 141408/141408 [00:10<00:00, 14116.74it/s]


In [72]:
df = pd.DataFrame(top_similarities_list, columns=['title_1', 'title_2', 'similarity'])
print(df.shape)
df[:5]

(800, 3)


Unnamed: 0,title_1,title_2,similarity
0,靑苔篇,鹤林玉露,0.9546
1,靑苔篇,枣林杂俎,0.9474
2,靑苔篇,蕉叶帕,0.9399
3,靑苔篇,菊磵集,0.9363
4,靑苔篇,天彭牡丹谱,0.9363


In [73]:
model = xgb.XGBClassifier()
model.load_model("/content/model.bin")
df_train_for_vectorizer = pd.read_excel("/content/training.xlsx")

combined_data = pd.concat(
    [df_train_for_vectorizer["x_1"], df_train_for_vectorizer["x_2"]]
)
vectorizer = TfidfVectorizer(analyzer="char")
vectorizer.fit(combined_data)
X_1 = vectorizer.transform(df["title_1"])
X_2 = vectorizer.transform(df["title_2"])
X = np.hstack((X_1.toarray(), X_2.toarray()))
print(X.shape)
predicted_y = model.predict(X)
df["predicted_y"] = predicted_y
df_list = df.values.tolist()
# df.to_excel("compare_categories_all.xlsx", index=False)
compare_categories_summarized = []
count = 0
match_flag = False
previous_row = df_list[0]
for row in df_list:
    # First record got predicted_y = 1
    if row[3] == 1 and row[0] != previous_row[0]:
        compare_categories_summarized.append(row + ["first predict"])
        previous_row = row
        match_flag = True
        count += 1
    # Within the same title match, the first record got predicted_y = 1
    elif (
        row[3] == 1
        and row[0] == previous_row[0]
        and previous_row[3] == 0
        and match_flag == False
    ):
        compare_categories_summarized.append(row + ["inner predict"])
        previous_row = row
        if count == (top_number_setting - 1):
            count = 0
        else:
            count += 1
            match_flag = True
    # At the last record. Can't find any predicted_y = 1 within the same title match
    elif count == (top_number_setting - 1):
        if match_flag == False:
            compare_categories_summarized.append([row[0], "", "", 0, "no match"])
        previous_row = row
        count = 0
        match_flag = False
    else:
        count += 1
        previous_row = row
# export compare_categories_summarized to excel
df_summarized = pd.DataFrame(
    compare_categories_summarized,
    columns=["title_1", "title_2", "similarity", "predicted_y", "reason"],
)
# df_summarized.to_excel("compare_categories_summarized.xlsx", index=False)

(800, 5618)


In [74]:
title_1_with_zero_prediction = df_summarized[df_summarized["predicted_y"] == 0]["title_1"]
df_summarized_no_predicted_y = df[df["title_1"].isin(title_1_with_zero_prediction)]
df_summarized_no_predicted_y.head()

Unnamed: 0,title_1,title_2,similarity,predicted_y
0,靑苔篇,鹤林玉露,0.9546,0
1,靑苔篇,枣林杂俎,0.9474,0
2,靑苔篇,蕉叶帕,0.9399,0
3,靑苔篇,菊磵集,0.9363,0
4,靑苔篇,天彭牡丹谱,0.9363,0


In [75]:
def calculate_similarity(title1, title2):
    title1 = "".join(set(title1))
    title2 = "".join(set(title2))
    similarity = 0

    for i in range(len(title1)):
        if title1[i] in title2:
            similarity += 1
    return similarity

# Apply the function to calculate similarity for each pair of titles
df_summarized_no_predicted_y["similarity"] = df_summarized_no_predicted_y.apply(
    lambda row: calculate_similarity(row["title_1"], row["title_2"]), axis=1
)

# # Function to select the top 'n' similar pairs for each title1 and export to a new CSV
def export_top_similar_pairs(df_summarized_no_predicted_y, n):
    # Sort by similarity in descending order
    df_summarized_no_predicted_y = df_summarized_no_predicted_y.sort_values(by="similarity", ascending=False)

    # Group by title_1 and select top 'n' for each title_1
    top_pairs = df_summarized_no_predicted_y.groupby("title_1").head(n)

    # Skip the records where similarity is 0
    top_pairs = top_pairs[top_pairs["similarity"] != 0]

    return top_pairs




simple_similarity_top_pairs = export_top_similar_pairs(df_summarized_no_predicted_y, 1)
# simple_similarity_top_pairs.to_excel("simple_similarity_top_pairs.xlsx", index=False)
simple_similarity_top_pairs.head()

Unnamed: 0,title_1,title_2,similarity,predicted_y
200,读书纪数略,读书纪数略,5,0
550,书史会要,书史会要,4,0
300,文苑英华选,文苑英华,4,0
500,辍耕录,辍耕录,3,0
285,甘一史名臣言行录,明儒言行录,3,0


In [76]:
for i, row in df_summarized.iterrows():
    if row["predicted_y"] == 0:
        try:
            df_summarized["title_2"][i] = simple_similarity_top_pairs[simple_similarity_top_pairs["title_1"] == row["title_1"]]["title_2"].values[0]
            df_summarized["predicted_y"][i] = 1
            df_summarized["reason"][i] = "simple similarity"
        except:
            df_summarized["title_2"][i] = ""

df_summarized.head()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_summarized["title_2"][i] = ""
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_summarized["title_2"][i] = simple_similarity_top_pairs[simple_similarity_top_pairs["title_1"] == row["title_1"]]["title_2"].values[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_summarized["predicted_y"][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.

Unnamed: 0,title_1,title_2,similarity,predicted_y,reason
0,靑苔篇,,,0,no match
1,蜀游草,凌沧草,,1,simple similarity
2,温泉扈从恭纪诗,夏完淳诗词曲赋,,1,simple similarity
3,消寒游艺全集,全室外集,,1,simple similarity
4,读书纪数略,读书纪数略,,1,simple similarity


In [77]:
book_category_match = pd.read_excel("/content/book_category_match.xlsx")
df_summarized["category"] = ""

df_summarized.head()
for i, row in df_summarized.iterrows():
    try:
        df_summarized["category"][i] = book_category_match[book_category_match["title"] == row["title_2"]]["category"].values[0]
    except:
        pass
df_summarized.head()

now = datetime.datetime.now()
now = now.strftime("%Y-%m-%d %H:%M:%S")

output_filename = f"book_category_match_output_{now}.xlsx"

df_summarized.to_excel(output_filename, index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_summarized["category"][i] = book_category_match[book_category_match["title"] == row["title_2"]]["category"].values[0]


In [78]:
from google.colab import files

files.download(f'/content/{output_filename}')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>