In [1]:
import pandas as pd

# Read the two workbooks this notebook works from, then report their dimensions.
# NOTE(review): empty_body is not referenced again in the visible cells.
df = pd.read_excel("proposals_preprocess_0812_cleaned.xlsx")
empty_body = pd.read_excel("empty_body.xlsx")
for frame in (df, empty_body):
    print(frame.shape)

(20767, 39)
(253, 4)


In [62]:
# Bulk-label every proposal whose title contains the phrase "Multi-sig Governance".
# regex=False matches the phrase literally; na=False makes a missing title or
# body count as "no match" so the mask stays strictly boolean and can be used
# for indexing.
temp = df["title"].str.contains("Multi-sig Governance", regex=False, na=False)
candidate = df[temp]
print(candidate.shape)
# Sanity check: how many of the selected rows also mention "multi-sig owners" in the body.
print(candidate[candidate["body"].str.contains("multi-sig owners", regex=False, na=False)].shape)

# All selected rows receive the same three-level category.
df.loc[temp, "Class A"] = "Operations"
df.loc[temp, "Class B"] = "Team"
df.loc[temp, "Class C"] = "Hiring"

(45, 39)
(45, 39)


In [63]:
from util.extract.data import body_for_sim

replace = body_for_sim["replace"]
deleted = body_for_sim["deleted"]

# Remove the fragments listed under "common" from every body (literal match, not regex).
for fragment in deleted["common"]:
    df["body"] = df["body"].str.replace(fragment, "", regex=False)

# Remove the fragments listed per space only from rows belonging to that space.
for space, fragments in deleted["specific"].items():
    in_space = df["space_id"] == space
    for fragment in fragments:
        df.loc[in_space, "body"] = df.loc[in_space, "body"].str.replace(fragment, "", regex=False)

df["body_lower"] = df["body"].str.lower()
df["title_lower"] = df["title"].str.lower()
df = df.sort_values(by=["title_lower", "body_lower"])

# body_lower keeps only lowercase English letters, digits and newlines;
# every other character is replaced by a space before trimming.
df["body_lower"] = df["body_lower"].str.replace("[^a-z0-9\n]", " ", regex=True).str.strip()
df["title_lower"] = df["title_lower"].str.strip()
# Keep only rows whose trimmed title has at least one character.
df = df[df["title_lower"].str.len() > 0]

In [64]:
# temp1: rows matched by the "Multi-sig Governance" title rule (45 rows).
temp1 = df["title"].str.contains("Multi-sig Governance")
# temp2: rows whose normalised body is empty.
temp2 = df["body_lower"].str.strip().str.len().eq(0)
# Keep rows that are in neither group, ordered by normalised body text.
keep = ~temp1 & ~temp2
df_ = df[keep].sort_values(by="body_lower")

# With Categorized Dataset

In [65]:
class_cols = ["Class A", "Class B", "Class C"]

# A row counts as categorized when at least one of the class_cols columns is not NaN.
has_class = df_[class_cols].notna().any(axis=1)
df_class = df_[has_class].sort_values(by="body_lower")
df_not = df_[~has_class].sort_values(by="body_lower")
df_class.shape, df_not.shape

((264, 39), (20205, 39))

In [66]:
# https://wikidocs.net/24603
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# One TF-IDF model is fitted over both groups so their vectors share a vocabulary.
vectorizer = TfidfVectorizer()

n_class = df_class.shape[0]
df_body = pd.concat([df_class["body_lower"], df_not["body_lower"]])
X = vectorizer.fit_transform(df_body)
# Categorized rows were stacked first, so their row count is the split point.
df_class_body, df_not_body = X[:n_class], X[n_class:]

# cos_mat[i, j] compares df_class row i with df_not row j (both by position).
cos_mat = cosine_similarity(df_class_body, df_not_body)
print(cos_mat.shape)

(264, 20205)


In [68]:
# Fields copied from each proposal; the "_c" suffix marks the categorized side
# of a pair and "_n" the uncategorized side.
df_columns = ["id", "space_id", "title", "body", "link"]
columns = ["similarity", *(f"{c}_c" for c in df_columns), *(f"{c}_n" for c in df_columns)]

def print_line(data):
    """Print the id, space id and title of one proposal, followed by its body.

    Args:
        data: any object indexable by the keys "id", "space_id", "title"
            and "body" (for example a DataFrame row).
    """
    for label, key in (("[ID]:", "id"), ("[Space ID]:", "space_id"), ("[Title]:", "title")):
        print(label, data[key])
    print(data["body"])

def find_similar(threshold: float, threshold_upper: float = 2, doPrint: bool = False):
    """Pair categorized proposals with uncategorized ones inside a similarity band.

    Reads the module-level ``cos_mat``, whose rows follow ``df_class`` and whose
    columns follow ``df_not`` (both by position), and collects every pair with
    ``threshold <= similarity < threshold_upper``.

    Args:
        threshold: inclusive lower bound of the band.
        threshold_upper: exclusive upper bound of the band. The default of 2
            lies above any cosine similarity, so it applies no upper limit.
        doPrint: when True, also print each categorized proposal followed by
            the uncategorized proposals matched to it.

    Returns:
        A DataFrame with one row per matching pair and the columns
        ``columns + class_cols``: the similarity, the ``*_c`` fields of the
        categorized proposal, the ``*_n`` fields of the uncategorized proposal
        and the categorized proposal's class labels. The columns are present
        even when no pair falls inside the band, so callers can always select
        e.g. ``"id_n"``.
    """
    out_columns = columns + class_cols
    over = (cos_mat >= threshold) & (threshold_upper > cos_mat)

    result = []
    # Visit only the categorized rows that have at least one match in the band.
    for i in over.any(axis=1).nonzero()[0]:
        y = df_class.iloc[i]
        if doPrint:
            print("-------------------------------------")
            print("-------------------------------------")
            print("[Category]:", y["Class A"], y["Class B"], y["Class C"], sep="\t")
            print_line(y)
            print("-------------------------------------")

        # Fields shared by every pair that involves this categorized row.
        base = {v + "_c": y[v] for v in df_columns}
        base.update({v: y[v] for v in class_cols})

        for j in over[i].nonzero()[0]:
            x = df_not.iloc[j]
            if doPrint:
                print(f"{cos_mat[i, j]}")
                print_line(x)
                print("======================\n\n")
            row = dict(base)
            row["similarity"] = cos_mat[i, j]
            for v in df_columns:
                row[v + "_n"] = x[v]
            result.append(row)

    # Passing the column list fixes the column order and keeps the schema for an empty result.
    return pd.DataFrame(result, columns=out_columns)

In [69]:
# Each pair of neighbouring values in `panel` bounds one similarity band:
# the larger value is the exclusive upper bound, the smaller the inclusive lower bound.
panel = [2, 1, 0.9, 0.8, 0.7, 0.6]
df_panel = {}
for upper, lower in zip(panel, panel[1:]):
    result = find_similar(threshold=lower, threshold_upper=upper)
    df_panel[f"over{lower}"] = result
    print(result.shape)


(50, 14)
(58, 14)
(76, 14)
(66, 14)
(60, 14)


In [70]:
output_file = 'similar_categorized.xlsx'

# Write one sheet per similarity band, named after its key in df_panel.
with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
    for sheet, frame in df_panel.items():
        frame.to_excel(writer, sheet_name=sheet, index=False)

# Without Categorized Dataset 

In [None]:
!pip install -q networkx

In [76]:
# Drop the uncategorized proposals already matched to a categorized one in the
# two highest similarity bands.
# NOTE: after this cell df_not no longer lines up by position with the columns
# of the cos_mat computed for find_similar.
for band in ("over1", "over0.9"):
    df_not = df_not[~df_not["id"].isin(df_panel[band]["id_n"])]

In [77]:
# Re-fit TF-IDF on the remaining uncategorized bodies only and compare every
# body with every other one. This overwrites the vectorizer, X and cos_mat
# produced for the categorized comparison: cos_mat is now square, with both
# axes following the row order of df_not.
# NOTE(review): the result is a dense n x n matrix (20141 x 20141 in the
# recorded run), which presumably needs several GB of memory — confirm before re-running.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_not["body_lower"])
cos_mat = cosine_similarity(X, X)
cos_mat.shape

(20141, 20141)

In [109]:
import networkx as nx

def find_similar_cluster(threshold: float, threshold_upper: float = 2):
    """Cluster uncategorized proposals whose bodies fall in a similarity band.

    Builds a graph with one node per ``df_not`` id and an edge, weighted by
    the similarity, for every pair (i, j), i < j, with
    ``threshold <= cos_mat[i, j] < threshold_upper``. Each connected component
    with at least two members is one cluster. The clusters are written to
    ``df_not_{threshold}.xlsx``, one sheet per cluster, largest first.

    Reads the module-level ``df_not`` and ``cos_mat``; ``cos_mat`` must be the
    square matrix whose rows and columns follow ``df_not`` by position.

    Args:
        threshold: inclusive lower bound of the band.
        threshold_upper: exclusive upper bound of the band. The default of 2
            lies above any cosine similarity, so it applies no upper limit.

    Returns:
        A list of DataFrames, one per cluster, sorted by size in descending order.
    """
    ids = df_not["id"].tolist()

    G = nx.Graph()
    G.add_nodes_from(ids)

    over = (cos_mat >= threshold) & (threshold_upper > cos_mat)
    # Look up all matching pairs in one vectorised call instead of testing
    # every (i, j) combination in Python. nonzero() returns the positions in
    # row-major order, so edges are added in the same order as a nested
    # i/j loop would add them.
    rows, cols = over.nonzero()
    upper_triangle = rows < cols  # each unordered pair once, no diagonal
    rows, cols = rows[upper_triangle], cols[upper_triangle]
    G.add_weighted_edges_from(
        (ids[i], ids[j], w)
        for i, j, w in zip(rows.tolist(), cols.tolist(), cos_mat[rows, cols].tolist())
    )

    out_cols = ["id", "space_id", "title", "body", "link", "Class A", "Class B", "Class C"]
    filtered_components = [
        df_not.loc[df_not["id"].isin(list(c)), out_cols]
        for c in nx.connected_components(G)
        if len(c) >= 2
    ]
    filtered_components.sort(key=len, reverse=True)

    rows_written = 0
    output_file = f'df_not_{threshold}.xlsx'
    with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
        for i, component in enumerate(filtered_components):
            component.to_excel(writer, sheet_name=str(i), index=False)
            rows_written += component.shape[0]
    print(f"Number of rows with threshold {threshold}: {rows_written}")
    return filtered_components

In [111]:
# Run the clustering once per similarity band; neighbouring values in `panel`
# give the exclusive upper and inclusive lower bound of each band.
panel = [2, 1, 0.9, 0.8, 0.7, 0.6]
for upper, lower in zip(panel, panel[1:]):
    filtered_components = find_similar_cluster(threshold=lower, threshold_upper=upper)
    print(lower, len(filtered_components))

Exception ignored in: <function ZipFile.__del__ at 0x0000028C61BF9A20>
Traceback (most recent call last):
  File "c:\Python310\lib\zipfile.py", line 1819, in __del__
    self.close()
  File "c:\Python310\lib\zipfile.py", line 1836, in close
    self.fp.seek(self.start_dir)
ValueError: seek of closed file


Number of rows with threshold 1: 1000
1 312
Number of rows with threshold 0.9: 3321
0.9 1068
Number of rows with threshold 0.8: 2768
0.8 636
Number of rows with threshold 0.7: 3671
0.7 802
