In [5]:
import pandas as pd

# Load the pre-processed proposals workbook and print its (rows, columns) shape.
df = pd.read_excel("proposals_preprocess_0812_cleaned.xlsx")
print(df.shape)
# Second workbook; presumably the proposals whose body is empty (going by the file name) - TODO confirm.
empty_body = pd.read_excel("empty_body.xlsx")
print(empty_body.shape)

(20750, 39)
(253, 8)


In [6]:
# Proposals whose title contains "Multi-sig Governance" are labelled in bulk.
# `regex=False` matches the phrase literally; `na=False` counts a missing title/body as
# "no match", so the mask stays strictly boolean and is always safe to index with.

temp = df["title"].str.contains("Multi-sig Governance", regex=False, na=False)
candidate = df[temp]
print(candidate.shape)
# Sanity check: how many of those rows also mention "multi-sig owners" in the body.
print(candidate[candidate["body"].str.contains("multi-sig owners", regex=False, na=False)].shape)

df.loc[temp, "Class A"] = "Operations"
df.loc[temp, "Class B"] = "Team"
df.loc[temp, "Class C"] = "Hiring"

(45, 39)
(45, 39)


In [7]:
# Keep only rows that are neither "Multi-sig Governance" proposals nor missing `body_lower`.
temp1 = df["title"].str.contains("Multi-sig Governance")  # 45 rows
temp2 = df["body_lower"].isnull()
df_ = df.loc[~temp1 & ~temp2].sort_values(by="body_lower")

Remove rare words

In [12]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag, ne_chunk
from sklearn.feature_extraction.text import CountVectorizer

# Fetch the NLTK resources one after another (same order as before).
for resource in ('stopwords', 'punkt', 'averaged_perceptron_tagger',
                 'maxent_ne_chunker', 'words', 'wordnet'):
    nltk.download(resource)

# English stop words as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\32mou\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\32mou\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\32mou\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\32mou\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\32mou\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\32mou\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-

In [14]:
def _drop_stop_words(text):
    """Tokenize ``text`` and re-join the tokens that are not in ``stop_words``."""
    return ' '.join(w for w in word_tokenize(text) if w not in stop_words)


for v in ["title", "body"]:
    # Everything except a-z and newlines becomes a space before tokenizing.
    letters_only = df_[f"{v}_lower"].astype(str).str.replace("[^a-z\n]", " ", regex=True).str.strip()
    df_[f"{v}_filtered"] = letters_only.apply(_drop_stop_words)

In [16]:
from pattern.en import lemma, singularize
from nltk.corpus import wordnet

def is_word_in_dictionary(word):
    """Return True when the word, its singularized form, or its lemma has a WordNet synset."""
    variants = {word, singularize(word), lemma(word)}
    return any(wordnet.synsets(form) for form in variants)
print(is_word_in_dictionary("apples"))  # True (plural form is reduced to its base form)
print(is_word_in_dictionary("ate"))     # True (past-tense form is reduced to its base form)
print(is_word_in_dictionary("qwerty"))  # False (word is not in the dictionary)

True
True
False


In [25]:
import numpy as np
from collections import Counter

# Lazily yield every whitespace-separated token of every filtered body.
documents = (token for body in df_["body_filtered"] for token in str(body).split())

word_counts = Counter(documents)
print(len(word_counts))

# Tokens that occur at most once in the corpus and are unknown to the dictionary check.
single = {
    word
    for word, count in word_counts.items()
    if count <= 1 and not is_word_in_dictionary(word)
}
print(len(single))
single

57668
14567


{'timelockdelay',
 'bulkremovevalidator',
 'rhinostake',
 'metaversedao',
 'hailey',
 'defieye',
 'heavely',
 'communtities',
 'magenes',
 'mojito',
 'medelln',
 'minterface',
 'casstoshi',
 'blocktools',
 'excessfeerefundaddress',
 'pgps',
 'memeish',
 'readdressed',
 'avantgardefi',
 'hhuh',
 'bntdapp',
 'savvio',
 'ottonamas',
 'hynix',
 'cdm',
 'optitrack',
 'apebot',
 'episdio',
 'semanas',
 'hhmi',
 'grosso',
 'cryptosavingexpert',
 'apolmaticx',
 'valueif',
 'cringy',
 'decentralandgame',
 'vanwitzenburg',
 'eran',
 'superpat',
 'impactfully',
 'upyfi',
 'relase',
 'collectivly',
 'spencecoin',
 'sokravtsov',
 'proxyoft',
 'blockhive',
 'phantabear',
 'cyclesnew',
 'intokens',
 'microclimate',
 'eso',
 'votants',
 'gtbrs',
 'musuka',
 'kqhozolk',
 'poiln',
 'klaapwqhf',
 'westbrook',
 'tigo',
 'temporature',
 'kalamata',
 'rostros',
 'discussionstypically',
 'dapposs',
 'cabinvc',
 'experienceproposal',
 'pitzalis',
 'balogh',
 'wpy',
 'gotchiguess',
 'valut',
 'shutterised',
 '

In [26]:
# Remove every token listed in `single` from each filtered body.
# `otypes=[object]` keeps the result as an array of Python strings. Without it,
# np.vectorize infers a fixed-width unicode dtype (every element padded to the longest
# document), calls the function an extra time on the first element to guess the output
# type, and raises on an empty input.
func = np.vectorize(
    lambda x: ' '.join(xx for xx in x.split() if xx not in single).strip(),
    otypes=[object],
)
df_["body_filtered"] = func(df_["body_filtered"].values)

# With Categorized dataset

In [27]:
class_cols = ["Class A", "Class B", "Class C"]

# A row is "categorized" when at least one of the three class columns is filled in.
has_class = df_[class_cols].notnull().any(axis=1)
df_class = df_[has_class].sort_values(by="body_filtered")
df_not = df_[~has_class].sort_values(by="body_filtered")
# Categorized bodies come first, so the first len(df_class) entries of df_body are theirs.
df_body = pd.concat([df_class["body_filtered"], df_not["body_filtered"]])
df_class.shape, df_not.shape

((264, 41), (20188, 41))

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vectorizer = TfidfVectorizer()

# Fit one vocabulary over all bodies, then split the matrix back into its two groups
# (df_body was built with the categorized rows first).
X = vectorizer.fit_transform(df_body)
n_class = len(df_class)
df_class_body, df_not_body = X[:n_class], X[n_class:]

# Rows follow df_class, columns follow df_not.
cos_mat = cosine_similarity(df_class_body, df_not_body)
print(cos_mat.shape)

(264, 20188)


In [31]:
# Fields copied from each proposal row.
df_columns = ["id", "space_id", "title", "body", "link"]
# Result layout: similarity, then the categorized ("_c") fields, then the uncategorized ("_n") fields.
columns = [
    "similarity",
    *(f"{name}_c" for name in df_columns),
    *(f"{name}_n" for name in df_columns),
]

def print_line(data):
    """Print the id, space id and title of one proposal (each with a label), then its body."""
    labelled_fields = (("[ID]:", "id"), ("[Space ID]:", "space_id"), ("[Title]:", "title"))
    for label, key in labelled_fields:
        print(label, data[key])
    print(data["body"])

def find_similar(threshold: float, threshold_upper: float = 2, doPrint: bool = False):
    """Pair categorized proposals with uncategorized ones whose similarity lies in a band.

    Reads the module-level ``cos_mat`` positionally: row ``i`` is ``df_class.iloc[i]``
    and column ``j`` is ``df_not.iloc[j]``. Every pair with
    ``threshold <= cos_mat[i, j] < threshold_upper`` produces one output row.

    Args:
        threshold: inclusive lower bound of the similarity band.
        threshold_upper: exclusive upper bound of the similarity band.
        doPrint: when True, print each categorized proposal followed by its matches.

    Returns:
        A DataFrame with one row per matching pair and the columns
        ``columns + class_cols``. The columns are present even when nothing matches,
        so callers can always select e.g. ``result["id_n"]``.
    """
    out_columns = columns + class_cols
    in_band = (cos_mat >= threshold) & (threshold_upper > cos_mat)

    result = []
    # Visit only the categorized rows that have at least one match in the band.
    for i in np.flatnonzero(in_band.any(axis=1)):
        y = df_class.iloc[i]
        if doPrint:
            print("-------------------------------------")
            print("-------------------------------------")
            print("[Category]:", y["Class A"], y["Class B"], y["Class C"], sep="\t")
            print_line(y)
            print("-------------------------------------")

        # Fields of the categorized proposal are shared by all of its matches.
        record = {k: None for k in out_columns}
        for v in df_columns:
            record[v + "_c"] = y[v]
        for v in class_cols:
            record[v] = y[v]

        for j in np.flatnonzero(in_band[i]):
            x = df_not.iloc[j]
            if doPrint:
                print(f"{cos_mat[i, j]}")
                print_line(x)
                print("======================\n\n")
            record["similarity"] = cos_mat[i, j]
            for v in df_columns:
                record[v + "_n"] = x[v]
            result.append(record.copy())

    # Passing the columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(result, columns=out_columns)

In [32]:
panel = [2, 1, 0.9, 0.8, 0.7, 0.6]  # 0.6 is not very good
df_panel = {}
# Consecutive panel values form the half-open bands [1, 2), [0.9, 1), [0.8, 0.9), ...
for upper, lower in zip(panel, panel[1:]):
    result = find_similar(threshold=lower, threshold_upper=upper)
    df_panel[f"over{lower}"] = result
    print(result.shape)

(46, 14)
(61, 14)
(93, 14)
(58, 14)
(82, 14)


In [49]:
# before removing proper nouns
# (50, 14)
# (58, 14)
# (76, 14)
# (66, 14)
# (60, 14)

In [33]:
output_file = 'similar_categorized_filtered.xlsx'

# One worksheet per similarity band, named after its df_panel key.
with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
    for k, band_frame in df_panel.items():
        band_frame.to_excel(writer, sheet_name=k, index=False)

# Without Categorized Dataset 

In [51]:
!pip install -q networkx


[notice] A new release of pip is available: 23.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [34]:
# Drop uncategorized proposals already paired with a categorized one in the
# "over1" and "over0.9" bands (i.e. similarity >= 0.9).
matched_ids = pd.concat([df_panel["over1"]["id_n"], df_panel["over0.9"]["id_n"]])
df_not = df_not[~df_not["id"].isin(matched_ids)]

In [36]:
# Re-fit TF-IDF on the remaining uncategorized bodies and compare every document with
# every other one. NOTE: this rebinds `cos_mat`, which find_similar also reads, so
# find_similar must not be re-run after this cell without rebuilding the earlier matrix.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_not["body_filtered"])
cos_mat = cosine_similarity(X)
cos_mat.shape

(20121, 20121)

In [37]:
import networkx as nx

def find_similar_cluster(threshold: float, threshold_upper: float = 2):
    """Group uncategorized proposals into clusters of mutually similar documents.

    Builds an undirected graph with one node per ``df_not`` row (keyed by its ``id``)
    and an edge between rows ``i < j`` whenever
    ``threshold <= cos_mat[i, j] < threshold_upper``. Every connected component with at
    least two members becomes one cluster. The clusters are written to
    ``df_not_{threshold}_filtered.xlsx`` (one sheet per cluster, largest first) and the
    total number of exported rows is printed.

    Args:
        threshold: inclusive lower bound of the similarity band.
        threshold_upper: exclusive upper bound of the similarity band.

    Returns:
        List of DataFrames, one per cluster, sorted by size in descending order.
    """
    # Pull the ids out once instead of going through `.iloc[...]["id"]` per row/pair.
    ids = df_not["id"].to_numpy()

    G = nx.Graph()
    G.add_nodes_from(ids)

    over = (cos_mat >= threshold) & (threshold_upper > cos_mat)
    # Let NumPy locate the matching cells, then keep the strict upper triangle (i < j).
    # np.nonzero yields row-major order, i.e. the order the nested loops would visit.
    row_idx, col_idx = np.nonzero(over)
    upper_triangle = row_idx < col_idx
    for i, j in zip(row_idx[upper_triangle], col_idx[upper_triangle]):
        G.add_edge(ids[i], ids[j], weight=cos_mat[i, j])

    filtered_components = [
        df_not[df_not["id"].isin(list(c))][["id", "space_id", "title", "body", "link", "Class A", "Class B", "Class C"]]
        for c in nx.connected_components(G)
        if len(c) >= 2
    ]
    filtered_components = sorted(filtered_components, key=len, reverse=True)

    rows = 0
    output_file = f'df_not_{threshold}_filtered.xlsx'
    with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
        for i, component in enumerate(filtered_components):
            component.to_excel(writer, sheet_name=str(i), index=False)
            rows += component.shape[0]
    print(f"Number of rows with threshold {threshold}: {rows}")
    return filtered_components

In [38]:
panel = [2, 1, 0.9, 0.8, 0.7, 0.6]
# Cluster once per half-open similarity band formed by consecutive panel values.
for upper, lower in zip(panel, panel[1:]):
    filtered_components = find_similar_cluster(threshold=lower, threshold_upper=upper)
    print(lower, len(filtered_components))

Number of rows with threshold 1: 1545
1 471
Number of rows with threshold 0.9: 3406
0.9 1061
Number of rows with threshold 0.8: 2733
0.8 613
Number of rows with threshold 0.7: 3708
0.7 778
Number of rows with threshold 0.6: 5130
0.6 931
