In [1]:
from FlagEmbedding import FlagModel

model = FlagModel("BAAI/bge-large-en", 
    query_instruction_for_retrieval="Represent this sentence for searching relevant passages: "
    )

In [2]:
queries = ['conflicting', 'inexperience', 'assumption', 'appreciation', 'genetics', 'client relations']

In [3]:
import pandas as pd

consol_pre = pd.read_csv('./data/pre_consolidated.csv')
sep_pre = consol_pre[['ID'] + list(consol_pre.columns[14:-2])]
sep_pre_col_names = ['ID', 'Q14', 'Q15', 'Q16', 'Q18', 'Q19', 'Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25']
sep_pre.columns = sep_pre_col_names

consol_post = pd.read_csv('./data/post_consolidated.csv')
sep_post = consol_post[['ID'] + list(consol_post.columns[14:-3])]
sep_post_col_names = ['ID', 'Q15', 'Q16', 'Q17', 'Q18', 'Q19', 'Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26']
sep_post.columns = sep_post_col_names

pre_fr = sep_pre[['ID', 'Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25']]
post_fr = sep_post[['ID', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26']]

In [4]:
from nltk import sent_tokenize

all_responses = []
for col in pre_fr.columns[1:]:
    pre_sent = pre_fr[col].apply(sent_tokenize)
    pre_sent = pre_sent.explode()
    for fr in list(pre_sent):
        all_responses.append(fr)

for col in post_fr.columns[1:]:
    post_sent = post_fr[col].apply(sent_tokenize)
    post_sent = post_sent.explode()
    for fr in list(post_sent):
        all_responses.append(fr)

len(all_responses)

253

In [5]:
q_embeddings = model.encode_queries(queries)
p_embeddings = model.encode(all_responses)

# ZS

In [6]:
scores = q_embeddings @ p_embeddings.T

In [7]:
scores.shape

(6, 253)

In [8]:
scores[1].argsort()[::-1]

array([ 68,  74, 217, 229, 170,  51,  48, 206, 112,  85, 214,  66,  97,
       196, 209, 189, 228,   4,  94,  99, 220, 225, 208,  67,  11, 103,
       105, 222, 249,  81,  18,  70, 100,  96, 137, 125,   3, 211,  79,
       117, 161, 210, 204, 102,  31,  84, 111, 124,  15,  13, 190, 179,
       219, 166, 114, 139, 223, 216, 176,   5,  80,  75, 213,   0,  14,
        95,  88,  10, 231, 212,  65,  83,  22, 230,  53,  82, 226,   9,
       118, 250, 104, 101, 175,  93,  71, 158,   6,  77,  98,   7,  69,
       129, 198,  73,  54,  58,  12, 224,  63, 147, 136,  90, 168,  72,
       153, 113, 133, 164,  44, 163,  64,  91, 171,  24, 169, 199, 108,
       201, 246, 157,   1, 132, 110,  20, 187, 251,  52, 123,  92, 120,
        28,  86,  26, 215,  45, 119, 221,  43,  42,  47,  56,  39, 148,
       240, 151, 109,  38,  32, 202, 185, 154,  78, 116, 155, 200, 126,
       156,  30,  89, 245, 107, 197,  60, 182,  76, 106, 150,   8,  57,
       146, 130, 173, 135,  41, 165, 167, 252, 241, 178, 247, 12

In [9]:
i = 68
all_responses[i], scores[0][i]

('I have very little experience with canine generics and theriogenology.',
 0.70914245)

# TM

In [8]:
p_embeddings.shape 

(253, 1024)

In [20]:
import hdbscan
import numpy as np
 
clusterer = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=1, prediction_data=True).fit(p_embeddings)
soft_clusters = hdbscan.all_points_membership_vectors(clusterer)
labels = [np.argmax(x) for x in soft_clusters]

In [28]:
df = pd.DataFrame(zip(all_responses, labels), columns=['text', 'labels'])
df

Unnamed: 0,text,labels
0,I don't know a lot about the canine breeder co...,2
1,One thing I do not understand is why we would ...,3
2,I know that breeders are very passionate about...,3
3,I think there are likely breeders that do not ...,3
4,I have very little experience with the canine ...,2
...,...,...
248,that were helpful in understanding her role an...,3
249,Would have loved to be more involved with the ...,3
250,I think a mock show would be a great way for f...,1
251,For whelping and puppies if the timing works o...,3


In [34]:
import re
import collections
from tqdm import tqdm
import math
import spacy
nlp = spacy.load('en_core_web_sm')

def _is_stop_span(span):
    doc = span.as_doc()
    bools = [token.is_stop for token in doc]
    
    if any(bools):
        return True
    else:
        return False

def create_labels(df):
    series_dict = {}

    label_list = list(set(df.labels.unique()))
    for label in tqdm(label_list): 
        spacy_docs = df.loc[df.labels == label].text.apply(nlp)
        flat = [y for x in spacy_docs.apply(lambda x: [re.sub('[^\w]','',chunk.text.lower()) for chunk in x.noun_chunks if not _is_stop_span(chunk)]).to_list() for y in x]
        freq = collections.defaultdict(int)
        for token in flat:
            freq[token] += 1
        freq = dict(freq)
        tf = dict(sorted(freq.items(), key=lambda item: item[1], reverse=True))

        idf = collections.defaultdict(int)
        for term in tf:
            for doc in spacy_docs:
                if term in doc.text.lower():    
                    idf[term] += 1
        idf = dict(idf)
        idf = dict(sorted(idf.items(), key=lambda item: item[1], reverse=True))

        tfidf_dict = {}
        for term, f in idf.items():
            if tf[term] > 1:
                tfidf_dict[term] = tf[term]*(math.log(len(spacy_docs)-f/f)+1) #smooth idf

        tfidf_dict = dict(sorted(tfidf_dict.items(), key=lambda item: item[1], reverse=True)[:5])

        key_string = ''
        for i, term in enumerate(tfidf_dict):
            if i < len(tfidf_dict)-1:
                key_string += f'{term}; '
            else:
                key_string += f'{term}'
        series_dict[f'label: {int(label)}'] = key_string

    return pd.DataFrame.from_dict(series_dict, orient='index', columns=['topics'])

In [35]:
tm = create_labels(df)
tm

100%|██████████| 4/4 [00:01<00:00,  2.10it/s]


Unnamed: 0,topics
label: 0,resources; therio; theriogenology; tufts
label: 1,dogs; people; terms; purpose; canine
label: 2,breeders; breeding; people
label: 3,breeders; breeding; people; veterinarians; inf...


In [36]:
tm.topics.iloc[-1]

'breeders; breeding; people; veterinarians; information'