In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
DATASET_DIR = './dataset/'

In [19]:
pangenome_nodes = pd.read_csv(DATASET_DIR + 'pangenome_nodes.csv')
hit_family_nodes = pangenome_nodes[pangenome_nodes['nodeLabels'] == 'HitFamily']
pfam_accessions = hit_family_nodes['accession'].unique()
print('Number of unique Pfam accessions:', len(pfam_accessions))

Number of unique Pfam accessions: 979


In [20]:
def get_pfam_basic(accession):
    acc_num = accession.split('PF')[1]
    url = 'https://www.ncbi.nlm.nih.gov/Structure/cdd/cddsrv.cgi?uid=pfam' + acc_num
    html = urlopen(url)
    soup = BeautifulSoup(html, 'html.parser')
    try:
        desc = soup.find_all('meta', attrs={'name': 'description'})[0]['content']
        desc = desc.split('Conserved Protein Domain Family')[1]
        return desc
    except:
        print('No description found for ' + accession)
        return None

In [29]:
# Write rows to CSV continuously and start from last written

import csv

fetch_pfams = sorted(pfam_accessions)
filename = DATASET_DIR + 'pfam_llm.csv'
try:
    pfam_descriptions = pd.read_csv(filename)
    last_written = pfam_descriptions['accession'].values[-1]
    last_written_index = fetch_pfams.index(last_written)
    fetch_pfams = fetch_pfams[last_written_index + 1:]
except:
    with open(filename, 'w', newline="") as f:
        writer = csv.writer(f)
        writer.writerow(['accession', 'description'])

print(len(fetch_pfams), 'Pfams to fetch')

for pfam in fetch_pfams:
    with open(filename, 'a', newline="") as f:
        writer = csv.writer(f)
        writer.writerow([pfam, get_pfam_basic(pfam)])


977 Pfams to fetch


In [74]:
taxon_nodes = pd.read_csv(DATASET_DIR + 'taxon_nodes.csv')
taxon_nodes = taxon_nodes.loc[
    taxon_nodes['taxKingdom'] == 'Viruses'
]
taxon_orders = taxon_nodes[taxon_nodes['rank'] == 'order']['taxOrder'].unique()
print('Number of unique orders:', len(taxon_orders))

Number of unique orders: 65


In [72]:
def get_taxon_order_description(order):
    # url = f'https://ictv.global/report/chapter/{str(order)}/{str(order)}'
    try:
        url = f'https://en.wikipedia.org/wiki/{order}'
        html = urlopen(url)
        soup = BeautifulSoup(html, 'html.parser')
        text = ''
        for i in soup.select('p'):
            text += i.text
        text = text.strip()
        text = text.replace('\n', ' ')
        return text
    except:
        print('No description found for ' + order)
        return None

test = get_taxon_order_description('Jingchuvirales')
print(test)

Jingchuvirales is an order of viruses. The order contains the following families:[1] This virus-related article is a stub. You can help Wikipedia by expanding it.


In [75]:
fetch_taxons = sorted(taxon_orders)
filename = DATASET_DIR + 'taxon_order_llm.csv'

try:
    taxon_order_descriptions = pd.read_csv(filename)
    last_written = taxon_order_descriptions['order'].values[-1]
    last_written_index = fetch_taxons.index(last_written)
    fetch_taxons = fetch_taxons[last_written_index + 1:]
except:
    with open(filename, 'w', newline="") as f:
        writer = csv.writer(f)
        writer.writerow(['taxId', 'order', 'description'])
    
print(len(fetch_taxons), 'Taxon orders to fetch')

for tax_order in fetch_taxons:
    with open(filename, 'a', newline="") as f:
        writer = csv.writer(f)
        tax_id = taxon_nodes[taxon_nodes['taxOrder'] == tax_order]['taxId'].values[0]
        writer.writerow([tax_id, tax_order, get_taxon_order_description(tax_order)])


65 Taxon orders to fetch
No description found for Crassvirales
No description found for Kirjokansivirales
No description found for Methanobavirales
No description found for Rivendellvirales
No description found for Rohanvirales
No description found for Thumleimavirales
No description found for Yadokarivirales


In [110]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')



In [123]:
def add_embedding_to_df(df, text_column):
    embeddings = model.encode(df[text_column].values)

    df['embeddings'] = embeddings.tolist()
    return df


In [131]:
taxon_order_descriptions = pd.read_csv(DATASET_DIR + 'taxon_order_llm.csv')

taxon_order_descriptions = add_embedding_to_df(taxon_order_descriptions, 'description')
taxon_order_descriptions.to_csv(DATASET_DIR + 'taxon_order_llm.csv', index=False)

In [132]:
pfam_descriptions = pd.read_csv(DATASET_DIR + 'pfam_llm.csv')

pfam_descriptions = add_embedding_to_df(pfam_descriptions, 'description')
pfam_descriptions.to_csv(DATASET_DIR + 'pfam_llm.csv', index=False)

In [None]:

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# "microsoft/BioGPT-Large"
# "microsoft/biogpt"
model_name = 'sentence-transformers/all-MiniLM-L6-v2'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

pipe = pipeline('feature-extraction', model=model, tokenizer=tokenizer)
def get_embedding(text):
    data = pipe(text, return_tensors="pt")
    return data[0].numpy().mean(axis=0)
    
out = get_embedding('This is a test.')
# 57717, 42384
print(out.shape)

In [159]:
import ast
import umap
import matplotlib.pyplot as plt

reducer = umap.UMAP()
emb = pfam_descriptions['embeddings'].values
# emb = [ast.literal_eval(i) for i in emb]
# emb = [ast.literal_eval(i) for i in emb]


In [155]:
# x = [
#     [-0.09007169306278229, -0.0182830598205328, -0.011229059658944607, -0.020990164950489998, -0.018167978152632713, -0.029136208817362785, 0.02786572277545929, 0.006597375031560659, 0.06018448621034622, 0.029049508273601532, 0.07114718109369278, 0.015373238362371922, -0.0787193551659584, 0.06565319001674652, 0.029917174950242043, 0.029419180005788803, -0.04353157803416252, 0.05311138555407524, -0.10769280791282654, -0.0366608165204525, 0.04530547559261322, 0.009813718497753143, -0.013552230782806873, 0.02602376416325569, -0.008414994925260544, 0.03227970749139786, -0.005851674824953079, 0.06465329229831696, 0.00849085208028555, -0.08870351314544678, 0.007133286911994219, -0.007167296949774027, 0.0055702016688883305, 0.053475260734558105, -0.005578153301030397, 0.07178371399641037, -0.009085070341825485, -0.011029617860913277, 0.05748431384563446, -0.04370546340942383, 0.12726010382175446, 0.08980897814035416, 0.061957597732543945, -0.053587839007377625, 0.02891572192311287, 0.0788295641541481, -0.02212286926805973, 0.05138879641890526, 0.029708942398428917, 0.07816470414400101, 0.03963668271899223, -0.042785223573446274, 0.042003460228443146, 0.05603288859128952, 0.0070531354285776615, 0.13252857327461243, -0.09593722224235535, -0.003529309295117855, 0.03730042278766632, 0.0007494673482142389, 0.01515804510563612, 0.003446441376581788, 0.0027485089376568794, -0.010264807380735874, 0.06833035498857498, -0.11056709289550781, 0.007214222569018602, -0.016741979867219925, 0.034134551882743835, 0.002109586726874113, -0.01790131814777851, -0.03842401131987572, 0.008325082249939442, 0.09902356564998627, 0.032508183270692825, 0.07970065623521805, -0.041475098580121994, -0.053715672343969345, -0.018058376386761665, -0.012621843256056309, 0.022119875997304916, 0.05397859588265419, 0.1299743354320526, 0.09370646625757217, 0.08319253474473953, -0.009342583827674389, 0.07761842757463455, -0.017875071614980698, 0.005681342910975218, -0.01418071798980236, -0.027636300772428513, 0.010558318346738815, 0.02098931558430195, -0.038722071796655655, -0.06885036826133728, 0.013281605206429958, 0.018386567011475563, -0.02371854893863201, -0.0713857039809227, -0.01162925735116005, 0.050236865878105164, -0.010231660678982735, 0.07055539637804031, -0.06307504326105118, 0.0029024137184023857, -0.0016772174276411533, -0.015373414382338524, 0.05083936080336571, -0.04164637252688408, 0.025545071810483932, 0.004649160895496607, 0.0016621702816337347, 0.06658342480659485, -0.010138028301298618, 0.0068166619166731834, 0.08669695258140564, -0.044760435819625854, 0.002548184245824814, 0.053948450833559036, -0.053775254637002945, -0.044176168739795685, 0.006266195792704821, 0.009124808944761753, 0.011715631932020187, -0.09267570078372955, 0.03859366104006767, -0.061347562819719315, -1.2967813069271542e-33, -0.03342702239751816, 0.0003824024461209774, -0.11770021170377731, 0.09608360379934311, -0.04062869772315025, -0.08207226544618607, -0.06250183284282684, 0.11796996742486954, 0.0027693475130945444, 0.04158516973257065, -0.022778987884521484, 0.0378594696521759, 0.06576921790838242, -0.029770806431770325, -0.018815338611602783, -0.034430939704179764, 0.01924777589738369, 0.04338790476322174, 0.0021901780273765326, 0.025429168716073036, -0.026788195595145226, 0.07310685515403748, 0.019704990088939667, -0.027297845110297203, 0.06985171884298325, 0.050694167613983154, -0.011317635886371136, -0.09155616909265518, 0.03770038112998009, -0.016978275030851364, -0.0071046482771635056, 0.0029811118729412556, -0.11926315724849701, -0.030487483367323875, 0.05690942704677582, 0.030832869932055473, 0.03834203630685806, -0.09346535801887512, -0.04201970249414444, -0.017982831224799156, -0.05771403759717941, -0.0005488489405252039, -0.09172861278057098, 0.018121417611837387, 0.03256521373987198, -0.03195904567837715, -0.03084508515894413, -0.05222352594137192, -0.009622800163924694, -0.0013407198712229729, 0.016553914174437523, -0.034242648631334305, 0.018454670906066895, -0.05039755254983902, -0.12142517417669296, -0.022665761411190033, -0.04437453672289848, -0.03566940501332283, 0.047538354992866516, 0.031981997191905975, -0.004775371868163347, 0.03142274543642998, -0.15339858829975128, 0.052186861634254456, 0.0013003114145249128, 0.028524018824100494, -0.07663053274154663, -0.030956335365772247, 0.022450312972068787, 0.06342127174139023, -0.030243489891290665, -0.06149253994226456, 0.04594476521015167, -0.03947209194302559, 0.02512623369693756, 0.020077193155884743, -0.08099739998579025, -0.020638583227992058, -0.0032492545433342457, 0.02425280772149563, 0.01392350159585476, -0.015528501011431217, -0.006276826839894056, -0.022727850824594498, -0.06458652019500732, -0.10660179704427719, 0.028659701347351074, -0.03129409998655319, 0.025558296591043472, -0.009415483102202415, 0.06488518416881561, -0.014262603595852852, -0.007536023855209351, 0.010131933726370335, -0.022182151675224304, -1.8696611879475327e-34, 0.06727989763021469, -0.12080616503953934, 0.021112050861120224, -0.02792743220925331, -0.0766092985868454, 0.058214377611875534, 0.024095872417092323, -0.08877220004796982, 0.02521856687963009, -0.07859579473733902, 0.024468060582876205, -0.03325696289539337, 0.024517539888620377, -0.08947516232728958, 0.012861235067248344, 0.04000258073210716, -0.10951203852891922, 0.009989777579903603, 0.05139520391821861, 0.04498985409736633, -0.007672887295484543, 0.09956339001655579, 0.09989739954471588, 0.02480258420109749, -0.022187665104866028, 0.012103104032576084, 0.030835574492812157, -0.0017233056714758277, 0.12591716647148132, 0.04603947699069977, -0.09428980201482773, 0.0012982054613530636, 0.04267088696360588, -0.028346844017505646, -0.11160722374916077, 0.03403951972723007, -0.0381498821079731, 0.0075828442350029945, -0.023292211815714836, -0.10603507608175278, 0.03315809741616249, -0.008302988484501839, -0.10348931699991226, 0.09874947369098663, 0.030533665791153908, 0.05208178237080574, -0.05186915397644043, 0.051270741969347, -0.030190467834472656, 0.031414009630680084, 0.03148118034005165, -0.06376402825117111, -0.06542826443910599, -0.033746927976608276, -0.06293701380491257, 0.04412844404578209, 0.04134058579802513, -0.07988781481981277, -0.04877236485481262, 0.04429594427347183, -0.018416278064250946, -0.00963942613452673, 0.025168420746922493, -0.0051424275152385235, -0.02283284068107605, 0.07418999820947647, 0.008624203503131866, -0.0004921711515635252, 0.007512078154832125, 0.04042620211839676, 0.11346863955259323, 0.019969066604971886, 0.024256009608507156, -0.027235306799411774, 0.012380125001072884, -0.0055256993509829044, -0.030872177332639694, -0.04563901945948601, -0.01383688859641552, -0.006928962655365467, -0.05141204595565796, 0.04693682864308357, 0.010561543516814709, 0.051836587488651276, 0.06686779856681824, -0.012764801271259785, -0.01956627517938614, 0.08586455136537552, -0.06363235414028168, 0.026358289644122124, 0.036193449050188065, -0.10753319412469864, 0.04605087265372276, 0.10815135389566422, -0.004435067996382713, -2.3239966395749434e-08, 0.03884190320968628, -0.021691732108592987, 0.05208723619580269, -0.09156342595815659, -0.027292873710393906, 0.009448466822504997, -0.04119212180376053, 0.05148304998874664, 0.02262088842689991, 0.02819312922656536, 0.009718606248497963, 0.05629540979862213, -0.03456960991024971, 0.055446282029151917, 0.05816806107759476, 0.06930436939001083, -0.015903135761618614, 0.0028640958480536938, -0.04067258536815643, -0.03922021761536598, -0.08049102872610092, 0.0007689022459089756, 0.0019430327229201794, 0.07443679869174957, -0.01693020388484001, -0.0199452992528677, -0.012862611562013626, 0.03432939574122429, -0.05477883666753769, -0.045893166214227676, -0.0020638955757021904, 0.054258015006780624, 0.02695688232779503, 0.01288705412298441, 0.0249076709151268, 0.036524515599012375, 0.031960126012563705, -0.023419233039021492, -0.01913539133965969, -0.042368099093437195, 0.007343108765780926, -0.029085803776979446, -0.08661174029111862, -0.013648767955601215, -0.0009287151042371988, -0.00953744538128376, 0.03306378051638603, -0.038353171199560165, -0.03592788055539131, 0.051173947751522064, 0.006334523670375347, 0.022370917722582817, -0.041001833975315094, -0.004343647044152021, -0.10088328272104263, -0.0027154232375323772, -0.03203227370977402, -0.1299305260181427, -0.019273728132247925, -0.00439508818089962, 0.08482155203819275, -0.019124243408441544, 0.10718679428100586, -0.11500279605388641],
#     [-0.016362972557544708, 0.007456708233803511, -0.012515182606875896, -0.009106168523430824, 0.004233706276863813, -0.005323588382452726, 0.01613144762814045, 0.015863653272390366, 0.02018481306731701, -0.031852997839450836, 0.055063940584659576, -0.03619133681058884, -0.06211685389280319, 0.050112709403038025, -0.01292601227760315, -0.04305385798215866, -0.08218736201524734, 0.04394517093896866, -0.00021984901104588062, -0.05203979089856148, 0.008600826375186443, 0.04224924370646477, 0.016661962494254112, 0.014287199825048447, -0.019937360659241676, 0.0757220983505249, 0.045269351452589035, -0.0047583263367414474, -0.028713390231132507, -0.023191537708044052, -0.12691351771354675, 0.022487405687570572, -0.023499291390180588, -0.05291758477687836, 0.0137752341106534, 0.03955795615911484, -0.07175743579864502, -0.03477102145552635, 0.09512890875339508, 0.0030084180179983377, 0.09748794883489609, -0.0322720929980278, 0.06912007927894592, 0.05278467386960983, -0.08071133494377136, 0.055483315140008926, 0.010935302823781967, 0.06558562815189362, 0.03424913063645363, 0.0034760211128741503, -0.022812584415078163, -0.02731521800160408, -0.0586690716445446, 0.07792764902114868, 0.09157261252403259, 0.05853673815727234, -0.07237035036087036, 0.003341715084388852, -0.029291225597262383, 0.03429422155022621, -0.0227765291929245, 0.04081224277615547, -0.056158848106861115, -0.0035184414591640234, 0.09638296812772751, -0.0900377705693245, -0.002986496314406395, -0.01848519593477249, -0.012578628025949001, 0.012762791477143764, -0.049404699355363846, -0.08295811712741852, -0.05297832936048508, -0.004174712114036083, -0.02531346306204796, 0.044748689979314804, 0.02270904742181301, -0.026426995173096657, -9.823262371355668e-05, -0.03957414627075195, 0.044044800102710724, 0.033261463046073914, -0.006649347022175789, 0.02568094991147518, 0.040719229727983475, -0.053565725684165955, 0.07749580591917038, 0.09075231850147247, 0.05780600756406784, 0.04829121753573418, -0.013189376331865788, -0.05738937109708786, -0.033467091619968414, -0.07368195056915283, -0.08544236421585083, 0.00043851046939380467, -0.020087741315364838, 0.012433593161404133, -0.031945791095495224, -0.03862719610333443, 0.026616714894771576, 0.05004725605249405, 0.05364111810922623, -0.03373284265398979, -0.03186117485165596, -0.013934195041656494, -0.020975030958652496, 0.06875719130039215, -0.022145280614495277, 0.07854143530130386, 0.03657560795545578, -0.00708639295771718, 0.023886732757091522, -0.07888749241828918, 0.03603675961494446, 0.09720485657453537, -0.05858072265982628, 0.01880054362118244, -0.02088831178843975, 0.03136400878429413, -0.015587672591209412, -0.08029187470674515, -0.0388726070523262, 0.03914063423871994, -0.021805640310049057, 0.043190278112888336, -0.15442441403865814, -7.2063419445996285e-34, -0.020999925211071968, -0.008215491659939289, 0.002866010880097747, 0.004323762841522694, -0.009188947267830372, 0.005904492922127247, -0.05681636929512024, -0.030865542590618134, -0.08886361867189407, -0.004717838019132614, -0.05849289149045944, -0.06333950906991959, 0.04056360945105553, 0.014132948592305183, -0.02386675961315632, -0.032710395753383636, -0.029998956248164177, 0.014806155115365982, -0.004082476254552603, -0.023546354845166206, 0.019165286794304848, 0.09822506457567215, -0.008714732713997364, -0.05502675473690033, 0.045867711305618286, -0.01168668270111084, -0.061587560921907425, -0.009676474146544933, 0.043757613748311996, -0.003070789622142911, 0.025704657658934593, -0.04803096130490303, -0.06300441920757294, 0.003805931657552719, 0.011253890581429005, -0.09086726605892181, -0.08446428924798965, -0.09465063363313675, 0.042282603681087494, -0.046040236949920654, -0.020890675485134125, 0.029226204380393028, -0.04150696471333504, -0.05135106295347214, 0.04719895124435425, -0.09993110597133636, 0.03295685350894928, 0.08182254433631897, 0.05041754990816116, 0.05179336667060852, 0.057182278484106064, -0.06114133819937706, 0.09760744869709015, -0.05839579552412033, -0.06889999657869339, -0.004091305658221245, -2.5360481231473386e-05, 0.07307478785514832, -0.03586595878005028, 0.07455389946699142, 0.045875705778598785, 0.07234427332878113, -0.04082594811916351, -0.00978802889585495, 0.014843211509287357, 0.11401336640119553, -0.06854035705327988, -0.09039682894945145, 0.03858676552772522, 0.04000614956021309, -0.024897364899516106, -0.0438631996512413, 0.08305498957633972, 0.03548611328005791, -0.010675940662622452, 0.013204372487962246, -0.0049269902519881725, 0.041019994765520096, -0.0007290495559573174, 0.02078789845108986, -0.04273414984345436, 0.035792723298072815, 0.001470079063437879, 0.04308660328388214, -0.053111106157302856, -0.05076247453689575, 0.012349364347755909, 0.005252177361398935, -0.04495854675769806, -0.03944898024201393, 0.07922854274511337, 0.032868895679712296, -0.08899694681167603, 0.08052655309438705, 0.016395309939980507, -9.225808637711182e-34, 0.05720609799027443, -0.078212670981884, 0.080880306661129, -0.1913556009531021, -0.08990537375211716, -0.016175197437405586, 0.02237738110125065, -0.057209983468055725, 0.030757291242480278, -0.005215703044086695, 0.026432787999510765, 0.011961792595684528, -0.03446947783231735, -0.05191630497574806, -0.05915861576795578, -0.0623130239546299, -0.03328746557235718, 0.027694979682564735, 0.0018855984089896083, 0.030059292912483215, 0.01196423452347517, 0.08767600357532501, 0.06391273438930511, 0.04009155556559563, -0.06490280479192734, -0.046492960304021835, 0.08565135300159454, 0.07532362639904022, 0.042382918298244476, -0.032718658447265625, -0.005273687187582254, 0.030581220984458923, -0.024904534220695496, -0.009324455633759499, -0.07966025173664093, -0.027140412479639053, 0.043412256985902786, 0.05610845237970352, -0.009306230582296848, 0.010646414011716843, 0.056695356965065, 0.07339813560247421, 0.06286418437957764, 0.055989593267440796, 0.0903298407793045, 0.009731406345963478, 0.003981180023401976, 0.026627205312252045, -0.11859742552042007, -0.04107313230633736, -0.05167406052350998, -0.0589325837790966, 0.01864398457109928, -0.004233566578477621, 0.016624528914690018, 0.008101142011582851, -0.006813118699938059, -0.0016876563895493746, 0.08907347917556763, -0.07421815395355225, -0.03144174441695213, -0.07511047273874283, 0.10599086433649063, 0.04036732390522957, 0.03905963525176048, 0.01783601939678192, -0.05705977603793144, 0.0044901929795742035, -0.007656110916286707, 0.06240106374025345, 0.005363810807466507, 0.06957724690437317, -0.043832067400217056, 0.001011812244541943, -0.002950794994831085, 0.018899377435445786, -0.028763825073838234, -0.020844044163823128, 0.02780652604997158, -0.05035984516143799, -0.07093503326177597, 0.0018014851957559586, -0.11669867485761642, 0.016697930172085762, 0.037926118820905685, -0.024218671023845673, 0.0005438703810796142, 0.09003905206918716, 0.02720792032778263, 0.01309890765696764, 0.0029644491150975227, -0.005739590618759394, 0.02838529273867607, 0.07078244537115097, 0.014020105823874474, -2.216291150602956e-08, 0.050827302038669586, -0.07219550758600235, 0.04360320046544075, -0.0014456778299063444, 0.063474640250206, 0.14070117473602295, 0.04515455663204193, 0.06107436865568161, 0.09307245910167694, 0.0569881871342659, -0.014151413924992085, -0.01548247504979372, 0.03239869698882103, -0.03810623660683632, 0.01619674824178219, 0.09451426565647125, -0.06691993027925491, 0.026814818382263184, -0.015685997903347015, -0.0401998832821846, 0.04962964728474617, -0.02580789476633072, -0.013721141032874584, 0.0955582708120346, 0.028087114915251732, -0.043021753430366516, 0.009557556360960007, 0.0068112947046756744, -0.10647843033075333, -0.07720907777547836, 0.041259124875068665, 0.019584843888878822, 0.06700707972049713, -0.0625607892870903, -0.04183501377701759, 0.04857727885246277, 0.0054989829659461975, 0.010886363685131073, -0.03929752856492996, 0.03387024253606796, 0.0010798785369843245, -0.052008651196956635, -0.043622344732284546, -0.00910943653434515, 0.07470351457595825, 0.03608555719256401, -0.017969688400626183, 0.021895650774240494, 0.00807775929570198, -0.08252491801977158, 0.06512986123561859, 0.0011640515876933932, -0.03340568765997887, -0.10820312052965164, -0.09297367185354233, -0.0016254298388957977, -0.06305309385061264, 0.014417656697332859, 0.016874132677912712, 0.0010129234287887812, 0.09071899205446243, 0.04080946743488312, 0.08298735320568085, -0.007816317491233349],
# ]

In [156]:
umap_emb = reducer.fit_transform(x)

plt.scatter(
    umap_emb[:, 0],
    umap_emb[:, 1],
    s=10,
    alpha=0.5,
)

  warn(


Disconnection_distance = inf has removed 0 edges.
It has fully disconnected 2 vertices.
You might consider using find_disconnected_points() to find and remove these points from your data.
Use umap.utils.disconnected_vertices() to identify them.
  warn(


ValueError: zero-size array to reduction operation maximum which has no identity