I downloaded the bioinformatics files from the [Stack Exchange archives](https://archive.org/download/stackexchange) and pulled out the 'posts.xml' file, renaming it to 'bioinformatics_Posts.xml'.

In [1]:
import os
# List the XML dumps in the current working directory.
# Matching on '.xml' (with the dot) avoids picking up names that merely end in
# the letters "xml"; sorting makes the listing order reproducible, since
# os.listdir() returns entries in arbitrary order.
xml_files = sorted(f for f in os.listdir() if f.endswith('.xml'))
xml_files

['bioinformatics_Posts.xml']

In [2]:
import xml.etree.ElementTree as ET
import html # for html.unescape()
import re
from bs4 import BeautifulSoup # for soup.get_text()
import pandas as pd

def safe_unescape(html_raw):
    """Decode HTML character references in a string, treating None as empty.

    Args:
        html_raw: Text that may contain HTML entities such as ``&lt;`` or
            ``&amp;``, or ``None``.

    Returns:
        ``html_raw`` with entities replaced by the characters they stand for,
        or ``''`` when ``html_raw`` is ``None``.
    """
    # Identity check instead of comparing the string form of the type.
    if html_raw is None:
        return ''
    return html.unescape(html_raw)


def list_tags(tags_str):
    """Turn a ``<tag1><tag2>`` string into ``tag1;tag2``.

    The first and last characters are always dropped, then every ``><``
    boundary becomes a ``;``. A string that does not use ``><`` between tags
    (for example a pipe-delimited ``|a|b|``) is therefore returned with only
    its outer characters removed.

    Args:
        tags_str: The delimited tag string; may be empty.

    Returns:
        The tags as a single string.
    """
    inner = tags_str[1:-1]
    return inner.replace('><', ';')


def get_html_text(html_raw):
    """Return the visible text of an HTML fragment with the markup removed.

    The fragment is handed to BeautifulSoup as-is. It is deliberately NOT
    entity-decoded first: decoding before parsing turns literal text such as
    ``&lt;seq&gt;`` into a real ``<seq>`` tag, which ``get_text()`` then
    discards. The parser resolves entities itself while extracting text.

    Args:
        html_raw: HTML markup as a string, or ``None``.

    Returns:
        The text content of the fragment, or ``''`` when ``html_raw`` is
        ``None``.
    """
    if html_raw is None:
        return ''
    soup = BeautifulSoup(html_raw, 'html.parser')
    return soup.get_text()


In [3]:
def extract_posts(xml_file):
    """Read a posts XML dump and return its tagged posts as a DataFrame.

    The file is scanned line by line. Every line whose first non-blank text
    is ``<row `` is parsed on its own as an XML element, so each row must be
    a complete element on a single line. Rows whose tag list comes out empty
    are skipped.

    Args:
        xml_file: Path of the XML file. The text before the first ``_`` in
            this string is used as the value of the ``topic`` column.

    Returns:
        A pandas DataFrame with the columns ``topic``, ``title``, ``body``,
        ``text`` (``title + ': ' + body``), ``tags`` and
        ``last_activity_date``; empty (no columns) if no row qualifies.
    """
    topic = xml_file.split('_')[0]
    data_rows = []

    with open(xml_file, encoding='utf-8') as xml_file_handle:
        for line in xml_file_handle:
            # Accept any indentation before the row element rather than
            # requiring exactly two leading spaces.
            if not line.lstrip().startswith('<row '):
                continue

            attrs = ET.fromstring(line).attrib

            tags = list_tags(safe_unescape(attrs.get('Tags')))
            if not tags:
                # No tags on this row; leave it out of the result.
                continue

            title = get_html_text(attrs.get('Title'))
            body = get_html_text(attrs.get('Body'))

            data_rows.append({
                'topic': topic,
                'title': title,
                'body': body,
                'text': title + ': ' + body,
                'tags': tags,
                'last_activity_date': attrs.get('LastActivityDate'),
            })

    return pd.DataFrame(data_rows)


In [4]:
# Build the table of tagged posts from the renamed dump and display it.
post_data = extract_posts('bioinformatics_Posts.xml')
post_data

  soup = BeautifulSoup(html_str, 'html.parser')


Unnamed: 0,topic,title,body,text,tags,last_activity_date
0,bioinformatics,What's the most efficient file format for the ...,I'd like to learn which format is most commonl...,What's the most efficient file format for the ...,human-genome|storage|file-formats,2022-02-23T16:01:18.060
1,bioinformatics,Accuracy of the original human DNA datasets se...,The Human Genome Project was the project of 'd...,Accuracy of the original human DNA datasets se...,hgp|phylogenetics,2017-12-28T11:43:48.257
2,bioinformatics,Mapping drug names to ATC codes,I'm interested working with the medication inf...,Mapping drug names to ATC codes: I'm intereste...,drugs|ontology,2017-05-16T19:36:42.037
3,bioinformatics,What are the optimal parameters for docking a ...,I'm looking to dock a large ligand (~90kDa) to...,What are the optimal parameters for docking a ...,proteins|docking,2017-05-18T00:01:22.427
4,bioinformatics,"What is the difference between FASTA, FASTQ, a...",I'd like to learn the differences between 3 co...,"What is the difference between FASTA, FASTQ, a...",fasta|fastq|file-formats|sam,2022-08-18T18:58:12.410
...,...,...,...,...,...,...
6088,bioinformatics,How to do post hoc comparisons after a repeate...,I have a data set of several samples with thei...,How to do post hoc comparisons after a repeate...,statistics|modelling|graphs|lme4,2024-04-01T17:10:57.933
6089,bioinformatics,How do I get the gene annotations as a text fi...,Also posted on biostars\nI have a genbank file...,How do I get the gene annotations as a text fi...,gene|sequence-annotation,2024-03-29T17:59:05.480
6090,bioinformatics,"I'm trying to run aTRAM tool for assembly, but...",Code:\n$ python3 atram.py --max-processes=12 \...,"I'm trying to run aTRAM tool for assembly, but...",ngs|assembly,2024-04-04T20:50:57.923
6091,bioinformatics,Dante Labs VCF: UPDATE: SOLVED,Has anyone tried to analyse the VCF files prov...,Dante Labs VCF: UPDATE: SOLVED: Has anyone tri...,assembly|chromosomes,2024-03-30T15:55:38.210


In [5]:
# Save the extracted posts as CSV, leaving out the DataFrame index.
post_data.to_csv('bioinformatics_posts.csv', index=False)

In [6]:
# %pip install sentence-transformers

from sentence_transformers import SentenceTransformer
# Load the sentence-embedding model identified by this model id.
sentxformer = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Encode the 'text' column and store the result, converted with .tolist(),
# as a new 'vector' column. This adds the column to post_data in place.
post_data['vector'] = sentxformer.encode(post_data['text'].values).tolist()
    
# NOTE(review): "feaurized" in the file name looks like a typo for
# "featurized"; left unchanged in case other code reads this path - confirm.
post_data.to_parquet('bioinformatics_posts_feaurized.parquet')



In [None]:
# Display the table again, now including the 'vector' column.
post_data

Unnamed: 0,topic,title,body,text,tags,last_activity_date,vector
0,bioinformatics,What's the most efficient file format for the ...,I'd like to learn which format is most commonl...,What's the most efficient file format for the ...,human-genome|storage|file-formats,2022-02-23T16:01:18.060,"[0.01382493320852518, -0.001170836272649467, -..."
1,bioinformatics,Accuracy of the original human DNA datasets se...,The Human Genome Project was the project of 'd...,Accuracy of the original human DNA datasets se...,hgp|phylogenetics,2017-12-28T11:43:48.257,"[0.04499347507953644, 0.06613805145025253, -0...."
2,bioinformatics,Mapping drug names to ATC codes,I'm interested working with the medication inf...,Mapping drug names to ATC codes I'm interested...,drugs|ontology,2017-05-16T19:36:42.037,"[-0.012040533125400543, 0.025775786489248276, ..."
3,bioinformatics,What are the optimal parameters for docking a ...,I'm looking to dock a large ligand (~90kDa) to...,What are the optimal parameters for docking a ...,proteins|docking,2017-05-18T00:01:22.427,"[0.00028346164617687464, -0.06295959651470184,..."
4,bioinformatics,"What is the difference between FASTA, FASTQ, a...",I'd like to learn the differences between 3 co...,"What is the difference between FASTA, FASTQ, a...",fasta|fastq|file-formats|sam,2022-08-18T18:58:12.410,"[0.009203458204865456, -0.10056201368570328, 0..."
...,...,...,...,...,...,...,...
6088,bioinformatics,How to do post hoc comparisons after a repeate...,I have a data set of several samples with thei...,How to do post hoc comparisons after a repeate...,statistics|modelling|graphs|lme4,2024-04-01T17:10:57.933,"[-0.022957181558012962, -0.0423324778676033, 4..."
6089,bioinformatics,How do I get the gene annotations as a text fi...,Also posted on biostars\nI have a genbank file...,How do I get the gene annotations as a text fi...,gene|sequence-annotation,2024-03-29T17:59:05.480,"[-0.021438077092170715, -0.06102913245558739, ..."
6090,bioinformatics,"I'm trying to run aTRAM tool for assembly, but...",Code:\n$ python3 atram.py --max-processes=12 \...,"I'm trying to run aTRAM tool for assembly, but...",ngs|assembly,2024-04-04T20:50:57.923,"[0.0094567546620965, -0.04444451630115509, -0...."
6091,bioinformatics,Dante Labs VCF: UPDATE: SOLVED,Has anyone tried to analyse the VCF files prov...,Dante Labs VCF: UPDATE: SOLVED Has anyone trie...,assembly|chromosomes,2024-03-30T15:55:38.210,"[0.03195073455572128, -0.06578633189201355, 0...."
