In [1]:
import re
import spacy
import polars as pl
import numpy as np

from spacy.lang.en import STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

pl.Config.set_tbl_cols(10)
pl.Config.set_tbl_rows(100)

polars.config.Config

### Import and Preprocessing

In [2]:
dataset = pl.read_csv("../../data/eval_dataset_2000.tsv", separator="\t")
dataset.head()

filename,patient_id,finding,anatomic_classification,possible_secondary,autogenerated,labeled
str,str,str,str,str,bool,bool
"""/home/khans24/charit/anatomy_n…","""p10394761""","""PA and lateral chest views wer…","""MISCELLANEOUS""",,False,True
"""/home/khans24/charit/anatomy_n…","""p10394761""","""Analysis is performed in direc…","""MISCELLANEOUS""",,False,True
"""/home/khans24/charit/anatomy_n…","""p10394761""","""There is mild cardiac enlargem…","""CARDIAC/CARDIOMEDIASTINALSILHO…",,False,True
"""/home/khans24/charit/anatomy_n…","""p10394761""","""There is a relative prominenc…","""CARDIAC/CARDIOMEDIASTINALSILHO…",,False,True
"""/home/khans24/charit/anatomy_n…","""p10394761""","""The thoracic aorta is general…","""CARDIAC/CARDIOMEDIASTINALSILHO…",,False,True


In [3]:
nlp = spacy.load("en_core_web_sm")

In [4]:
def preprocess_text(text: str) -> str:
    """ Perform text preprocessing on the corpus of text in the given column of the dataframe. """
    # Remove punctuation, replace with empty space and then replace all double or more spaces with one space
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s{2,}', ' ', text)
    return text
    # doc = nlp(text)
    # result = []
    # for sentence in doc.sentences:
    #     for word in sentence.words:
    #         if word.text.lower() not in stop_words and word.upos in ["NOUN", "VERB", "ADJ", "ADV"]:
    #             result.append(word.lemma.lower())

dataset = dataset.with_columns(
    pl.col("finding").map_elements(lambda s: preprocess_text(s), return_dtype=pl.String)
)

In [5]:
doc = list(nlp.pipe(dataset["finding"], disable=["tok2vec", "parser", "senter", "ner"]))

In [6]:
allsentences = []
for sentence in doc:
    allsentences.append(
        ' '.join([token.lemma_ for token in sentence if token.text not in STOP_WORDS])
    )

In [7]:
dataset = dataset.with_columns(pl.Series("preprocessed_finding", allsentences))

### Classification

In [8]:
dataset = dataset.with_columns(
    pl.when(
        (pl.col("anatomic_classification") == "BOWEL") |
        (pl.col("anatomic_classification") == "SPLEEN") |
        (pl.col("anatomic_classification") == "GALLBLADDER") |
        (pl.col("anatomic_classification") == "STOMACH")
    )
        .then(pl.lit("ABDOMEN"))
        .otherwise(pl.col("anatomic_classification"))
    .alias("anatomic_classification")
)

dataset.select(
    pl.col("anatomic_classification").value_counts(sort=True)
).unnest("anatomic_classification")

anatomic_classification,count
str,u32
"""LUNG/PLEURA/LARGE AIRWAYS""",921
"""LINES/TUBES/DRAINS""",439
"""CARDIAC/CARDIOMEDIASTINALSILHO…",329
"""MISCELLANEOUS""",292
"""BONE AND SOFT TISSUE""",124
"""POSITIONING/LIMITATIONS""",99
"""ABDOMEN""",22
"""NECK""",4


In [9]:
le = LabelEncoder()
le.fit(dataset.filter(( pl.col("anatomic_classification") != "NECK" ) & (pl.col("anatomic_classification") != "ABDOMEN"))["anatomic_classification"])

In [10]:
le.classes_

array(['BONE AND SOFT TISSUE', 'CARDIAC/CARDIOMEDIASTINALSILHOUETTE',
       'LINES/TUBES/DRAINS', 'LUNG/PLEURA/LARGE AIRWAYS', 'MISCELLANEOUS',
       'POSITIONING/LIMITATIONS'], dtype='<U35')

In [12]:
encoder = TfidfVectorizer()
kf = KFold(n_splits=10, shuffle=True, random_state=1026)
filterset = dataset.filter(( pl.col("anatomic_classification") != "NECK" ) & (pl.col("anatomic_classification") != "ABDOMEN"))
clf = LogisticRegression()
accuracies = []
reports = []
for tridx, tsidx in kf.split(filterset, groups=filterset["anatomic_classification"]):
    train = filterset.select(
        pl.col("anatomic_classification").gather(tridx),
        pl.col("preprocessed_finding").gather(tridx)
    ).with_columns(
        pl.col("anatomic_classification").map_elements(lambda s: le.transform([s])[0], return_dtype=pl.Int32)
        .alias("anatomic_classification")
    )
    test = filterset.select(
        pl.col("anatomic_classification").gather(tsidx),
        pl.col("preprocessed_finding").gather(tsidx)
    ).with_columns(
        pl.col("anatomic_classification").map_elements(lambda s: le.transform([s])[0], return_dtype=pl.Int32)
        .alias("anatomic_classification")
    )

    train_encodings = encoder.fit_transform(train["preprocessed_finding"])
    test_encodings = encoder.transform(test["preprocessed_finding"])
    y_true = test["anatomic_classification"]

    clf.fit(train_encodings, train["anatomic_classification"])
    y_pred = clf.predict(test_encodings)
    y_probas = clf.predict_proba(test_encodings)
    accuracies.append(accuracy_score(y_true, y_pred))
    reports.append(classification_report(y_true, y_pred, target_names=le.classes_))

    print(classification_report(y_true, y_pred, target_names=le.classes_))


                                     precision    recall  f1-score   support

               BONE AND SOFT TISSUE       1.00      0.56      0.72        16
CARDIAC/CARDIOMEDIASTINALSILHOUETTE       0.76      0.79      0.77        28
                 LINES/TUBES/DRAINS       0.90      0.86      0.88        50
          LUNG/PLEURA/LARGE AIRWAYS       0.78      0.98      0.86        85
                      MISCELLANEOUS       0.85      0.53      0.65        32
            POSITIONING/LIMITATIONS       0.75      0.60      0.67        10

                           accuracy                           0.81       221
                          macro avg       0.84      0.72      0.76       221
                       weighted avg       0.83      0.81      0.81       221

                                     precision    recall  f1-score   support

               BONE AND SOFT TISSUE       0.91      0.71      0.80        14
CARDIAC/CARDIOMEDIASTINALSILHOUETTE       0.92      0.82      0.87      

In [13]:
np.mean(np.array(accuracies))

0.8575771287535993