In [92]:
import re
import spacy
import polars as pl
import numpy as np

from spacy.lang.en import STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

from tqdm.notebook import tqdm

pl.Config.set_tbl_cols(10)
pl.Config.set_tbl_rows(100)

polars.config.Config

### Import and Preprocessing

In [2]:
dataset = pl.read_csv("../../data/eval_dataset_2000.tsv", separator="\t")
dataset.head()

filename,patient_id,finding,anatomic_classification,possible_secondary,autogenerated,labeled
str,str,str,str,str,bool,bool
"""/home/khans24/charit/anatomy_n…","""p10394761""","""PA and lateral chest views wer…","""MISCELLANEOUS""",,False,True
"""/home/khans24/charit/anatomy_n…","""p10394761""","""Analysis is performed in direc…","""MISCELLANEOUS""",,False,True
"""/home/khans24/charit/anatomy_n…","""p10394761""","""There is mild cardiac enlargem…","""CARDIAC/CARDIOMEDIASTINALSILHO…",,False,True
"""/home/khans24/charit/anatomy_n…","""p10394761""","""There is a relative prominenc…","""CARDIAC/CARDIOMEDIASTINALSILHO…",,False,True
"""/home/khans24/charit/anatomy_n…","""p10394761""","""The thoracic aorta is general…","""CARDIAC/CARDIOMEDIASTINALSILHO…",,False,True


In [3]:
nlp = spacy.load("en_core_web_sm")

In [4]:
def preprocess_text(text: str) -> str:
    """ Perform text preprocessing on the corpus of text in the given column of the dataframe. """
    # Remove punctuation, replace with empty space and then replace all double or more spaces with one space
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s{2,}', ' ', text)
    return text
    # doc = nlp(text)
    # result = []
    # for sentence in doc.sentences:
    #     for word in sentence.words:
    #         if word.text.lower() not in stop_words and word.upos in ["NOUN", "VERB", "ADJ", "ADV"]:
    #             result.append(word.lemma.lower())

dataset = dataset.with_columns(
    pl.col("finding").map_elements(lambda s: preprocess_text(s), return_dtype=pl.String)
)

In [5]:
doc = list(nlp.pipe(dataset["finding"], disable=["tok2vec", "parser", "senter", "ner"]))

In [6]:
allsentences = []
for sentence in doc:
    allsentences.append(
        ' '.join([token.lemma_ for token in sentence if token.text not in STOP_WORDS])
    )

In [7]:
dataset = dataset.with_columns(pl.Series("preprocessed_finding", allsentences))

### Classification

One question that is relevant to the user experience and the goal of the project is the number of corrections or "moves."
If a normal user is correcting the AI output and moving through each of the tabs where the sentences have been sent to, how many would they fix?
This assumes that a user opens each tab and moves the sentences to the right tab and goes through each of the tabs in sequence.
This means that "false positives" are the corrections the user needs to make, as sentences that don't belong will be moved from one anatomic tab to the correct one.
The user will not be looking for "false negatives" or "missing" sentences for a particular class.

In [8]:
dataset = dataset.with_columns(
    pl.when(
        (pl.col("anatomic_classification") == "BOWEL") |
        (pl.col("anatomic_classification") == "SPLEEN") |
        (pl.col("anatomic_classification") == "GALLBLADDER") |
        (pl.col("anatomic_classification") == "STOMACH")
    )
        .then(pl.lit("ABDOMEN"))
        .otherwise(pl.col("anatomic_classification"))
    .alias("anatomic_classification")
)

dataset.select(
    pl.col("anatomic_classification").value_counts(sort=True)
).unnest("anatomic_classification")

anatomic_classification,count
str,u32
"""LUNG/PLEURA/LARGE AIRWAYS""",921
"""LINES/TUBES/DRAINS""",439
"""CARDIAC/CARDIOMEDIASTINALSILHO…",329
"""MISCELLANEOUS""",292
"""BONE AND SOFT TISSUE""",124
"""POSITIONING/LIMITATIONS""",99
"""ABDOMEN""",22
"""NECK""",4


In [9]:
le = LabelEncoder()
le.fit(dataset.filter(( pl.col("anatomic_classification") != "NECK" ) & (pl.col("anatomic_classification") != "ABDOMEN"))["anatomic_classification"])

In [10]:
le.classes_

array(['BONE AND SOFT TISSUE', 'CARDIAC/CARDIOMEDIASTINALSILHOUETTE',
       'LINES/TUBES/DRAINS', 'LUNG/PLEURA/LARGE AIRWAYS', 'MISCELLANEOUS',
       'POSITIONING/LIMITATIONS'], dtype='<U35')

In [108]:
encoder = TfidfVectorizer()
kf = KFold(n_splits=10, shuffle=True, random_state=1026)
filterset = dataset.filter(( pl.col("anatomic_classification") != "NECK" ) & (pl.col("anatomic_classification") != "ABDOMEN"))
clf = LogisticRegression()
firstrows = []
for idx, (tridx, tsidx) in enumerate(kf.split(filterset, groups=filterset["anatomic_classification"])):
    train = filterset.select(
        pl.col("anatomic_classification").gather(tridx),
        pl.col("preprocessed_finding").gather(tridx)
    ).with_columns(
        pl.col("anatomic_classification").map_elements(lambda s: le.transform([s])[0], return_dtype=pl.Int32)
        .alias("anatomic_classification")
    )
    test = filterset.select(
        pl.col("anatomic_classification").gather(tsidx),
        pl.col("preprocessed_finding").gather(tsidx)
    ).with_columns(
        pl.col("anatomic_classification").map_elements(lambda s: le.transform([s])[0], return_dtype=pl.Int32)
        .alias("anatomic_classification")
    )

    train_encodings = encoder.fit_transform(train["preprocessed_finding"])
    test_encodings = encoder.transform(test["preprocessed_finding"])
    y_true = test["anatomic_classification"]

    clf.fit(train_encodings, train["anatomic_classification"])
    y_pred = clf.predict(test_encodings)
    y_probas = clf.predict_proba(test_encodings)
    acc = accuracy_score(y_true, y_pred)
    cf = classification_report(y_true, y_pred, target_names=le.classes_, output_dict=True)

    for k, v in cf.items():
        if k not in le.classes_:
            continue
        v.update({"fold": idx})
        v.update({"accuracy": acc})
        v.update({"class": k})
        firstrows.append(v)


In [109]:
(
    pl.DataFrame(firstrows).
    group_by("class").
    mean().
    select(pl.col(["class", "precision", "recall", "f1-score", "accuracy"]))
    .with_columns( (1 - pl.col("precision")).alias("move_rate"))
    .sort("move_rate")
)

class,precision,recall,f1-score,accuracy,move_rate
str,f64,f64,f64,f64,f64
"""LINES/TUBES/DRAINS""",0.927892,0.904264,0.915095,0.857577,0.072108
"""BONE AND SOFT TISSUE""",0.918687,0.636426,0.747271,0.857577,0.081313
"""CARDIAC/CARDIOMEDIASTINALSILHO…",0.91277,0.754967,0.824216,0.857577,0.08723
"""LUNG/PLEURA/LARGE AIRWAYS""",0.824128,0.964356,0.888417,0.857577,0.175872
"""POSITIONING/LIMITATIONS""",0.82373,0.717851,0.759879,0.857577,0.17627
"""MISCELLANEOUS""",0.800564,0.704122,0.746008,0.857577,0.199436


In [111]:
(
    pl.DataFrame(firstrows).
    group_by("class").
    mean().
    select(pl.col(["class", "precision", "recall", "f1-score", "accuracy"]))
    .with_columns( (1 - pl.col("precision")).alias("move_rate"))
    .sort("move_rate")
).select(pl.col(["precision", "recall", "f1-score", "accuracy", "move_rate"])).mean()

precision,recall,f1-score,accuracy,move_rate
f64,f64,f64,f64,f64
0.867962,0.780331,0.813481,0.857577,0.132038


The broadest categories seem to collect the most junk, essentially.
Of course, we can try to see what happens if we use a more sophisticated classifier than `LogisticRegression`

In [103]:
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier

encoder = TfidfVectorizer()
kf = KFold(n_splits=10, shuffle=True, random_state=1026)
filterset = dataset.filter(( pl.col("anatomic_classification") != "NECK" ) & (pl.col("anatomic_classification") != "ABDOMEN"))
clf_list = [
    RidgeClassifier(),
    GaussianNB(),
    MLPClassifier(),
    SVC(),
    DecisionTreeClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    RandomForestClassifier()
]
rows = []
for clf in tqdm(clf_list):
    for idx, (tridx, tsidx) in enumerate(kf.split(filterset, groups=filterset["anatomic_classification"])):
        train = filterset.select(
            pl.col("anatomic_classification").gather(tridx),
            pl.col("preprocessed_finding").gather(tridx)
        ).with_columns(
            pl.col("anatomic_classification").map_elements(lambda s: le.transform([s])[0], return_dtype=pl.Int32)
            .alias("anatomic_classification")
        )
        test = filterset.select(
            pl.col("anatomic_classification").gather(tsidx),
            pl.col("preprocessed_finding").gather(tsidx)
        ).with_columns(
            pl.col("anatomic_classification").map_elements(lambda s: le.transform([s])[0], return_dtype=pl.Int32)
            .alias("anatomic_classification")
        )

        train_encodings = encoder.fit_transform(train["preprocessed_finding"])
        test_encodings = encoder.transform(test["preprocessed_finding"])
        y_true = test["anatomic_classification"]

        clf.fit(train_encodings.toarray(), train["anatomic_classification"])
        y_pred = clf.predict(test_encodings.toarray())
        acc = accuracy_score(y_true, y_pred)
        cf = classification_report(y_true, y_pred, target_names=le.classes_, output_dict=True)
        rows.append(
            {
                "accuracy": acc,
                "classifier": clf.__class__.__name__,
                **cf["weighted avg"]
            }
        )

  0%|          | 0/8 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [105]:
(
    pl.DataFrame(rows).
    group_by("classifier").
    mean().
    select(pl.col(["classifier", "precision", "recall", "f1-score", "accuracy"]))
    .with_columns( (1 - pl.col("precision")).alias("move_rate"))
    .sort("move_rate")
)

classifier,precision,recall,f1-score,accuracy,move_rate
str,f64,f64,f64,f64,f64
"""RidgeClassifier""",0.874286,0.872094,0.8703,0.872094,0.125714
"""RandomForestClassifier""",0.863689,0.856185,0.856321,0.856185,0.136311
"""SVC""",0.860406,0.851213,0.846813,0.851213,0.139594
"""MLPClassifier""",0.853963,0.852143,0.850907,0.852143,0.146037
"""GradientBoostingClassifier""",0.849066,0.84168,0.838786,0.84168,0.150934
"""DecisionTreeClassifier""",0.819104,0.810821,0.811639,0.810821,0.180896
"""GaussianNB""",0.710866,0.675609,0.680575,0.675609,0.289134
"""AdaBoostClassifier""",0.638514,0.597645,0.558981,0.597645,0.361486


Surprising that linear classifiers seem to work very well with TF-IDF.

### Deep Learning Classification

Evaluating how well deep learning models work with this problem