In [9]:
# Import necessary libraries
import pandas as pd
import numpy as np
import scipy
import joblib
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import os

In [10]:
# Fetch the WordNet corpus that NLTK's lemmatizer reads from; the call only
# reports "already up-to-date" when the package is cached locally
nltk.download('wordnet')

# Notebook-level lemmatizer instance, reused by lemmatize_text for every token
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sayoojcyriac/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
# Helper applied to every StudyDescription cell
def lemmatize_text(text):
    """Return ``text`` with each whitespace-separated token lemmatized.

    Args:
        text: A cell value from the StudyDescription column. Anything that is
            not a ``str`` (for example a missing value) is treated as empty.

    Returns:
        The lemmatized tokens joined by single spaces, or ``''`` when ``text``
        is not a string.
    """
    # Non-string cells carry no text to process
    if not isinstance(text, str):
        return ''
    # split() with no argument breaks on runs of whitespace, so the result is
    # re-joined with exactly one space between tokens.
    # NOTE(review): the sample descriptions are upper-case; confirm that the
    # WordNet lemmatizer actually changes such tokens (its lookups may be
    # case-sensitive).
    return ' '.join(lemmatizer.lemmatize(token) for token in text.split())

In [12]:
# Load the labelled study descriptions from an Excel workbook into a DataFrame.
# The path is relative to the notebook's working directory; point file_path at
# your own copy of the dataset if it lives elsewhere.
file_path = './ExamTypeIdentifier.xlsx'  # Replace with your actual file path
data = pd.read_excel(file_path)

In [13]:
# Preview the first few rows of the data.
# The frame is left as the cell's last expression so the notebook renders it
# as a table rather than the truncated plain text that print() produces.
print("First 5 rows of the dataset:")
data.head()

First 5 rows of the dataset:
                                    StudyDescription Modality  OutcomeClass
0  CT CT CHEST INFUSION PULM EMBOL CT INFUSION PU...       CT      CT_CHEST
1  CT PULM CT CHEST INFUSION PULM EMBOL CT CHEST ...       CT  CT_PULMONARY
2                 CT INFUSION PULM CT ABD CT ABDO CT       CT    CT_ABDOMEN
3    CT PUL CT ABD CT INFUSION PULM EMBOLCT CHEST CT       CT  CT_PULMONARY
4  CT ABDOM CT ABD CT CHEST INFUSION PULM EMBOL C...       CT    CT_ABDOMEN


In [16]:
# Fetch the Open Multilingual WordNet data before lemmatizing.
# `nltk` is already imported with the other libraries at the top of the
# notebook, and 'wordnet' is downloaded where the lemmatizer is created, so
# neither is repeated here.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/sayoojcyriac/nltk_data...
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sayoojcyriac/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [17]:
# Replace every StudyDescription with its lemmatized form (non-string cells
# become ''). This runs on the DataFrame itself, outside the sklearn pipeline
# built later, so the saved pipeline does not repeat it at prediction time.
data['StudyDescription'] = data['StudyDescription'].map(lemmatize_text)

In [18]:
# Model inputs: the description text plus the modality column
X = data.loc[:, ['StudyDescription', 'Modality']]
# Prediction target: the OutcomeClass label
y = data.loc[:, 'OutcomeClass']

In [19]:
# Hold out part of the rows for evaluation (tune test_size as needed).
# The split is stratified on the label and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [20]:
# Per-column feature extraction, combined by a ColumnTransformer:
#   - StudyDescription -> TF-IDF vectors capped at 500 vocabulary terms
#   - Modality         -> one-hot indicators; unknown categories are ignored
# The text column is selected with a bare string and the categorical column
# with a list, matching the two selector forms used in the original cell.
preprocessor = ColumnTransformer([
    ('tfidf', TfidfVectorizer(max_features=500), 'StudyDescription'),
    ('onehot', OneHotEncoder(handle_unknown='ignore'), ['Modality']),
])

In [21]:
# Chain feature extraction and the classifier into a single estimator, so that
# fit/predict run the preprocessing and LogisticRegression together
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000)),
])

In [22]:
# Fit the whole pipeline on the training split: the preprocessor step is
# fitted on X_train and its output feeds the LogisticRegression classifier.
# As the cell's last expression, the fitted Pipeline is echoed as output.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('tfidf',
                                                  TfidfVectorizer(max_features=500),
                                                  'StudyDescription'),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Modality'])])),
                ('classifier',
                 LogisticRegression(max_iter=1000, random_state=42))])

In [23]:
# Predict an OutcomeClass label for every held-out row; the pipeline runs its
# fitted preprocessing step before the classifier
y_pred = pipeline.predict(X_test)

In [24]:
# Summarise per-class precision / recall / F1 on the held-out split as text
report = classification_report(y_true=y_test, y_pred=y_pred)
# Heading and table are emitted by one print call, separated by a newline
print("\nClassification Report for LogisticRegression:", report, sep="\n")


Classification Report for LogisticRegression:
              precision    recall  f1-score   support

  CT_ABDOMEN       0.60      0.75      0.67         4
    CT_CHEST       0.67      0.67      0.67         3
CT_PULMONARY       0.00      0.00      0.00         3

    accuracy                           0.50        10
   macro avg       0.42      0.47      0.44        10
weighted avg       0.44      0.50      0.47        10



In [25]:
# Persist the fitted pipeline (preprocessing + classifier) with joblib.
# Lemmatization is applied to the DataFrame before training and is not part
# of the saved object.
model_file_path = 'trained_logistic_model.pkl'  # You can change the path as needed
joblib.dump(value=pipeline, filename=model_file_path)
print(f"\nTrained model and preprocessors saved to '{model_file_path}'")


Trained model and preprocessors saved to 'trained_logistic_model.pkl'
