In [6]:
import os
import xml.etree.ElementTree as ET
import pandas as pd
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
# import gensim
# from gensim.summarization import summarize

from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score , confusion_matrix

In [7]:
def parse_xml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    text = root.find('TEXT').text.strip()
    tags = {tag.tag: tag.attrib['met'] for tag in root.find('TAGS')}
    return text, tags
xml_dir = 'n2c2/n2c2/part1'
data = []

# XML files
for file_name in os.listdir(xml_dir):
    if file_name.endswith('.xml'):
        file_path = os.path.join(xml_dir, file_name)
        text, tags = parse_xml(file_path)
        Choosen_tags = {key: tags[key] for key in ['ABDOMINAL', 'CREATININE', 'MAJOR-DIABETES']}
        Choosen_tags['text'] = text
        data.append(Choosen_tags)

# Convert the list to a pandas DataFrame
df = pd.DataFrame(data)
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\smrh1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\smrh1\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\smrh1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
#preprocessing
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenization
    words = nltk.word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

# Apply preprocessing to the text
df['clean_text'] = df['text'].apply(preprocess_text)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['clean_text'])
# changing_labels
df['ABDOMINAL_encoded'] = df['ABDOMINAL'].apply(lambda x: 1 if x == 'met' else 0)
df['CREATININE_encoded'] = df['CREATININE'].apply(lambda x: 1 if x == 'met' else 0)
df['MAJOR-DIABETES_encoded'] = df['MAJOR-DIABETES'].apply(lambda x: 1 if x == 'met' else 0)

# labels to a single DataFrame
y = df[['ABDOMINAL_encoded', 'CREATININE_encoded', 'MAJOR-DIABETES_encoded']]

# Spliting data to train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Train Linear SVM
model = OneVsRestClassifier(LinearSVC())
model.fit(X_train, y_train)



In [9]:
y_pred = model.predict(X_test)

In [10]:
report = classification_report(y_test, y_pred, output_dict=True)

report_df = pd.DataFrame(report).transpose()

# Calculating mean and standard deviation for precision, recall, f1-score
metrics = ['precision', 'recall', 'f1-score']
mean_std = report_df[metrics].apply([np.mean, np.std])

# accuracy
accuracy = accuracy_score(y_test, y_pred)

# Create a DataFrame for accuracy
accuracy_df = pd.DataFrame({'accuracy': [accuracy]})

# Print the classification report
print(classification_report(y_test, y_pred))

# confusion matrix
for i, label in enumerate(['ABDOMINAL', 'CREATININE', 'MAJOR-DIABETES']):
    print(f'Confusion matrix for {label}:')
    print(confusion_matrix(y_test.iloc[:, i], y_pred[:, i]))

# Print the mean and standard deviation of the classification report metrics including accuracy
print("Mean and Standard Deviation of Metrics:")
print(mean_std)
print("\nAccuracy:")
print(accuracy_df)

              precision    recall  f1-score   support

           0       0.71      0.56      0.62         9
           1       0.50      0.30      0.38        10
           2       0.71      0.77      0.74        13

   micro avg       0.67      0.56      0.61        32
   macro avg       0.64      0.54      0.58        32
weighted avg       0.65      0.56      0.59        32
 samples avg       0.47      0.44      0.43        32

Confusion matrix for ABDOMINAL:
[[10  2]
 [ 4  5]]
Confusion matrix for CREATININE:
[[8 3]
 [7 3]]
Confusion matrix for MAJOR-DIABETES:
[[ 4  4]
 [ 3 10]]
Mean and Standard Deviation of Metrics:
      precision    recall  f1-score
mean   0.621953  0.532556  0.564577
std    0.098849  0.142435  0.124325

Accuracy:
   accuracy
0  0.333333


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
import joblib
joblib.dump(model, 'Linear_SVC_model.pkl')

['Linear_SVC_model.pkl']