In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import to_categorical

In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
import string

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pouri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pouri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pouri\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
from tensorflow.keras.models import load_model

# Load the model
model = load_model("PICO_model.keras")

  saveable.load_own_variables(weights_store.get(inner_path))


In [8]:
# Initialize the lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function to clean and preprocess text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation.replace('.', '').replace('!', '').replace('?', '')))
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stop words and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
#     tokens = [word for word in tokens if word not in stop_words]
    
    # Join tokens back to string
    return ' '.join(tokens)


In [9]:
def parse_pico_dataset(filepath):
    data = []
    with open(filepath, 'r') as file:
        for line in file:
            if '|' in line:
                name, label, text = line.strip().split('|', 2)
                
                if label in ['P', 'I', 'C', 'O']:  # Filter only PICO elements
                    clean_text = preprocess_text(text)
#                     if label == 'R':
#                         label = 'O'
                    data.append({'label': label, 'text': clean_text})
    return pd.DataFrame(data)

# Load the dataset
df = parse_pico_dataset('.\\PICO\\data\\splitted\\PICO_train_relabelled.txt')

In [10]:
len(df[df['label']=='P']), len(df[df['label']=='I']), len(df[df['label']=='C']), len(df[df['label']=='O'])

(22347, 19864, 21745, 26229)

In [11]:
# Label encoding
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Text preprocessing
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(df['text'])
vocab_size = len(tokenizer.word_index) + 1

# Convert text to sequences and pad them
sequences = tokenizer.texts_to_sequences(df['text'])
maxlen = 100
X = pad_sequences(sequences, maxlen=maxlen, padding='post')

# Labels as categorical data
y = to_categorical(df['label'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")


[1m564/564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 126ms/step - accuracy: 0.8745 - loss: 0.4230
Test Accuracy: 0.87


In [18]:
model.predict(X_test[:10])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step


array([[1.54898646e-06, 3.25949077e-04, 1.67259554e-04, 9.99505162e-01],
       [1.23752328e-03, 6.58587039e-01, 1.43010691e-02, 3.25874388e-01],
       [3.26739112e-03, 3.74726579e-02, 9.28430557e-01, 3.08292788e-02],
       [3.17992203e-06, 5.66759205e-04, 2.54260871e-04, 9.99175847e-01],
       [9.85577881e-01, 1.86040718e-03, 9.64939129e-03, 2.91233207e-03],
       [2.43335762e-05, 4.90276283e-03, 9.93438423e-01, 1.63455796e-03],
       [1.54046551e-03, 2.54655648e-02, 9.40001190e-01, 3.29928324e-02],
       [9.98053312e-01, 9.48893139e-05, 1.67329935e-03, 1.78466042e-04],
       [9.98481095e-01, 5.12088300e-05, 1.36278803e-03, 1.04903884e-04],
       [9.39649403e-01, 9.75910202e-03, 4.15848494e-02, 9.00661666e-03]],
      dtype=float32)

In [13]:
import rispy
import pandas as pd
from tqdm import tqdm
from itertools import chain

# Path to your RIS file
ris_file = 'TAS_200 records.ris'

# Read the RIS file with rispy
with open(ris_file, 'r') as file:
    records = rispy.load(file)

# Convert to a DataFrame
df = pd.DataFrame(records)

# Display the DataFrame
df.head(2)


Unnamed: 0,type_of_reference,title,authors,abstract,date,year,secondary_title,doi,volume,number,alternate_title1,start_page,end_page,urls,accession_number
0,JOUR,Screening for Cognitive Impairment in Older Ad...,"[Patnode, Carrie D., Perdue, Leslie A., Rossom...",OBJECTIVE: We conducted this systematic review...,2020///,2020,,,,,,,,,
1,JOUR,Interventions to Prevent Falls in Community-Dw...,"[Guirguis-Blake, Janelle M., Michael, Yvonne L...",OBJECTIVE: We conducted this systematic review...,2018///,2018,,,,,,,,,


In [35]:
pico_compliance_results = []
for text in df['abstract'].astype('str'):
    sentences = sent_tokenize(preprocess_text(str(text)))
    
    sequences = tokenizer.texts_to_sequences(sentences)
    maxlen = 100
    X = pad_sequences(sequences, maxlen=maxlen, padding='post')
    
    pred_prob = model.predict(X)
    
    required_flags = ['P', 'I', 'C', 'O']
    
    existing_flags = []
    for prob in pred_prob:
        existing_flags.append(required_flags[np.argmax(prob)])
    existing_flags = set(existing_flags)
    
    pico_compliant = set(required_flags).issubset(existing_flags)
    
    pico_compliance_results.append({
        "PICO_Compliant": pico_compliant,
        "P": 'P' in existing_flags,
        "I": 'I' in existing_flags,
        "C": 'C' in existing_flags,
        "O": 'O' in existing_flags
    })

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63

In [37]:
df = pd.concat([df, pd.DataFrame(pico_compliance_results)], axis=1)

In [39]:
df.head(2)

Unnamed: 0,type_of_reference,title,authors,abstract,date,year,secondary_title,doi,volume,number,alternate_title1,start_page,end_page,urls,accession_number,PICO_Compliant,P,I,C,O
0,JOUR,Screening for Cognitive Impairment in Older Ad...,"[Patnode, Carrie D., Perdue, Leslie A., Rossom...",OBJECTIVE: We conducted this systematic review...,2020///,2020,,,,,,,,,,True,True,True,True,True
1,JOUR,Interventions to Prevent Falls in Community-Dw...,"[Guirguis-Blake, Janelle M., Michael, Yvonne L...",OBJECTIVE: We conducted this systematic review...,2018///,2018,,,,,,,,,,True,True,True,True,True


In [42]:
df.to_excel("TAS_200_with_PICO_flags.xlsx")