In [None]:
import os
import pandas as pd
import numpy as np

# Show up to 300 columns when displaying wide DataFrames.
# The fully qualified option name is used on purpose: the bare "max_columns"
# pattern is ambiguous in pandas versions that also define
# "styler.render.max_columns" and then raises an OptionError.
pd.set_option("display.max_columns", 300)

# Change the working directory so that the relative "data/..." and "model/..."
# paths used below resolve.
# NOTE(review): this removes the substrings 'notebooks' and 'medi_crawler'
# wherever they occur in the current path (presumably the notebook lives in
# <project>/notebooks/medi_crawler) - confirm this holds for your checkout.
os.chdir(os.getcwd().replace('notebooks','').replace('medi_crawler',''))

In [None]:
import config as CONFIG

In [None]:
from transformers import pipeline
# Smoke test of the transformers installation: build a 'sentiment-analysis'
# pipeline and run it on a single example sentence (the result is displayed
# as the cell output).
classifier = pipeline('sentiment-analysis')
classifier('We are very happy to show you the 🤗 Transformers library.')

#### Data set

In [None]:
import pickle

import pandas as pd

# Load the crawled article metadata (a mapping whose values are per-article
# dicts with at least the keys 'pmid', 'abstract' and 'title').
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
# The context manager guarantees the handle is closed even if loading fails.
with open("data/medi_crawler/raw/meta_arts.pickle", 'rb') as file:
    object_file = pickle.load(file)

# One column per field of interest; the abstract is stored under 'text'.
data = {
    'pmid': [entry['pmid'] for entry in object_file.values()],
    'text': [entry['abstract'] for entry in object_file.values()],
    'title': [entry['title'] for entry in object_file.values()],
}

# Article table indexed by pmid; later cells join labels against this index.
df = pd.DataFrame(data).set_index('pmid')
df.head()

In [None]:
# Positive class: every column header of df_pearls.csv becomes one index entry
# labelled True (presumably the headers are PMIDs - they are joined against
# the pmid index of `df` to attach text and title).

pearls_df = pd.read_csv(
    "data/medi_crawler/raw/df_pearls.csv",
    sep=';',
    index_col=0,
)
label_true = pd.DataFrame(
    data=[True] * len(pearls_df.columns),
    columns=["label"],
    index=pearls_df.columns,
).join(df)


In [None]:
# Negative class: every pmid in the cleaned data whose category is anything
# other than "pot_pearl", labelled False and enriched with text/title from `df`.
label_data = pd.read_csv("data/medi_crawler/processed/data_cleaned.csv")

label_false = (
    label_data.loc[label_data["category"] != "pot_pearl", ["pmid"]]
    .set_index("pmid")
    .assign(label=False)
    .join(df)
)

In [None]:
# Download title and abstract for the negatively labelled articles and attach
# them to label_false.

from src.medi_crawler.download import get_abstract, get_title

# Both helpers are called with the pmids of the negative examples and return
# frames that carry a 'pmid' column, which becomes the join key.
abstract = get_abstract(list(label_false.index)).set_index('pmid')
title = get_title(list(label_false.index)).set_index('pmid')

# Join exactly once: a second identical join always fails with "columns overlap
# but no suffix specified" because the columns are already present.
# The 'text'/'title' columns inherited from the earlier join with `df` (or from
# a previous run of this cell) are dropped first so the freshly downloaded
# values replace them and the cell can be re-run.
# NOTE(review): assumes get_title yields a 'title' column and get_abstract an
# 'abstract' column - confirm against src.medi_crawler.download.
label_false = (
    label_false
    .drop(columns=['text', 'title'], errors='ignore')
    .join(title)
    .join(abstract)
    .rename(columns={'abstract': 'text'})
)

In [None]:
# Stack the positive and negative examples row-wise into one labelled table
# and persist it for the modelling section.
model_data = pd.concat(objs=[label_true, label_false], axis=0)
model_data.to_csv(path_or_buf='data/medi_crawler/processed/model_data.csv')

In [None]:
# Everything in `df` that did not end up in the labelled set is what we want
# to score later. Index.isin performs the membership test in one vectorized
# call instead of a Python-level loop over every index entry.
predict_data = df[~df.index.isin(model_data.index)]
predict_data.to_csv('data/medi_crawler/processed/predict_data.csv')

In [None]:
# Strip the text/title columns from label_false again (model_data has already
# been built and written above, so the saved file is unaffected).
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the columns are already gone.
label_false = label_false.drop(columns=['text', 'title'], errors='ignore')

# Model

In [None]:
import pandas as pd
import tensorflow as tf
import transformers
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
# Do not truncate long cell contents (e.g. abstracts) when displaying frames.
pd.set_option('display.max_colwidth', None)
# Checkpoint name handed to `from_pretrained` when loading tokenizer and model.
# NOTE: MODEL_NAME, N_EPOCHS and BATCH_SIZE are assigned again in the PyTorch
# training cell further down; cells executed after it see those later values.
MODEL_NAME = 'distilbert-base-uncased-finetuned-sst-2-english'
# Mini-batch size used when batching the tf.data training set.
BATCH_SIZE = 8
# Number of passes over the training data.
N_EPOCHS = 30
# Intended maximum number of tokens per text (presumably the truncation limit
# for the tokenizer - confirm against the tokenizer calls).
MAX_LEN = 300

In [None]:
# Reload the tables written in the data-set section, keeping only the columns
# the model needs: text for scoring, text + label for training.
predict_data = pd.read_csv(
    'data/medi_crawler/processed/predict_data.csv', index_col=0
).loc[:, ['text']]
model_data = pd.read_csv(
    'data/medi_crawler/processed/model_data.csv', index_col=0
).loc[:, ['text', 'label']]

In [None]:
from sklearn.model_selection import train_test_split

# Rows without an abstract cannot be tokenized (a missing value is not a
# string), so they are excluded before splitting - the prediction path below
# applies the same filter to predict_data.
has_text = model_data['text'].notna()

# Features: everything except the label (i.e. the single 'text' column).
X = model_data.loc[has_text].drop('label', axis=1)
# Target: the boolean pearl / non-pearl label.
y = model_data.loc[has_text, 'label']

# Hold out 20% of the labelled rows for evaluation; the fixed random_state
# keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Convert X_train and X_test to plain lists of strings for the tokenizer.
# The 'text' column is selected explicitly instead of calling squeeze():
# squeeze() collapses a single-row frame to a scalar, which has no `.values`.
# The isinstance guards make the cell safe to re-run, since the names are
# rebound from DataFrame to list on the first execution.
if isinstance(X_train, pd.DataFrame):
    X_train = X_train['text'].tolist()
if isinstance(X_test, pd.DataFrame):
    X_test = X_test['text'].tolist()

In [None]:
# Load the tokenizer that matches the checkpoint named by MODEL_NAME.
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

# Tokenize both splits. Texts are truncated to MAX_LEN tokens so training uses
# the same length limit as the prediction cell (max_length=300) instead of the
# tokenizer's default maximum; padding=True pads each split to a common length
# so the encodings can be turned into tensors.
train_encodings = tokenizer(list(X_train),
                            max_length=MAX_LEN,
                            truncation=True,
                            padding=True)
test_encodings = tokenizer(list(X_test),
                           max_length=MAX_LEN,
                           truncation=True,
                           padding=True)

In [None]:
# Wrap the tokenizer output and labels as (features, label) tf.data datasets;
# each element is one example (the datasets are not batched here).
train_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(train_encodings), list(y_train.to_numpy()))
)
test_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(test_encodings), list(y_test.to_numpy()))
)

In [None]:

import torch
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

# PyTorch variant of the fine-tuning loop.
# NOTE: this cell rebinds MODEL_NAME, N_EPOCHS, BATCH_SIZE, `model` and
# `tokenizer`; cells executed afterwards see the values assigned here.

# Instantiate the model and tokenizer
MODEL_NAME = "distilbert-base-uncased"
model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME)
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

# Choose the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Define the loss function
loss_fn = torch.nn.CrossEntropyLoss()

# Define the number of epochs and batch size
N_EPOCHS = 10
BATCH_SIZE = 32

# Build the loader the training loop iterates over (it was referenced but never
# created). Each record pairs a raw text with its label as an int, so the
# default collate function yields batch["text"] as a list of strings and
# batch["label"] as an integer tensor, which is what CrossEntropyLoss expects
# as class indices. Expects X_train to be the list of strings prepared above.
train_dataloader = DataLoader(
    [{"text": text, "label": int(label)} for text, label in zip(X_train, y_train)],
    batch_size=BATCH_SIZE,
    shuffle=True,
)

# Set the model to training mode
model.train()

# Train the model
for epoch in range(N_EPOCHS):
    epoch_loss = 0.0
    epoch_correct = 0
    total_samples = 0

    for batch in train_dataloader:
        # Tokenize the raw texts of this batch; padding is per batch.
        inputs = tokenizer(batch["text"], padding=True, truncation=True, return_tensors="pt")
        labels = batch["label"]

        # Forward pass
        outputs = model(**inputs)
        logits = outputs.logits

        # Compute the loss
        loss = loss_fn(logits, labels)

        # Backward pass and optimization step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate sample-weighted loss and the number of correct predictions
        # so the epoch averages are exact even if the last batch is smaller.
        epoch_loss += loss.item() * len(labels)
        _, predicted_labels = torch.max(logits, 1)
        epoch_correct += (predicted_labels == labels).sum().item()
        total_samples += len(labels)

    epoch_accuracy = epoch_correct / total_samples
    epoch_loss /= total_samples

    print(f"Epoch {epoch+1} - Loss: {epoch_loss:.4f} - Accuracy: {epoch_accuracy:.4f}")

# Set the model back to evaluation mode
model.eval()


In [None]:
# Load the TensorFlow sequence-classification model for MODEL_NAME.
model = TFDistilBertForSequenceClassification.from_pretrained(MODEL_NAME)
# choose the optimizer
optimizerr = tf.keras.optimizers.Adam(learning_rate=5e-5)
# define the loss function; from_logits=True because the model's outputs are
# logits (softmax is only applied at prediction time below)
losss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# build the model
model.compile(optimizer=optimizerr,
              loss=losss,
              metrics=['accuracy'])
# train the model
# The dataset is shuffled over its full length and batched here; `batch_size`
# must NOT also be passed to fit(), since Keras rejects it for tf.data input.
model.fit(train_dataset.shuffle(len(X_train)).batch(BATCH_SIZE),
          epochs=N_EPOCHS)

In [None]:
file_path = "model/medi_crawler/abstract_prediction.pickle"

# Serialize the fitted `model` object to disk with pickle (the object written
# here is the model itself, not a classification report).
# NOTE(review): the target directory must already exist; the library's own
# save method (e.g. save_pretrained) may be a more robust format - confirm.
with open(file_path, "wb") as file:
    pickle.dump(model, file)

In [None]:


# Predict on the held-out split.
# test_dataset holds individual examples, so it is batched before being passed
# to predict(), the same way the training data is batched for fit().
preds = model.predict(test_dataset.batch(BATCH_SIZE)).logits

# Turn the logits into per-class probabilities (each row sums to 1).
preds = tf.nn.softmax(preds, axis=1).numpy()

In [None]:
from sklearn.metrics import classification_report

# Round the probability in column 1 of `preds` to 0 or 1 to obtain hard
# predictions for the held-out examples.
y_preds = preds[:, 1].round(0)

# Per-class precision / recall / F1 of the predictions against the true labels.
report = classification_report(y_test, y_preds)
print(report)


In [None]:
# Texts of the unlabelled articles that actually have an abstract.
predict_data.loc[predict_data['text'].notna(), 'text']

In [None]:
# Abstracts to score: all unlabelled rows that have a text.
# No squeeze() here - on a single remaining row it would collapse the data to a
# scalar and the tokenizer would no longer receive a list of texts.
X_new = predict_data.loc[predict_data['text'].notna(), 'text'].values

# Tokenize with the notebook-wide length limit (MAX_LEN = 300) rather than a
# repeated literal.
encodings = tokenizer(X_new.tolist(),
                      max_length=MAX_LEN,
                      truncation=True,
                      padding=True)


In [None]:

# Transform to tf.data.Dataset (one element per text)
dataset = tf.data.Dataset.from_tensor_slices((dict(encodings)))

# Predict
# The dataset is batched before being handed to predict(), matching how the
# training data is fed to fit() and how predict_proba() batches its input.
preds = model.predict(dataset.batch(BATCH_SIZE)).logits

# Logits -> per-class probabilities.
preds = tf.nn.softmax(preds, axis=1).numpy()

In [None]:
# Reload the prediction table with all of its columns to recover the titles.
predict_data = pd.read_csv(
    'data/medi_crawler/processed/predict_data.csv',
    index_col=0,
)

# Titles of exactly those rows that have a text, i.e. the rows that were scored.
predict_data_title = predict_data.loc[predict_data['text'].notna(), 'title']

In [None]:
# Display the array of texts (X_new) that was tokenized for prediction.
X_new

In [None]:
# Pair every scored title with the probability from column 1 of `preds`,
# rank by that probability (highest first) and export the ranking.
pred_column = pd.DataFrame(
    preds[:, 1],
    index=predict_data_title.index,
    columns=['pred'],
)
ranked_titles = pd.concat([predict_data_title, pred_column], axis=1)
ranked_titles = ranked_titles.sort_values('pred', ascending=False)
ranked_titles.to_csv("data/medi_crawler/final/predict_label_text.csv")


In [None]:
# Show titles next to the probability from column 1 of `preds`, highest first.
pred_column = pd.DataFrame(
    preds[:, 1],
    index=predict_data_title.index,
    columns=['pred'],
)
pd.concat([predict_data_title, pred_column], axis=1).sort_values(by='pred', ascending=False)

In [None]:
def predict_proba(text_list, model, tokenizer, max_length=None, batch_size=1):
    """Return class probabilities for one or more texts.

    Args:
        text_list: A list of strings, or a single string (treated as a
            one-element list).
        model: Object whose ``predict(dataset)`` result exposes ``.logits``
            (e.g. the TensorFlow classifier trained in this notebook).
        tokenizer: Callable tokenizer accepting ``max_length``, ``truncation``
            and ``padding`` and returning a mapping of equally long sequences.
        max_length: Maximum number of tokens per text. ``None`` (default)
            leaves the limit to the tokenizer, which with ``truncation=True``
            falls back to its configured model maximum. The previous
            hard-coded 15000 allowed sequences longer than the 512 positions
            DistilBERT supports.
        batch_size: Number of texts per prediction batch (default 1, as before).

    Returns:
        A numpy array with one row per text holding the softmax of the logits
        along axis 1.
    """
    # A bare string would be tokenized as one unbatched example and then be
    # sliced token by token below; wrap it so it is scored as a single text.
    if isinstance(text_list, str):
        text_list = [text_list]

    # tokenize the text
    encodings = tokenizer(list(text_list),
                          max_length=max_length,
                          truncation=True,
                          padding=True)
    # transform to tf.Dataset (one element per text)
    dataset = tf.data.Dataset.from_tensor_slices((dict(encodings)))
    # predict on batched input
    preds = model.predict(dataset.batch(batch_size)).logits

    # transform logits to an array with probabilities
    res = tf.nn.softmax(preds, axis=1).numpy()

    return res

# Score the texts in strings_list[3] and average the probabilities in column 1.
# NOTE(review): `strings_list` is not defined in any cell visible here -
# confirm where it comes from, otherwise this fails on a fresh kernel.
predict_proba(strings_list[3], model, tokenizer)[:,1].mean()