In [1]:
import nltk
import spacy
import pandas as pd
import re
import tensorflow as tf

In [2]:
import json
# Read the raw tweet export. The encoding is pinned to UTF-8 so that emoji and
# other non-ASCII characters decode the same way on every platform instead of
# depending on the locale's default encoding.
# NOTE: `df` holds the parsed JSON object here; a later cell rebinds the name
# to a pandas DataFrame.
with open('tweets.json', encoding='utf-8') as file:
    df = json.load(file)

In [3]:
# Turn on memory growth for every GPU that TensorFlow reports.
gpus = tf.config.experimental.list_physical_devices('GPU')
for device in gpus:
    tf.config.experimental.set_memory_growth(device, True)

In [4]:
# Split every record of the parsed JSON into two parallel lists:
# the tweet texts and the corresponding author names.
records = [df[key] for key in df]
tweets = [record['tweet_text'] for record in records]
authors = [record['tweet_author'] for record in records]

In [5]:
# Spot-check the first five author names collected above.
authors[:5]

['Hematopoiesis News',
 'Michael Wang, MD',
 '1stOncology',
 'Toby Eyre',
 'Lymphoma Hub']

In [6]:
# Assemble the two parallel lists into a two-column frame.
# NOTE: this rebinds `df`, which previously held the parsed JSON object.
data = dict(Author=authors, Tweet=tweets)
df = pd.DataFrame(data=data)

In [7]:
# Strip hyperlinks: remove every run of non-whitespace characters starting with "http".
url_pattern = re.compile(r'http\S+')
df["cleaned_tweets"] = [url_pattern.sub('', raw_text) for raw_text in df['Tweet']]

# Show the first tweet to check the result.
df.loc[0, "cleaned_tweets"]

'⚕️ Scientists conducted a Phase II study of acalabrutinib in patients with relapsed/refractory #CLL who were ibrutinib-intolerant, and found an overall response rate of 73%. \n '

In [8]:
# Remove hashtags: a '#' together with the non-whitespace characters that follow it.
# NOTE(review): this drops the tagged word itself, not just the '#'
# (e.g. "#CLL" disappears from the first tweet) - confirm that is intended.
hashtag_pattern = re.compile(r'#\S+')
df["cleaned_tweets"] = [hashtag_pattern.sub('', text) for text in df["cleaned_tweets"]]

# Show the first tweet to check the result.
df.loc[0, "cleaned_tweets"]

'⚕️ Scientists conducted a Phase II study of acalabrutinib in patients with relapsed/refractory  who were ibrutinib-intolerant, and found an overall response rate of 73%. \n '

In [9]:
# Delete every character that is not an ASCII letter, a digit or whitespace.
# NOTE(review): the characters are deleted rather than replaced by a space, so
# words joined by punctuation are fused (e.g. "relapsed/refractory" becomes
# "relapsedrefractory") - confirm that is intended.
non_alnum_pattern = re.compile(r'[^A-Za-z0-9\s]+')
df["cleaned_tweets"] = [non_alnum_pattern.sub('', text) for text in df["cleaned_tweets"]]

# Show the first tweet to check the result.
df.loc[0, "cleaned_tweets"]

' Scientists conducted a Phase II study of acalabrutinib in patients with relapsedrefractory  who were ibrutinibintolerant and found an overall response rate of 73 \n '

In [10]:
# Drop newline characters from every tweet.
newline_pattern = re.compile(r'\n')
df["cleaned_tweets"] = [newline_pattern.sub('', text) for text in df["cleaned_tweets"]]

# Show the first tweet to check the result.
df.loc[0, "cleaned_tweets"]

' Scientists conducted a Phase II study of acalabrutinib in patients with relapsedrefractory  who were ibrutinibintolerant and found an overall response rate of 73  '

In [11]:
# Remove every run of digits from the tweets.
digit_pattern = re.compile(r'\d+')
df["cleaned_tweets"] = [digit_pattern.sub('', text) for text in df["cleaned_tweets"]]

# Show the second tweet to check the result.
df.loc[1, "cleaned_tweets"]

'This phase  AcalabrutinibVenetoclax AV trial that is still in recruitment phase will study how well venetoclax and acalabrutinib works in MCL patients who either relapsed or nonrespondent to the initial therapy'

In [12]:
# Load the spaCy "en_core_web_sm" pipeline once; the functions below reuse
# this module-level `nlp` object.
nlp = spacy.load("en_core_web_sm")

def text_preprocessing(text):
    """Remove stop words and lemmatise ``text`` with the module-level spaCy pipeline.

    Whitespace-only tokens are skipped as well. The earlier regex steps leave
    leading and doubled spaces behind, which spaCy emits as separate tokens;
    keeping them would inject stray blanks into the joined result.

    Args:
        text: String to process.

    Returns:
        str: The lemmas of the remaining tokens joined by single spaces.
    """
    doc = nlp(text)
    lemmas = [
        token.lemma_
        for token in doc
        if not token.is_stop and not token.is_space
    ]
    return " ".join(lemmas)

In [13]:
# Run stop-word removal and lemmatisation over every cleaned tweet (overwrites the column).
df["cleaned_tweets"] = df["cleaned_tweets"].map(text_preprocessing)

In [14]:
def extracting_entities(text):
    """Return the text of every named entity spaCy finds in ``text``.

    Args:
        text: String to analyse with the module-level ``nlp`` pipeline.

    Returns:
        list[str]: Entity strings in document order; empty when none are found.
    """
    return [span.text for span in nlp(text).ents]

In [15]:
# Extract named entities from each preprocessed tweet (one list per row).
df['entities'] = df['cleaned_tweets'].map(extracting_entities)

In [16]:
# Flatten each row's entity list into one space-separated string. Values that
# are already strings are left untouched, so re-running this cell does not
# split previously joined text into individual characters.
df['entities'] = df['entities'].apply(
    lambda x: ' '.join(x) if isinstance(x, (list, tuple)) else x
)

In [17]:
# Build the model input: author name, cleaned tweet and entity string,
# separated by single spaces.
author_and_text = df['Author'] + " " + df['cleaned_tweets']
df['combined_tweets'] = author_and_text + " " + df['entities']

In [18]:
# Preview the first rows after adding the entities and combined_tweets columns.
df.head()

Unnamed: 0,Author,Tweet,cleaned_tweets,entities,combined_tweets
0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...,scientist conduct Phase II study acalabrutin...,Phase II,Hematopoiesis News scientist conduct Phase I...
1,"Michael Wang, MD",This phase 2 Acalabrutinib-Venetoclax (AV) tri...,phase AcalabrutinibVenetoclax AV trial recru...,AcalabrutinibVenetoclax AV MCL,"Michael Wang, MD phase AcalabrutinibVenetocl..."
2,1stOncology,#NICE backs #AstraZenecas #Calquence for #CLL ...,back,,1stOncology back
3,Toby Eyre,#acalabrutinib is a valuable option in pts int...,valuable option pt intolerant valuable dat...,,Toby Eyre valuable option pt intolerant va...
4,Lymphoma Hub,NICE has recommended the use of acalabrutinib ...,NICE recommend use acalabrutinib patient treat...,,Lymphoma Hub NICE recommend use acalabrutinib ...


In [19]:
from transformers import AutoTokenizer, TFAutoModel

# Load the tokenizer and the TensorFlow model from the same pretrained checkpoint.
checkpoint_name = "albert-base-v1"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = TFAutoModel.from_pretrained(checkpoint_name)

Some layers from the model checkpoint at albert-base-v1 were not used when initializing TFAlbertModel: ['predictions']
- This IS expected if you are initializing TFAlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFAlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFAlbertModel were initialized from the model checkpoint at albert-base-v1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFAlbertModel for predictions without further training.


In [20]:
def predict_sentiment(text):
    """Return a 0/1 flag per input token derived from the model's hidden states.

    The text is tokenised and passed through the module-level ``model``. Each
    token's hidden-state vector is averaged over its last axis, squashed with
    a sigmoid and thresholded at 0.5.

    NOTE(review): ``model`` is loaded with ``TFAutoModel`` and this function
    thresholds averaged hidden states rather than classifier outputs, so the
    flags presumably do not reflect a trained sentiment prediction - confirm
    whether a fine-tuned classification checkpoint was intended.

    Args:
        text: String to score.

    Returns:
        numpy.ndarray: Integer array with one row and one 0/1 entry per
        input token id.
    """
    # Truncate to the model's position limit so an over-long input cannot
    # index past the position embeddings.
    input_ids = tokenizer.encode(
        text,
        truncation=True,
        max_length=model.config.max_position_embeddings,
        return_tensors='tf',
    )
    hidden_states = model(input_ids).last_hidden_state
    token_scores = tf.reduce_mean(hidden_states, axis=-1)
    sentiment = tf.where(tf.math.sigmoid(token_scores) >= 0.5, 1, 0)
    return sentiment.numpy().astype(int)

In [21]:
# Compute the per-token 0/1 flags for every combined tweet string.
df['tokens_sentiment'] = df['combined_tweets'].map(predict_sentiment)

In [22]:
# Inspect the per-token flags produced for the first tweet.
df.loc[0, 'tokens_sentiment']

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

In [23]:
# Preview the first rows with the new tokens_sentiment column.
df.head()

Unnamed: 0,Author,Tweet,cleaned_tweets,entities,combined_tweets,tokens_sentiment
0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...,scientist conduct Phase II study acalabrutin...,Phase II,Hematopoiesis News scientist conduct Phase I...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,"Michael Wang, MD",This phase 2 Acalabrutinib-Venetoclax (AV) tri...,phase AcalabrutinibVenetoclax AV trial recru...,AcalabrutinibVenetoclax AV MCL,"Michael Wang, MD phase AcalabrutinibVenetocl...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,1stOncology,#NICE backs #AstraZenecas #Calquence for #CLL ...,back,,1stOncology back,"[[0, 0, 0, 0, 0, 0, 1]]"
3,Toby Eyre,#acalabrutinib is a valuable option in pts int...,valuable option pt intolerant valuable dat...,,Toby Eyre valuable option pt intolerant va...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,Lymphoma Hub,NICE has recommended the use of acalabrutinib ...,NICE recommend use acalabrutinib patient treat...,,Lymphoma Hub NICE recommend use acalabrutinib ...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [24]:
import numpy as np

def convert_to_single_value(arr):
    """Collapse an array of flags into a single 0/1 value.

    Args:
        arr: Array-like of numeric values.

    Returns:
        int: 1 when the mean of all elements is at least 0.5, otherwise 0.
    """
    majority_positive = np.mean(arr) >= 0.5
    return 1 if majority_positive else 0

In [25]:
# Reduce each row's per-token flags to one 0/1 value.
df['sentiment'] = df['tokens_sentiment'].map(convert_to_single_value)

In [26]:
# Preview the first rows with the new sentiment column.
df.head()

Unnamed: 0,Author,Tweet,cleaned_tweets,entities,combined_tweets,tokens_sentiment,sentiment
0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...,scientist conduct Phase II study acalabrutin...,Phase II,Hematopoiesis News scientist conduct Phase I...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0
1,"Michael Wang, MD",This phase 2 Acalabrutinib-Venetoclax (AV) tri...,phase AcalabrutinibVenetoclax AV trial recru...,AcalabrutinibVenetoclax AV MCL,"Michael Wang, MD phase AcalabrutinibVenetocl...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0
2,1stOncology,#NICE backs #AstraZenecas #Calquence for #CLL ...,back,,1stOncology back,"[[0, 0, 0, 0, 0, 0, 1]]",0
3,Toby Eyre,#acalabrutinib is a valuable option in pts int...,valuable option pt intolerant valuable dat...,,Toby Eyre valuable option pt intolerant va...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0
4,Lymphoma Hub,NICE has recommended the use of acalabrutinib ...,NICE recommend use acalabrutinib patient treat...,,Lymphoma Hub NICE recommend use acalabrutinib ...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0


In [27]:
# Translate the numeric flag into a readable label.
polarity_labels = {0: 'negative', 1: 'positive'}
df['overall_polarity'] = df['sentiment'].map(polarity_labels)

In [28]:
# Preview the first rows with the new overall_polarity column.
df.head()

Unnamed: 0,Author,Tweet,cleaned_tweets,entities,combined_tweets,tokens_sentiment,sentiment,overall_polarity
0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...,scientist conduct Phase II study acalabrutin...,Phase II,Hematopoiesis News scientist conduct Phase I...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0,negative
1,"Michael Wang, MD",This phase 2 Acalabrutinib-Venetoclax (AV) tri...,phase AcalabrutinibVenetoclax AV trial recru...,AcalabrutinibVenetoclax AV MCL,"Michael Wang, MD phase AcalabrutinibVenetocl...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0,negative
2,1stOncology,#NICE backs #AstraZenecas #Calquence for #CLL ...,back,,1stOncology back,"[[0, 0, 0, 0, 0, 0, 1]]",0,negative
3,Toby Eyre,#acalabrutinib is a valuable option in pts int...,valuable option pt intolerant valuable dat...,,Toby Eyre valuable option pt intolerant va...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0,negative
4,Lymphoma Hub,NICE has recommended the use of acalabrutinib ...,NICE recommend use acalabrutinib patient treat...,,Lymphoma Hub NICE recommend use acalabrutinib ...,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0,negative


In [29]:
# Keep only the columns that go into the export, as an independent copy.
export_columns = ['entities', 'Author', 'overall_polarity']
df_final = df[export_columns].copy()

In [30]:
# Preview the first rows of the export frame.
df_final.head()

Unnamed: 0,entities,Author,overall_polarity
0,Phase II,Hematopoiesis News,negative
1,AcalabrutinibVenetoclax AV MCL,"Michael Wang, MD",negative
2,,1stOncology,negative
3,,Toby Eyre,negative
4,,Lymphoma Hub,negative


In [31]:
# Write the export frame to CSV, leaving out the row index.
output_path = 'objective2.csv'
df_final.to_csv(output_path, index=False)