In [1]:
import PyPDF2
import pandas as pd

# Open the pdf file
pdf1 = PyPDF2.PdfReader('Resources/pubmed.pdf')
pdf2 = PyPDF2.PdfReader('Resources/pubmed2.pdf')
pdfs = [pdf1, pdf2]
# Create a dataframe to store the data from each page
texts=[]
names=[]
count=1
for pdf in pdfs:
    name = f'pdf{count}'
    count+=1
    # Get the number of pages in the pdf file
    num_pages = len(pdf.pages)

    # Iterate over the pages in the pdf file
    for i in range(num_pages):

        # Get the text from the current page
        page = pdf.pages[i]
        text = page.extract_text()

        # Split the text into sections
        sections = text.split('\n\n')
        texts.append(sections[0])
        names.append(name)

# Add the section to the dataframe
pubmed_df = pd.DataFrame({'doc': names, 'page_text': texts})
pubmed_df

Unnamed: 0,doc,page_text
0,pdf1,\n \nSince January 2020 Elsevier has created ...
1,pdf1,RESEARCH\nCharacteristics of online pharmacies...
2,pdf1,illegitimate and in violation of U.S. pharmacy...
3,pdf1,"illegal, unsafe, or misleading activities like..."
4,pdf1,The safety characteristics of all online pharm...
5,pdf1,phone number listed and an offer to speak with...
6,pdf1,without a prescription.22Our results demonstra...
7,pdf1,"marketplace. In the meantime, organizations sh..."
8,pdf2,The new england journal of medicinen engl j me...
9,pdf2,"n engl j med 380;12 nejm.org March 21, 2019 11..."


## Attempt at T5

In [2]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# Set up the summarization pipeline
tokenizer = AutoTokenizer.from_pretrained("t5-base",max_input_length = 1024)
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

# Define a function to generate summaries for each section
def generate_summary(text):
    # Generate a summary of the text using the summarization pipelineb
     
    summary = summarizer(text, max_length=80, min_length=25, do_sample=False)[0]['summary_text']
    return summary

2023-11-07 14:44:13.767392: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [3]:
# Add a new column to the dataframe for the summaries
pubmed_df['Summary'] = pubmed_df['page_text'].apply(generate_summary)

  "You have modified the pretrained model configuration to control generation. This is a"
Token indices sequence length is longer than the specified maximum sequence length for this model (1104 > 512). Running this sequence through the model will result in indexing errors


In [4]:
pubmed_df['Summary'][1]

'of the 62 online pharmacies found to sell Adderall, 61 were rogue or unclassi fied . adderall carries a high potential for abuse, which could lead to severe psychological or physical dependence .'

## Sentiment analysys with twitter-roberta-base-sentiment

In [6]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
 
 
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# download label mapping
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)

text = "Good night 😊"
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)

# # TF
# model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
# model.save_pretrained(MODEL)

# text = "Good night 😊"
# encoded_input = tokenizer(text, return_tensors='tf')
# output = model(encoded_input)
# scores = output[0][0].numpy()
# scores = softmax(scores)

ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = labels[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")


OSError: Can't load tokenizer for 'cardiffnlp/twitter-roberta-base-sentiment'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'cardiffnlp/twitter-roberta-base-sentiment' is the correct path to a directory containing all relevant files for a RobertaTokenizerFast tokenizer.

## Sentiment analysis of Adderall reviews with twitter-xlm-roberta-base

In [8]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import AutoModelForSequenceClassification
from torch.utils.data import DataLoader

%matplotlib inline

In [11]:
adderall_df = pd.read_csv('Resources/adderall.csv')
adderall_df.head()

Unnamed: 0,Age,Condition,Date,Drug,Sex,Reviews
0,,Attention Deficit Disorder with Hyperactivity,2023-04-29,adderall xr,,No Script or health Insurance needed to place ...
1,25-34,Attention Deficit Disorder with Hyperactivity,2023-01-07,adderall xr,Female,I realize my previous comment was more about m...
2,25-34,Other,2023-01-30,adderall xr,Female,I realize my previous comment was more about m...
3,25-34,Attention Deficit Disorder with Hyperactivity,2023-03-08,adderall xr,Female,"My entire life I felt like I was different, ne..."
4,25-34,Attention Deficit Disorder with Hyperactivity,2023-06-09,adderall xr,Female,Been taking it since I was 10. So over 20 year...


In [10]:
from transformers import pipeline
model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
sentiment_task = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)

Downloading (…)lve/main/config.json:   0%|          | 0.00/841 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [12]:
def sent_pred(text):
    sent = sentiment_task(text)
    if str(sent[0]['label']) == 'Positive':
        result = int(1)
        return result
    elif str(sent[0]['label']) == 'Negative':
        result = int(0)
        return result
    else:
        if sent[0]['score'] > 0.5:
            return 1
        else:
            return 0
  

In [14]:
adderall_df['sentiment'] = adderall_df.Reviews.map(lambda x: sent_pred(x))
df_testing

RuntimeError: The expanded size of the tensor (539) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 539].  Tensor sizes: [1, 514]

Unnamed: 0,Age,Condition,Date,Drug,Sex,Reviews,sentiment
0,,Attention Deficit Disorder with Hyperactivity,2023-04-29,adderall xr,,No Script or health Insurance needed to place ...,1
1,25-34,Attention Deficit Disorder with Hyperactivity,2023-01-07,adderall xr,Female,I realize my previous comment was more about m...,1
2,25-34,Other,2023-01-30,adderall xr,Female,I realize my previous comment was more about m...,1
3,25-34,Attention Deficit Disorder with Hyperactivity,2023-03-08,adderall xr,Female,"My entire life I felt like I was different, ne...",1
4,25-34,Attention Deficit Disorder with Hyperactivity,2023-06-09,adderall xr,Female,Been taking it since I was 10. So over 20 year...,1


[' ' '25-34' '35-44' '65-74' '55-64' '13-18' '19-24' '45-54' '7-12' '3-6'
 '75 or over']


Unnamed: 0,Age,Condition,Date,Drug,Sex,Reviews,sentiment
43,7-12,Attention Deficit Disorder with Hyperactivity,2023-03-15,adderall xr,Male,I have a son on this medication and need to kn...,1
45,7-12,Attention Deficit Disorder with Hyperactivity,2023-05-11,adderall xr,Female,My daughter is 10 yrs old. She has been on it ...,1
46,7-12,Attention Deficit Disorder with Hyperactivity,2023-02-27,adderall xr,Male,this has helped in being able to deal with his...,1
57,3-6,Attention Deficit Disorder with Hyperactivity,2023-04-19,adderall xr,Male,well my son has been on all kinds of ADHD meds...,1
63,7-12,Attention Deficit Disorder with Hyperactivity,2023-02-20,adderall xr,Female,My 9yr old son has been on Adderall for almost...,1


In [33]:
def map_age_to_category(age):
    if age in ['3-6', '7-12']:
        return 'child'
    elif age in '13-18':
        return 'adolescent'
    elif age in ['19-24', '25-34', '35-44', '45-54', '55-64']:
        return 'adult'
    elif age in ['65-74', '75 or over']:
        return 'senior'
    else:
        return 'unknown'

In [34]:
adderall_df['Age_Range'] = adderall_df['Age'].apply(map_age_to_category)
adderall_df

Unnamed: 0,Age,Condition,Date,Drug,Sex,Reviews,sentiment,Age_Range
0,,Attention Deficit Disorder with Hyperactivity,2023-04-29,adderall xr,,No Script or health Insurance needed to place ...,1,unknown
1,25-34,Attention Deficit Disorder with Hyperactivity,2023-01-07,adderall xr,Female,I realize my previous comment was more about m...,1,adult
2,25-34,Other,2023-01-30,adderall xr,Female,I realize my previous comment was more about m...,1,adult
3,25-34,Attention Deficit Disorder with Hyperactivity,2023-03-08,adderall xr,Female,"My entire life I felt like I was different, ne...",1,adult
4,25-34,Attention Deficit Disorder with Hyperactivity,2023-06-09,adderall xr,Female,Been taking it since I was 10. So over 20 year...,1,adult
...,...,...,...,...,...,...,...,...
505,25-34,Attention Deficit Disorder with Hyperactivity,2023-04-11,adderall xr,Female,The only problem I had: it would quit working ...,1,adult
506,19-24,Attention Deficit Disorder with Hyperactivity,2023-01-20,adderall xr,Male,"I have a urine test in a few days, and I wante...",1,adult
507,45-54,Attention Deficit Disorder with Hyperactivity,2023-04-29,adderall xr,Female,"I have ADD, not ADHD. I noticed a difference ...",1,adult
508,35-44,Attention Deficit Disorder with Hyperactivity,2023-05-03,adderall xr,Female,"I can sleep at night, but I do't need naps dur...",1,adult
