<a href="https://colab.research.google.com/github/rajeev-dw9/Scrapper_DSIP/blob/main/Scrapper_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# Web Scrapping Assignment
# Name: Rajeev Ranjan Dwivedi
# Date: 26/10/2022
# Description: This program scrapes the data from the website (https://ascopubs.org/loi/jco) and stores it in a csv file.



In [10]:
# reset all variable, since it creates problem while running the code again and again
%reset -f 

In [11]:
# import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd 
import csv


import time
import re
import os
import sys
import json
import datetime

In [12]:
# Get the data from the website
response = requests.get("https://ascopubs.org/loi/jco") # get the data from the website
soup = BeautifulSoup(response.text, "html.parser") # parse the data using beautiful soup

list_of_links = [] # list of links
all_links = soup.find_all("a") # find all links
for link in all_links:
    list_of_links.append(link.get("href")) # append the link to the list
    
# print(list_of_links) 

In [13]:
# Store all the links in a dataframe
df = pd.DataFrame(list_of_links, columns = ["Links"])
df.head(100)

Unnamed: 0,Links
0,#main
1,/journal/jco
2,https://signin.asco.org/oauth2/aus1goap5nQ33vo...
3,http://www.editorialmanager.com/jco-ascopubs/d...
4,https://apps.asco.org/EmailPreferences/Home/Up...
...,...
95,/toc/jco/40/7
96,/toc/jco/40/6
97,/toc/jco/40/5
98,/toc/jco/40/4


In [14]:
# remove columns with null values
df = df.dropna()
df.head(100)

# remove the links that are not related to the journal articles
df = df[df["Links"].str.contains("/toc/jco")]
df.shape

(967, 1)

In [15]:
from tqdm import tqdm

url = "https://ascopubs.org/toc/jco/"

data = []

for i in tqdm(range(1, 41)):
    out_url = url + str(i)
    index = 1
    
    year_data = []
    
    while True:
        titles = []
        authors = []
        articles = []
        
        target_url = out_url + "/" + str(index)
        
        response = requests.get(target_url) # get the data from the website
        if response.status_code == 404:
            break
        soup = BeautifulSoup(response.text, "html.parser")
        print(index, target_url, end=" ")
        
        tables = soup.find_all("table", {"class": "articleEntry"})
        for table in tables:
            # getting title
            title = table.find_all("span", {"class":"hlFld-Title"})
            if(len(title) > 0):
                titles.append(title[0].text)
            else:
                titles.append("")
                
            # getting author
            author = table.find_all("span", {"class":"entryAuthor"})
            if (len(author) > 0):
                authors.append(author[0].text)
            else:
                authors.append("")

            # getting pdf url
            pdf_url = table.find_all("a", {"class":"pdf"})[0].get('href')
            if not pdf_url.startswith('https://ascopubs.org/'):
                pdf_url = 'https://ascopubs.org' + pdf_url
            articles.append(pdf_url)
        year_data.append([titles, authors, articles])
        index += 1
        
    data.append(year_data)

  0%|          | 0/40 [00:00<?, ?it/s]

1 https://ascopubs.org/toc/jco/1/1 2 https://ascopubs.org/toc/jco/1/2 3 https://ascopubs.org/toc/jco/1/3 4 https://ascopubs.org/toc/jco/1/4 5 https://ascopubs.org/toc/jco/1/5 6 https://ascopubs.org/toc/jco/1/6 7 https://ascopubs.org/toc/jco/1/7 8 https://ascopubs.org/toc/jco/1/8 9 https://ascopubs.org/toc/jco/1/9 10 https://ascopubs.org/toc/jco/1/10 11 https://ascopubs.org/toc/jco/1/11 12 https://ascopubs.org/toc/jco/1/12 

  2%|▎         | 1/40 [00:13<09:02, 13.91s/it]

1 https://ascopubs.org/toc/jco/2/1 2 https://ascopubs.org/toc/jco/2/2 3 https://ascopubs.org/toc/jco/2/3 4 https://ascopubs.org/toc/jco/2/4 5 https://ascopubs.org/toc/jco/2/5 6 https://ascopubs.org/toc/jco/2/6 7 https://ascopubs.org/toc/jco/2/7 8 https://ascopubs.org/toc/jco/2/8 9 https://ascopubs.org/toc/jco/2/9 10 https://ascopubs.org/toc/jco/2/10 11 https://ascopubs.org/toc/jco/2/11 12 https://ascopubs.org/toc/jco/2/12 

  5%|▌         | 2/40 [00:32<10:35, 16.73s/it]

1 https://ascopubs.org/toc/jco/3/1 2 https://ascopubs.org/toc/jco/3/2 3 https://ascopubs.org/toc/jco/3/3 4 https://ascopubs.org/toc/jco/3/4 5 https://ascopubs.org/toc/jco/3/5 6 https://ascopubs.org/toc/jco/3/6 7 https://ascopubs.org/toc/jco/3/7 8 https://ascopubs.org/toc/jco/3/8 9 https://ascopubs.org/toc/jco/3/9 10 https://ascopubs.org/toc/jco/3/10 11 https://ascopubs.org/toc/jco/3/11 12 https://ascopubs.org/toc/jco/3/12 

  8%|▊         | 3/40 [00:50<10:37, 17.24s/it]

1 https://ascopubs.org/toc/jco/4/1 2 https://ascopubs.org/toc/jco/4/2 3 https://ascopubs.org/toc/jco/4/3 4 https://ascopubs.org/toc/jco/4/4 5 https://ascopubs.org/toc/jco/4/5 6 https://ascopubs.org/toc/jco/4/6 7 https://ascopubs.org/toc/jco/4/7 8 https://ascopubs.org/toc/jco/4/8 9 https://ascopubs.org/toc/jco/4/9 10 https://ascopubs.org/toc/jco/4/10 11 https://ascopubs.org/toc/jco/4/11 12 https://ascopubs.org/toc/jco/4/12 

 10%|█         | 4/40 [01:11<11:12, 18.67s/it]

1 https://ascopubs.org/toc/jco/5/1 2 https://ascopubs.org/toc/jco/5/2 3 https://ascopubs.org/toc/jco/5/3 4 https://ascopubs.org/toc/jco/5/4 5 https://ascopubs.org/toc/jco/5/5 6 https://ascopubs.org/toc/jco/5/6 7 https://ascopubs.org/toc/jco/5/7 8 https://ascopubs.org/toc/jco/5/8 9 https://ascopubs.org/toc/jco/5/9 10 https://ascopubs.org/toc/jco/5/10 11 https://ascopubs.org/toc/jco/5/11 12 https://ascopubs.org/toc/jco/5/12 

 12%|█▎        | 5/40 [01:32<11:30, 19.73s/it]

1 https://ascopubs.org/toc/jco/6/1 2 https://ascopubs.org/toc/jco/6/2 3 https://ascopubs.org/toc/jco/6/3 4 https://ascopubs.org/toc/jco/6/4 5 https://ascopubs.org/toc/jco/6/5 6 https://ascopubs.org/toc/jco/6/6 7 https://ascopubs.org/toc/jco/6/7 8 https://ascopubs.org/toc/jco/6/8 9 https://ascopubs.org/toc/jco/6/9 10 https://ascopubs.org/toc/jco/6/10 11 https://ascopubs.org/toc/jco/6/11 12 https://ascopubs.org/toc/jco/6/12 

 15%|█▌        | 6/40 [01:51<10:59, 19.39s/it]

1 https://ascopubs.org/toc/jco/7/1 2 https://ascopubs.org/toc/jco/7/2 3 https://ascopubs.org/toc/jco/7/3 4 https://ascopubs.org/toc/jco/7/4 5 https://ascopubs.org/toc/jco/7/5 6 https://ascopubs.org/toc/jco/7/6 7 https://ascopubs.org/toc/jco/7/7 8 https://ascopubs.org/toc/jco/7/8 9 https://ascopubs.org/toc/jco/7/9 10 https://ascopubs.org/toc/jco/7/10 11 https://ascopubs.org/toc/jco/7/11 12 https://ascopubs.org/toc/jco/7/12 

 18%|█▊        | 7/40 [02:11<10:47, 19.62s/it]

1 https://ascopubs.org/toc/jco/8/1 2 https://ascopubs.org/toc/jco/8/2 3 https://ascopubs.org/toc/jco/8/3 4 https://ascopubs.org/toc/jco/8/4 5 https://ascopubs.org/toc/jco/8/5 6 https://ascopubs.org/toc/jco/8/6 7 https://ascopubs.org/toc/jco/8/7 8 https://ascopubs.org/toc/jco/8/8 9 https://ascopubs.org/toc/jco/8/9 10 https://ascopubs.org/toc/jco/8/10 11 https://ascopubs.org/toc/jco/8/11 12 https://ascopubs.org/toc/jco/8/12 

 20%|██        | 8/40 [02:35<11:08, 20.89s/it]

1 https://ascopubs.org/toc/jco/9/1 2 https://ascopubs.org/toc/jco/9/2 3 https://ascopubs.org/toc/jco/9/3 4 https://ascopubs.org/toc/jco/9/4 5 https://ascopubs.org/toc/jco/9/5 6 https://ascopubs.org/toc/jco/9/6 7 https://ascopubs.org/toc/jco/9/7 8 https://ascopubs.org/toc/jco/9/8 9 https://ascopubs.org/toc/jco/9/9 10 https://ascopubs.org/toc/jco/9/10 11 https://ascopubs.org/toc/jco/9/11 12 https://ascopubs.org/toc/jco/9/12 

 22%|██▎       | 9/40 [02:58<11:07, 21.53s/it]

1 https://ascopubs.org/toc/jco/10/1 2 https://ascopubs.org/toc/jco/10/2 3 https://ascopubs.org/toc/jco/10/3 4 https://ascopubs.org/toc/jco/10/4 5 https://ascopubs.org/toc/jco/10/5 6 https://ascopubs.org/toc/jco/10/6 7 https://ascopubs.org/toc/jco/10/7 8 https://ascopubs.org/toc/jco/10/8 9 https://ascopubs.org/toc/jco/10/9 10 https://ascopubs.org/toc/jco/10/10 11 https://ascopubs.org/toc/jco/10/11 12 https://ascopubs.org/toc/jco/10/12 

 25%|██▌       | 10/40 [03:20<10:47, 21.59s/it]

1 https://ascopubs.org/toc/jco/11/1 2 https://ascopubs.org/toc/jco/11/2 3 https://ascopubs.org/toc/jco/11/3 4 https://ascopubs.org/toc/jco/11/4 5 https://ascopubs.org/toc/jco/11/5 6 https://ascopubs.org/toc/jco/11/6 7 https://ascopubs.org/toc/jco/11/7 8 https://ascopubs.org/toc/jco/11/8 9 https://ascopubs.org/toc/jco/11/9 10 https://ascopubs.org/toc/jco/11/10 11 https://ascopubs.org/toc/jco/11/11 12 https://ascopubs.org/toc/jco/11/12 

 28%|██▊       | 11/40 [03:46<11:08, 23.05s/it]

1 https://ascopubs.org/toc/jco/12/1 2 https://ascopubs.org/toc/jco/12/2 3 https://ascopubs.org/toc/jco/12/3 4 https://ascopubs.org/toc/jco/12/4 5 https://ascopubs.org/toc/jco/12/5 6 https://ascopubs.org/toc/jco/12/6 7 https://ascopubs.org/toc/jco/12/7 8 https://ascopubs.org/toc/jco/12/8 9 https://ascopubs.org/toc/jco/12/9 10 https://ascopubs.org/toc/jco/12/10 11 https://ascopubs.org/toc/jco/12/11 12 https://ascopubs.org/toc/jco/12/12 

 30%|███       | 12/40 [04:12<11:09, 23.92s/it]

1 https://ascopubs.org/toc/jco/13/1 2 https://ascopubs.org/toc/jco/13/2 3 https://ascopubs.org/toc/jco/13/3 4 https://ascopubs.org/toc/jco/13/4 5 https://ascopubs.org/toc/jco/13/5 6 https://ascopubs.org/toc/jco/13/6 7 https://ascopubs.org/toc/jco/13/7 8 https://ascopubs.org/toc/jco/13/8 9 https://ascopubs.org/toc/jco/13/9 10 https://ascopubs.org/toc/jco/13/10 11 https://ascopubs.org/toc/jco/13/11 12 https://ascopubs.org/toc/jco/13/12 

 32%|███▎      | 13/40 [04:40<11:20, 25.19s/it]

1 https://ascopubs.org/toc/jco/14/1 2 https://ascopubs.org/toc/jco/14/2 3 https://ascopubs.org/toc/jco/14/3 4 https://ascopubs.org/toc/jco/14/4 5 https://ascopubs.org/toc/jco/14/5 6 https://ascopubs.org/toc/jco/14/6 7 https://ascopubs.org/toc/jco/14/7 8 https://ascopubs.org/toc/jco/14/8 9 https://ascopubs.org/toc/jco/14/9 10 https://ascopubs.org/toc/jco/14/10 11 https://ascopubs.org/toc/jco/14/11 12 https://ascopubs.org/toc/jco/14/12 

 35%|███▌      | 14/40 [05:09<11:27, 26.45s/it]

1 https://ascopubs.org/toc/jco/15/1 2 https://ascopubs.org/toc/jco/15/2 3 https://ascopubs.org/toc/jco/15/3 4 https://ascopubs.org/toc/jco/15/4 5 https://ascopubs.org/toc/jco/15/5 6 https://ascopubs.org/toc/jco/15/6 7 https://ascopubs.org/toc/jco/15/7 8 https://ascopubs.org/toc/jco/15/8 9 https://ascopubs.org/toc/jco/15/9 10 https://ascopubs.org/toc/jco/15/10 11 https://ascopubs.org/toc/jco/15/11 12 https://ascopubs.org/toc/jco/15/12 

 38%|███▊      | 15/40 [05:39<11:25, 27.42s/it]

1 https://ascopubs.org/toc/jco/16/1 2 https://ascopubs.org/toc/jco/16/2 3 https://ascopubs.org/toc/jco/16/3 4 https://ascopubs.org/toc/jco/16/4 5 https://ascopubs.org/toc/jco/16/5 6 https://ascopubs.org/toc/jco/16/6 7 https://ascopubs.org/toc/jco/16/7 8 https://ascopubs.org/toc/jco/16/8 9 https://ascopubs.org/toc/jco/16/9 10 https://ascopubs.org/toc/jco/16/10 11 https://ascopubs.org/toc/jco/16/11 12 https://ascopubs.org/toc/jco/16/12 

 40%|████      | 16/40 [06:14<11:55, 29.80s/it]

1 https://ascopubs.org/toc/jco/17/1 2 https://ascopubs.org/toc/jco/17/2 3 https://ascopubs.org/toc/jco/17/3 4 https://ascopubs.org/toc/jco/17/4 5 https://ascopubs.org/toc/jco/17/5 6 https://ascopubs.org/toc/jco/17/6 7 https://ascopubs.org/toc/jco/17/7 8 https://ascopubs.org/toc/jco/17/8 9 https://ascopubs.org/toc/jco/17/9 10 https://ascopubs.org/toc/jco/17/10 11 https://ascopubs.org/toc/jco/17/11 12 https://ascopubs.org/toc/jco/17/12 

 42%|████▎     | 17/40 [06:46<11:40, 30.47s/it]

1 https://ascopubs.org/toc/jco/18/1 2 https://ascopubs.org/toc/jco/18/2 3 https://ascopubs.org/toc/jco/18/3 4 https://ascopubs.org/toc/jco/18/4 5 https://ascopubs.org/toc/jco/18/5 6 https://ascopubs.org/toc/jco/18/6 7 https://ascopubs.org/toc/jco/18/7 8 https://ascopubs.org/toc/jco/18/8 9 https://ascopubs.org/toc/jco/18/9 10 https://ascopubs.org/toc/jco/18/10 11 https://ascopubs.org/toc/jco/18/11 12 https://ascopubs.org/toc/jco/18/12 13 https://ascopubs.org/toc/jco/18/13 14 https://ascopubs.org/toc/jco/18/14 15 https://ascopubs.org/toc/jco/18/15 16 https://ascopubs.org/toc/jco/18/16 17 https://ascopubs.org/toc/jco/18/17 18 https://ascopubs.org/toc/jco/18/18 19 https://ascopubs.org/toc/jco/18/19 20 https://ascopubs.org/toc/jco/18/20 21 https://ascopubs.org/toc/jco/18/21 22 https://ascopubs.org/toc/jco/18/22 23 https://ascopubs.org/toc/jco/18/23 24 https://ascopubs.org/toc/jco/18/24 

 45%|████▌     | 18/40 [07:29<12:33, 34.23s/it]

1 https://ascopubs.org/toc/jco/19/1 2 https://ascopubs.org/toc/jco/19/2 3 https://ascopubs.org/toc/jco/19/3 4 https://ascopubs.org/toc/jco/19/4 5 https://ascopubs.org/toc/jco/19/5 6 https://ascopubs.org/toc/jco/19/6 7 https://ascopubs.org/toc/jco/19/7 8 https://ascopubs.org/toc/jco/19/8 9 https://ascopubs.org/toc/jco/19/9 10 https://ascopubs.org/toc/jco/19/10 11 https://ascopubs.org/toc/jco/19/11 12 https://ascopubs.org/toc/jco/19/12 13 https://ascopubs.org/toc/jco/19/13 14 https://ascopubs.org/toc/jco/19/14 15 https://ascopubs.org/toc/jco/19/15 16 https://ascopubs.org/toc/jco/19/16 17 https://ascopubs.org/toc/jco/19/17 18 https://ascopubs.org/toc/jco/19/18 19 https://ascopubs.org/toc/jco/19/19 20 https://ascopubs.org/toc/jco/19/20 21 https://ascopubs.org/toc/jco/19/21 22 https://ascopubs.org/toc/jco/19/22 23 https://ascopubs.org/toc/jco/19/23 24 https://ascopubs.org/toc/jco/19/24 

 48%|████▊     | 19/40 [08:14<13:04, 37.36s/it]

1 https://ascopubs.org/toc/jco/20/1 2 https://ascopubs.org/toc/jco/20/2 3 https://ascopubs.org/toc/jco/20/3 4 https://ascopubs.org/toc/jco/20/4 5 https://ascopubs.org/toc/jco/20/5 6 https://ascopubs.org/toc/jco/20/6 7 https://ascopubs.org/toc/jco/20/7 8 https://ascopubs.org/toc/jco/20/8 9 https://ascopubs.org/toc/jco/20/9 10 https://ascopubs.org/toc/jco/20/10 11 https://ascopubs.org/toc/jco/20/11 12 https://ascopubs.org/toc/jco/20/12 13 https://ascopubs.org/toc/jco/20/13 14 https://ascopubs.org/toc/jco/20/14 15 https://ascopubs.org/toc/jco/20/15 16 https://ascopubs.org/toc/jco/20/16 17 https://ascopubs.org/toc/jco/20/17 18 https://ascopubs.org/toc/jco/20/18 19 https://ascopubs.org/toc/jco/20/19 20 https://ascopubs.org/toc/jco/20/20 21 https://ascopubs.org/toc/jco/20/21 22 https://ascopubs.org/toc/jco/20/22 23 https://ascopubs.org/toc/jco/20/23 24 https://ascopubs.org/toc/jco/20/24 

 50%|█████     | 20/40 [09:06<13:57, 41.87s/it]

1 https://ascopubs.org/toc/jco/21/1 2 https://ascopubs.org/toc/jco/21/2 3 https://ascopubs.org/toc/jco/21/3 4 https://ascopubs.org/toc/jco/21/4 5 https://ascopubs.org/toc/jco/21/5 6 https://ascopubs.org/toc/jco/21/6 7 https://ascopubs.org/toc/jco/21/7 8 https://ascopubs.org/toc/jco/21/8 9 https://ascopubs.org/toc/jco/21/9 10 https://ascopubs.org/toc/jco/21/10 11 https://ascopubs.org/toc/jco/21/11 12 https://ascopubs.org/toc/jco/21/12 13 https://ascopubs.org/toc/jco/21/13 14 https://ascopubs.org/toc/jco/21/14 15 https://ascopubs.org/toc/jco/21/15 16 https://ascopubs.org/toc/jco/21/16 17 https://ascopubs.org/toc/jco/21/17 18 https://ascopubs.org/toc/jco/21/18 19 https://ascopubs.org/toc/jco/21/19 20 https://ascopubs.org/toc/jco/21/20 21 https://ascopubs.org/toc/jco/21/21 22 https://ascopubs.org/toc/jco/21/22 23 https://ascopubs.org/toc/jco/21/23 24 https://ascopubs.org/toc/jco/21/24 

 52%|█████▎    | 21/40 [10:02<14:36, 46.11s/it]

1 https://ascopubs.org/toc/jco/22/1 2 https://ascopubs.org/toc/jco/22/2 3 https://ascopubs.org/toc/jco/22/3 4 https://ascopubs.org/toc/jco/22/4 5 https://ascopubs.org/toc/jco/22/5 6 https://ascopubs.org/toc/jco/22/6 7 https://ascopubs.org/toc/jco/22/7 8 https://ascopubs.org/toc/jco/22/8 9 https://ascopubs.org/toc/jco/22/9 10 https://ascopubs.org/toc/jco/22/10 11 https://ascopubs.org/toc/jco/22/11 12 https://ascopubs.org/toc/jco/22/12 13 https://ascopubs.org/toc/jco/22/13 14 https://ascopubs.org/toc/jco/22/14 15 https://ascopubs.org/toc/jco/22/15 16 https://ascopubs.org/toc/jco/22/16 17 https://ascopubs.org/toc/jco/22/17 18 https://ascopubs.org/toc/jco/22/18 19 https://ascopubs.org/toc/jco/22/19 20 https://ascopubs.org/toc/jco/22/20 21 https://ascopubs.org/toc/jco/22/21 22 https://ascopubs.org/toc/jco/22/22 23 https://ascopubs.org/toc/jco/22/23 24 https://ascopubs.org/toc/jco/22/24 

 55%|█████▌    | 22/40 [10:59<14:48, 49.35s/it]

1 https://ascopubs.org/toc/jco/23/1 2 https://ascopubs.org/toc/jco/23/2 3 https://ascopubs.org/toc/jco/23/3 4 https://ascopubs.org/toc/jco/23/4 5 https://ascopubs.org/toc/jco/23/5 6 https://ascopubs.org/toc/jco/23/6 7 https://ascopubs.org/toc/jco/23/7 8 https://ascopubs.org/toc/jco/23/8 9 https://ascopubs.org/toc/jco/23/9 10 https://ascopubs.org/toc/jco/23/10 11 https://ascopubs.org/toc/jco/23/11 12 https://ascopubs.org/toc/jco/23/12 13 https://ascopubs.org/toc/jco/23/13 14 https://ascopubs.org/toc/jco/23/14 15 https://ascopubs.org/toc/jco/23/15 16 https://ascopubs.org/toc/jco/23/16 17 https://ascopubs.org/toc/jco/23/17 18 https://ascopubs.org/toc/jco/23/18 19 https://ascopubs.org/toc/jco/23/19 20 https://ascopubs.org/toc/jco/23/20 21 https://ascopubs.org/toc/jco/23/21 22 https://ascopubs.org/toc/jco/23/22 23 https://ascopubs.org/toc/jco/23/23 24 https://ascopubs.org/toc/jco/23/24 25 https://ascopubs.org/toc/jco/23/25 26 https://ascopubs.org/toc/jco/23/26 27 https://ascopubs.org/toc/jc

 57%|█████▊    | 23/40 [12:33<17:46, 62.72s/it]

1 https://ascopubs.org/toc/jco/24/1 2 https://ascopubs.org/toc/jco/24/2 3 https://ascopubs.org/toc/jco/24/3 4 https://ascopubs.org/toc/jco/24/4 5 https://ascopubs.org/toc/jco/24/5 6 https://ascopubs.org/toc/jco/24/6 7 https://ascopubs.org/toc/jco/24/7 8 https://ascopubs.org/toc/jco/24/8 9 https://ascopubs.org/toc/jco/24/9 10 https://ascopubs.org/toc/jco/24/10 11 https://ascopubs.org/toc/jco/24/11 12 https://ascopubs.org/toc/jco/24/12 13 https://ascopubs.org/toc/jco/24/13 14 https://ascopubs.org/toc/jco/24/14 15 https://ascopubs.org/toc/jco/24/15 16 https://ascopubs.org/toc/jco/24/16 17 https://ascopubs.org/toc/jco/24/17 18 https://ascopubs.org/toc/jco/24/18 19 https://ascopubs.org/toc/jco/24/19 20 https://ascopubs.org/toc/jco/24/20 21 https://ascopubs.org/toc/jco/24/21 22 https://ascopubs.org/toc/jco/24/22 23 https://ascopubs.org/toc/jco/24/23 24 https://ascopubs.org/toc/jco/24/24 25 https://ascopubs.org/toc/jco/24/25 26 https://ascopubs.org/toc/jco/24/26 27 https://ascopubs.org/toc/jc

 60%|██████    | 24/40 [13:47<17:36, 66.04s/it]

1 https://ascopubs.org/toc/jco/25/1 2 https://ascopubs.org/toc/jco/25/2 3 https://ascopubs.org/toc/jco/25/3 4 https://ascopubs.org/toc/jco/25/4 5 https://ascopubs.org/toc/jco/25/5 6 https://ascopubs.org/toc/jco/25/6 7 https://ascopubs.org/toc/jco/25/7 8 https://ascopubs.org/toc/jco/25/8 9 https://ascopubs.org/toc/jco/25/9 10 https://ascopubs.org/toc/jco/25/10 11 https://ascopubs.org/toc/jco/25/11 12 https://ascopubs.org/toc/jco/25/12 13 https://ascopubs.org/toc/jco/25/13 14 https://ascopubs.org/toc/jco/25/14 15 https://ascopubs.org/toc/jco/25/15 16 https://ascopubs.org/toc/jco/25/16 17 https://ascopubs.org/toc/jco/25/17 18 https://ascopubs.org/toc/jco/25/18 19 https://ascopubs.org/toc/jco/25/19 20 https://ascopubs.org/toc/jco/25/20 21 https://ascopubs.org/toc/jco/25/21 22 https://ascopubs.org/toc/jco/25/22 23 https://ascopubs.org/toc/jco/25/23 24 https://ascopubs.org/toc/jco/25/24 25 https://ascopubs.org/toc/jco/25/25 26 https://ascopubs.org/toc/jco/25/26 27 https://ascopubs.org/toc/jc

 62%|██████▎   | 25/40 [15:04<17:22, 69.49s/it]

1 https://ascopubs.org/toc/jco/26/1 2 https://ascopubs.org/toc/jco/26/2 3 https://ascopubs.org/toc/jco/26/3 4 https://ascopubs.org/toc/jco/26/4 5 https://ascopubs.org/toc/jco/26/5 6 https://ascopubs.org/toc/jco/26/6 7 https://ascopubs.org/toc/jco/26/7 8 https://ascopubs.org/toc/jco/26/8 9 https://ascopubs.org/toc/jco/26/9 10 https://ascopubs.org/toc/jco/26/10 11 https://ascopubs.org/toc/jco/26/11 12 https://ascopubs.org/toc/jco/26/12 13 https://ascopubs.org/toc/jco/26/13 14 https://ascopubs.org/toc/jco/26/14 15 https://ascopubs.org/toc/jco/26/15 16 https://ascopubs.org/toc/jco/26/16 17 https://ascopubs.org/toc/jco/26/17 18 https://ascopubs.org/toc/jco/26/18 19 https://ascopubs.org/toc/jco/26/19 20 https://ascopubs.org/toc/jco/26/20 21 https://ascopubs.org/toc/jco/26/21 22 https://ascopubs.org/toc/jco/26/22 23 https://ascopubs.org/toc/jco/26/23 24 https://ascopubs.org/toc/jco/26/24 25 https://ascopubs.org/toc/jco/26/25 26 https://ascopubs.org/toc/jco/26/26 27 https://ascopubs.org/toc/jc

 65%|██████▌   | 26/40 [16:30<17:21, 74.37s/it]

1 https://ascopubs.org/toc/jco/27/1 2 https://ascopubs.org/toc/jco/27/2 3 https://ascopubs.org/toc/jco/27/3 4 https://ascopubs.org/toc/jco/27/4 5 https://ascopubs.org/toc/jco/27/5 6 https://ascopubs.org/toc/jco/27/6 7 https://ascopubs.org/toc/jco/27/7 8 https://ascopubs.org/toc/jco/27/8 9 https://ascopubs.org/toc/jco/27/9 10 https://ascopubs.org/toc/jco/27/10 11 https://ascopubs.org/toc/jco/27/11 12 https://ascopubs.org/toc/jco/27/12 13 https://ascopubs.org/toc/jco/27/13 14 https://ascopubs.org/toc/jco/27/14 15 https://ascopubs.org/toc/jco/27/15 16 https://ascopubs.org/toc/jco/27/16 17 https://ascopubs.org/toc/jco/27/17 18 https://ascopubs.org/toc/jco/27/18 19 https://ascopubs.org/toc/jco/27/19 20 https://ascopubs.org/toc/jco/27/20 21 https://ascopubs.org/toc/jco/27/21 22 https://ascopubs.org/toc/jco/27/22 23 https://ascopubs.org/toc/jco/27/23 24 https://ascopubs.org/toc/jco/27/24 25 https://ascopubs.org/toc/jco/27/25 26 https://ascopubs.org/toc/jco/27/26 27 https://ascopubs.org/toc/jc

 68%|██████▊   | 27/40 [17:55<16:46, 77.39s/it]

1 https://ascopubs.org/toc/jco/28/1 2 https://ascopubs.org/toc/jco/28/2 3 https://ascopubs.org/toc/jco/28/3 4 https://ascopubs.org/toc/jco/28/4 5 https://ascopubs.org/toc/jco/28/5 6 https://ascopubs.org/toc/jco/28/6 7 https://ascopubs.org/toc/jco/28/7 8 https://ascopubs.org/toc/jco/28/8 9 https://ascopubs.org/toc/jco/28/9 10 https://ascopubs.org/toc/jco/28/10 11 https://ascopubs.org/toc/jco/28/11 12 https://ascopubs.org/toc/jco/28/12 13 https://ascopubs.org/toc/jco/28/13 14 https://ascopubs.org/toc/jco/28/14 15 https://ascopubs.org/toc/jco/28/15 16 https://ascopubs.org/toc/jco/28/16 17 https://ascopubs.org/toc/jco/28/17 18 https://ascopubs.org/toc/jco/28/18 19 https://ascopubs.org/toc/jco/28/19 20 https://ascopubs.org/toc/jco/28/20 21 https://ascopubs.org/toc/jco/28/21 22 https://ascopubs.org/toc/jco/28/22 23 https://ascopubs.org/toc/jco/28/23 24 https://ascopubs.org/toc/jco/28/24 25 https://ascopubs.org/toc/jco/28/25 26 https://ascopubs.org/toc/jco/28/26 27 https://ascopubs.org/toc/jc

 70%|███████   | 28/40 [19:16<15:42, 78.53s/it]

1 https://ascopubs.org/toc/jco/29/1 2 https://ascopubs.org/toc/jco/29/2 3 https://ascopubs.org/toc/jco/29/3 4 https://ascopubs.org/toc/jco/29/4 5 https://ascopubs.org/toc/jco/29/5 6 https://ascopubs.org/toc/jco/29/6 7 https://ascopubs.org/toc/jco/29/7 8 https://ascopubs.org/toc/jco/29/8 9 https://ascopubs.org/toc/jco/29/9 10 https://ascopubs.org/toc/jco/29/10 11 https://ascopubs.org/toc/jco/29/11 12 https://ascopubs.org/toc/jco/29/12 13 https://ascopubs.org/toc/jco/29/13 14 https://ascopubs.org/toc/jco/29/14 15 https://ascopubs.org/toc/jco/29/15 16 https://ascopubs.org/toc/jco/29/16 17 https://ascopubs.org/toc/jco/29/17 18 https://ascopubs.org/toc/jco/29/18 19 https://ascopubs.org/toc/jco/29/19 20 https://ascopubs.org/toc/jco/29/20 21 https://ascopubs.org/toc/jco/29/21 22 https://ascopubs.org/toc/jco/29/22 23 https://ascopubs.org/toc/jco/29/23 24 https://ascopubs.org/toc/jco/29/24 25 https://ascopubs.org/toc/jco/29/25 26 https://ascopubs.org/toc/jco/29/26 27 https://ascopubs.org/toc/jc

 72%|███████▎  | 29/40 [20:40<14:41, 80.14s/it]

1 https://ascopubs.org/toc/jco/30/1 2 https://ascopubs.org/toc/jco/30/2 3 https://ascopubs.org/toc/jco/30/3 4 https://ascopubs.org/toc/jco/30/4 5 https://ascopubs.org/toc/jco/30/5 6 https://ascopubs.org/toc/jco/30/6 7 https://ascopubs.org/toc/jco/30/7 8 https://ascopubs.org/toc/jco/30/8 9 https://ascopubs.org/toc/jco/30/9 10 https://ascopubs.org/toc/jco/30/10 11 https://ascopubs.org/toc/jco/30/11 12 https://ascopubs.org/toc/jco/30/12 13 https://ascopubs.org/toc/jco/30/13 14 https://ascopubs.org/toc/jco/30/14 15 https://ascopubs.org/toc/jco/30/15 16 https://ascopubs.org/toc/jco/30/16 17 https://ascopubs.org/toc/jco/30/17 18 https://ascopubs.org/toc/jco/30/18 19 https://ascopubs.org/toc/jco/30/19 20 https://ascopubs.org/toc/jco/30/20 21 https://ascopubs.org/toc/jco/30/21 22 https://ascopubs.org/toc/jco/30/22 23 https://ascopubs.org/toc/jco/30/23 24 https://ascopubs.org/toc/jco/30/24 25 https://ascopubs.org/toc/jco/30/25 26 https://ascopubs.org/toc/jco/30/26 27 https://ascopubs.org/toc/jc

 75%|███████▌  | 30/40 [21:53<13:02, 78.21s/it]

1 https://ascopubs.org/toc/jco/31/1 2 https://ascopubs.org/toc/jco/31/2 3 https://ascopubs.org/toc/jco/31/3 4 https://ascopubs.org/toc/jco/31/4 5 https://ascopubs.org/toc/jco/31/5 6 https://ascopubs.org/toc/jco/31/6 7 https://ascopubs.org/toc/jco/31/7 8 https://ascopubs.org/toc/jco/31/8 9 https://ascopubs.org/toc/jco/31/9 10 https://ascopubs.org/toc/jco/31/10 11 https://ascopubs.org/toc/jco/31/11 12 https://ascopubs.org/toc/jco/31/12 13 https://ascopubs.org/toc/jco/31/13 14 https://ascopubs.org/toc/jco/31/14 15 https://ascopubs.org/toc/jco/31/15 16 https://ascopubs.org/toc/jco/31/16 17 https://ascopubs.org/toc/jco/31/17 18 https://ascopubs.org/toc/jco/31/18 19 https://ascopubs.org/toc/jco/31/19 20 https://ascopubs.org/toc/jco/31/20 21 https://ascopubs.org/toc/jco/31/21 22 https://ascopubs.org/toc/jco/31/22 23 https://ascopubs.org/toc/jco/31/23 24 https://ascopubs.org/toc/jco/31/24 25 https://ascopubs.org/toc/jco/31/25 26 https://ascopubs.org/toc/jco/31/26 27 https://ascopubs.org/toc/jc

 78%|███████▊  | 31/40 [23:06<11:27, 76.37s/it]

1 https://ascopubs.org/toc/jco/32/1 2 https://ascopubs.org/toc/jco/32/2 3 https://ascopubs.org/toc/jco/32/3 4 https://ascopubs.org/toc/jco/32/4 5 https://ascopubs.org/toc/jco/32/5 6 https://ascopubs.org/toc/jco/32/6 7 https://ascopubs.org/toc/jco/32/7 8 https://ascopubs.org/toc/jco/32/8 9 https://ascopubs.org/toc/jco/32/9 10 https://ascopubs.org/toc/jco/32/10 11 https://ascopubs.org/toc/jco/32/11 12 https://ascopubs.org/toc/jco/32/12 13 https://ascopubs.org/toc/jco/32/13 14 https://ascopubs.org/toc/jco/32/14 15 https://ascopubs.org/toc/jco/32/15 16 https://ascopubs.org/toc/jco/32/16 17 https://ascopubs.org/toc/jco/32/17 18 https://ascopubs.org/toc/jco/32/18 19 https://ascopubs.org/toc/jco/32/19 20 https://ascopubs.org/toc/jco/32/20 21 https://ascopubs.org/toc/jco/32/21 22 https://ascopubs.org/toc/jco/32/22 23 https://ascopubs.org/toc/jco/32/23 24 https://ascopubs.org/toc/jco/32/24 25 https://ascopubs.org/toc/jco/32/25 26 https://ascopubs.org/toc/jco/32/26 27 https://ascopubs.org/toc/jc

 80%|████████  | 32/40 [24:06<09:32, 71.57s/it]

1 https://ascopubs.org/toc/jco/33/1 2 https://ascopubs.org/toc/jco/33/2 3 https://ascopubs.org/toc/jco/33/3 4 https://ascopubs.org/toc/jco/33/4 5 https://ascopubs.org/toc/jco/33/5 6 https://ascopubs.org/toc/jco/33/6 7 https://ascopubs.org/toc/jco/33/7 8 https://ascopubs.org/toc/jco/33/8 9 https://ascopubs.org/toc/jco/33/9 10 https://ascopubs.org/toc/jco/33/10 11 https://ascopubs.org/toc/jco/33/11 12 https://ascopubs.org/toc/jco/33/12 13 https://ascopubs.org/toc/jco/33/13 14 https://ascopubs.org/toc/jco/33/14 15 https://ascopubs.org/toc/jco/33/15 16 https://ascopubs.org/toc/jco/33/16 17 https://ascopubs.org/toc/jco/33/17 18 https://ascopubs.org/toc/jco/33/18 19 https://ascopubs.org/toc/jco/33/19 20 https://ascopubs.org/toc/jco/33/20 21 https://ascopubs.org/toc/jco/33/21 22 https://ascopubs.org/toc/jco/33/22 23 https://ascopubs.org/toc/jco/33/23 24 https://ascopubs.org/toc/jco/33/24 25 https://ascopubs.org/toc/jco/33/25 26 https://ascopubs.org/toc/jco/33/26 27 https://ascopubs.org/toc/jc

 82%|████████▎ | 33/40 [25:08<08:00, 68.59s/it]

1 https://ascopubs.org/toc/jco/34/1 2 https://ascopubs.org/toc/jco/34/2 3 https://ascopubs.org/toc/jco/34/3 4 https://ascopubs.org/toc/jco/34/4 5 https://ascopubs.org/toc/jco/34/5 6 https://ascopubs.org/toc/jco/34/6 7 https://ascopubs.org/toc/jco/34/7 8 https://ascopubs.org/toc/jco/34/8 9 https://ascopubs.org/toc/jco/34/9 10 https://ascopubs.org/toc/jco/34/10 11 https://ascopubs.org/toc/jco/34/11 12 https://ascopubs.org/toc/jco/34/12 13 https://ascopubs.org/toc/jco/34/13 14 https://ascopubs.org/toc/jco/34/14 15 https://ascopubs.org/toc/jco/34/15 16 https://ascopubs.org/toc/jco/34/16 17 https://ascopubs.org/toc/jco/34/17 18 https://ascopubs.org/toc/jco/34/18 19 https://ascopubs.org/toc/jco/34/19 20 https://ascopubs.org/toc/jco/34/20 21 https://ascopubs.org/toc/jco/34/21 22 https://ascopubs.org/toc/jco/34/22 23 https://ascopubs.org/toc/jco/34/23 24 https://ascopubs.org/toc/jco/34/24 25 https://ascopubs.org/toc/jco/34/25 26 https://ascopubs.org/toc/jco/34/26 27 https://ascopubs.org/toc/jc

 85%|████████▌ | 34/40 [26:07<06:35, 65.92s/it]

1 https://ascopubs.org/toc/jco/35/1 2 https://ascopubs.org/toc/jco/35/2 3 https://ascopubs.org/toc/jco/35/3 4 https://ascopubs.org/toc/jco/35/4 5 https://ascopubs.org/toc/jco/35/5 6 https://ascopubs.org/toc/jco/35/6 7 https://ascopubs.org/toc/jco/35/7 8 https://ascopubs.org/toc/jco/35/8 9 https://ascopubs.org/toc/jco/35/9 10 https://ascopubs.org/toc/jco/35/10 11 https://ascopubs.org/toc/jco/35/11 12 https://ascopubs.org/toc/jco/35/12 13 https://ascopubs.org/toc/jco/35/13 14 https://ascopubs.org/toc/jco/35/14 15 https://ascopubs.org/toc/jco/35/15 16 https://ascopubs.org/toc/jco/35/16 17 https://ascopubs.org/toc/jco/35/17 18 https://ascopubs.org/toc/jco/35/18 19 https://ascopubs.org/toc/jco/35/19 20 https://ascopubs.org/toc/jco/35/20 21 https://ascopubs.org/toc/jco/35/21 22 https://ascopubs.org/toc/jco/35/22 23 https://ascopubs.org/toc/jco/35/23 24 https://ascopubs.org/toc/jco/35/24 25 https://ascopubs.org/toc/jco/35/25 26 https://ascopubs.org/toc/jco/35/26 27 https://ascopubs.org/toc/jc

 88%|████████▊ | 35/40 [26:55<05:02, 60.58s/it]

1 https://ascopubs.org/toc/jco/36/1 2 https://ascopubs.org/toc/jco/36/2 3 https://ascopubs.org/toc/jco/36/3 4 https://ascopubs.org/toc/jco/36/4 5 https://ascopubs.org/toc/jco/36/5 6 https://ascopubs.org/toc/jco/36/6 7 https://ascopubs.org/toc/jco/36/7 8 https://ascopubs.org/toc/jco/36/8 9 https://ascopubs.org/toc/jco/36/9 10 https://ascopubs.org/toc/jco/36/10 11 https://ascopubs.org/toc/jco/36/11 12 https://ascopubs.org/toc/jco/36/12 13 https://ascopubs.org/toc/jco/36/13 14 https://ascopubs.org/toc/jco/36/14 15 https://ascopubs.org/toc/jco/36/15 16 https://ascopubs.org/toc/jco/36/16 17 https://ascopubs.org/toc/jco/36/17 18 https://ascopubs.org/toc/jco/36/18 19 https://ascopubs.org/toc/jco/36/19 20 https://ascopubs.org/toc/jco/36/20 21 https://ascopubs.org/toc/jco/36/21 22 https://ascopubs.org/toc/jco/36/22 23 https://ascopubs.org/toc/jco/36/23 24 https://ascopubs.org/toc/jco/36/24 25 https://ascopubs.org/toc/jco/36/25 26 https://ascopubs.org/toc/jco/36/26 27 https://ascopubs.org/toc/jc

 90%|█████████ | 36/40 [27:38<03:40, 55.18s/it]

1 https://ascopubs.org/toc/jco/37/1 2 https://ascopubs.org/toc/jco/37/2 3 https://ascopubs.org/toc/jco/37/3 4 https://ascopubs.org/toc/jco/37/4 5 https://ascopubs.org/toc/jco/37/5 6 https://ascopubs.org/toc/jco/37/6 7 https://ascopubs.org/toc/jco/37/7 8 https://ascopubs.org/toc/jco/37/8 9 https://ascopubs.org/toc/jco/37/9 10 https://ascopubs.org/toc/jco/37/10 11 https://ascopubs.org/toc/jco/37/11 12 https://ascopubs.org/toc/jco/37/12 13 https://ascopubs.org/toc/jco/37/13 14 https://ascopubs.org/toc/jco/37/14 15 https://ascopubs.org/toc/jco/37/15 16 https://ascopubs.org/toc/jco/37/16 17 https://ascopubs.org/toc/jco/37/17 18 https://ascopubs.org/toc/jco/37/18 19 https://ascopubs.org/toc/jco/37/19 20 https://ascopubs.org/toc/jco/37/20 21 https://ascopubs.org/toc/jco/37/21 22 https://ascopubs.org/toc/jco/37/22 23 https://ascopubs.org/toc/jco/37/23 24 https://ascopubs.org/toc/jco/37/24 25 https://ascopubs.org/toc/jco/37/25 26 https://ascopubs.org/toc/jco/37/26 27 https://ascopubs.org/toc/jc

 92%|█████████▎| 37/40 [28:12<02:26, 48.83s/it]

1 https://ascopubs.org/toc/jco/38/1 2 https://ascopubs.org/toc/jco/38/2 3 https://ascopubs.org/toc/jco/38/3 4 https://ascopubs.org/toc/jco/38/4 5 https://ascopubs.org/toc/jco/38/5 6 https://ascopubs.org/toc/jco/38/6 7 https://ascopubs.org/toc/jco/38/7 8 https://ascopubs.org/toc/jco/38/8 9 https://ascopubs.org/toc/jco/38/9 10 https://ascopubs.org/toc/jco/38/10 11 https://ascopubs.org/toc/jco/38/11 12 https://ascopubs.org/toc/jco/38/12 13 https://ascopubs.org/toc/jco/38/13 14 https://ascopubs.org/toc/jco/38/14 15 https://ascopubs.org/toc/jco/38/15 16 https://ascopubs.org/toc/jco/38/16 17 https://ascopubs.org/toc/jco/38/17 18 https://ascopubs.org/toc/jco/38/18 19 https://ascopubs.org/toc/jco/38/19 20 https://ascopubs.org/toc/jco/38/20 21 https://ascopubs.org/toc/jco/38/21 22 https://ascopubs.org/toc/jco/38/22 23 https://ascopubs.org/toc/jco/38/23 24 https://ascopubs.org/toc/jco/38/24 25 https://ascopubs.org/toc/jco/38/25 26 https://ascopubs.org/toc/jco/38/26 27 https://ascopubs.org/toc/jc

 95%|█████████▌| 38/40 [28:49<01:30, 45.40s/it]

1 https://ascopubs.org/toc/jco/39/1 2 https://ascopubs.org/toc/jco/39/2 3 https://ascopubs.org/toc/jco/39/3 4 https://ascopubs.org/toc/jco/39/4 5 https://ascopubs.org/toc/jco/39/5 6 https://ascopubs.org/toc/jco/39/6 7 https://ascopubs.org/toc/jco/39/7 8 https://ascopubs.org/toc/jco/39/8 9 https://ascopubs.org/toc/jco/39/9 10 https://ascopubs.org/toc/jco/39/10 11 https://ascopubs.org/toc/jco/39/11 12 https://ascopubs.org/toc/jco/39/12 13 https://ascopubs.org/toc/jco/39/13 14 https://ascopubs.org/toc/jco/39/14 15 https://ascopubs.org/toc/jco/39/15 16 https://ascopubs.org/toc/jco/39/16 17 https://ascopubs.org/toc/jco/39/17 18 https://ascopubs.org/toc/jco/39/18 19 https://ascopubs.org/toc/jco/39/19 20 https://ascopubs.org/toc/jco/39/20 21 https://ascopubs.org/toc/jco/39/21 22 https://ascopubs.org/toc/jco/39/22 23 https://ascopubs.org/toc/jco/39/23 24 https://ascopubs.org/toc/jco/39/24 25 https://ascopubs.org/toc/jco/39/25 26 https://ascopubs.org/toc/jco/39/26 27 https://ascopubs.org/toc/jc

 98%|█████████▊| 39/40 [29:36<00:45, 45.87s/it]

1 https://ascopubs.org/toc/jco/40/1 2 https://ascopubs.org/toc/jco/40/2 3 https://ascopubs.org/toc/jco/40/3 4 https://ascopubs.org/toc/jco/40/4 5 https://ascopubs.org/toc/jco/40/5 6 https://ascopubs.org/toc/jco/40/6 7 https://ascopubs.org/toc/jco/40/7 8 https://ascopubs.org/toc/jco/40/8 9 https://ascopubs.org/toc/jco/40/9 10 https://ascopubs.org/toc/jco/40/10 11 https://ascopubs.org/toc/jco/40/11 12 https://ascopubs.org/toc/jco/40/12 13 https://ascopubs.org/toc/jco/40/13 14 https://ascopubs.org/toc/jco/40/14 15 https://ascopubs.org/toc/jco/40/15 16 https://ascopubs.org/toc/jco/40/16 17 https://ascopubs.org/toc/jco/40/17 18 https://ascopubs.org/toc/jco/40/18 19 https://ascopubs.org/toc/jco/40/19 20 https://ascopubs.org/toc/jco/40/20 21 https://ascopubs.org/toc/jco/40/21 22 https://ascopubs.org/toc/jco/40/22 23 https://ascopubs.org/toc/jco/40/23 24 https://ascopubs.org/toc/jco/40/24 25 https://ascopubs.org/toc/jco/40/25 26 https://ascopubs.org/toc/jco/40/26 27 https://ascopubs.org/toc/jc

100%|██████████| 40/40 [30:07<00:00, 45.19s/it]


In [16]:
# creatting a single list of all type of data
all_titles = []
all_authors = []
all_articles = []
all_years = []
year = 1983
for item in data:
    for year_data in item:
        all_titles += year_data[0]
        all_authors += year_data[1]
        all_articles += year_data[2]
        all_years += [str(year) for x in range(len(year_data[0]))]
    year += 1

# creating a dictionary of data
data_dic = {'title':all_titles, 'author': all_authors, 'pdf_url':all_articles, 'year':all_years}

In [17]:
print(len(all_titles))
print(len(all_authors))
print(len(all_articles))
print(len(all_years))

27325
27325
27325
27325


In [18]:
df = pd.DataFrame(data_dic)
df.head(10)

Unnamed: 0,title,author,pdf_url,year
0,A Journal for Oncologists,,https://ascopubs.org/doi/pdf/10.1200/JCO.1983....,1983
1,Adjuvant CMF in breast cancer: comparative 5-y...,Tancini et al.,https://ascopubs.org/doi/pdf/10.1200/JCO.1983....,1983
2,Histologic conversion in the non-Hodgkin's lym...,Acker et al.,https://ascopubs.org/doi/pdf/10.1200/JCO.1983....,1983
3,Feasibility study of combining metronidazole w...,Stewart et al.,https://ascopubs.org/doi/pdf/10.1200/JCO.1983....,1983
4,Constant infusion schedule for adriamycin: a p...,Lokich et al.,https://ascopubs.org/doi/pdf/10.1200/JCO.1983....,1983
5,Serum cyclophosphamide activity in patients tr...,Sarpel et al.,https://ascopubs.org/doi/pdf/10.1200/JCO.1983....,1983
6,Scalp hypothermia: a comparison of ice packs a...,Dean et al.,https://ascopubs.org/doi/pdf/10.1200/JCO.1983....,1983
7,Evaluation of prognostic factors in chemothera...,Eagan et al.,https://ascopubs.org/doi/pdf/10.1200/JCO.1983....,1983
8,The response of Ewing's sarcoma to sequential ...,Hayes et al.,https://ascopubs.org/doi/pdf/10.1200/JCO.1983....,1983
9,Methylglyoxal-bis(guanylhydrazone) (Methyl-GAG...,Warrell et al.,https://ascopubs.org/doi/pdf/10.1200/JCO.1983....,1983


In [19]:
df.to_csv("data.csv", index=False)

In [20]:
# read csv file 
df_data = pd.read_csv("data.csv")
df_data.shape


(27325, 4)

In [21]:
df_data.head()

Unnamed: 0,title,author,pdf_url,year
0,A Journal for Oncologists,,https://ascopubs.org/doi/pdf/10.1200/JCO.1983....,1983
1,Adjuvant CMF in breast cancer: comparative 5-y...,Tancini et al.,https://ascopubs.org/doi/pdf/10.1200/JCO.1983....,1983
2,Histologic conversion in the non-Hodgkin's lym...,Acker et al.,https://ascopubs.org/doi/pdf/10.1200/JCO.1983....,1983
3,Feasibility study of combining metronidazole w...,Stewart et al.,https://ascopubs.org/doi/pdf/10.1200/JCO.1983....,1983
4,Constant infusion schedule for adriamycin: a p...,Lokich et al.,https://ascopubs.org/doi/pdf/10.1200/JCO.1983....,1983


In [22]:
# remove columns with NaN values 
df_data = df_data.dropna()
df_data.head()

Unnamed: 0,title,author,pdf_url,year
1,Adjuvant CMF in breast cancer: comparative 5-y...,Tancini et al.,https://ascopubs.org/doi/pdf/10.1200/JCO.1983....,1983
2,Histologic conversion in the non-Hodgkin's lym...,Acker et al.,https://ascopubs.org/doi/pdf/10.1200/JCO.1983....,1983
3,Feasibility study of combining metronidazole w...,Stewart et al.,https://ascopubs.org/doi/pdf/10.1200/JCO.1983....,1983
4,Constant infusion schedule for adriamycin: a p...,Lokich et al.,https://ascopubs.org/doi/pdf/10.1200/JCO.1983....,1983
5,Serum cyclophosphamide activity in patients tr...,Sarpel et al.,https://ascopubs.org/doi/pdf/10.1200/JCO.1983....,1983


In [23]:
# remove the rows with empty title
df_data = df_data[df_data["title"] != ""]
df_data.head()
df_data.shape

# remove the rows with empty author
df_data = df_data[df_data["author"] != ""]
df_data.shape

# remove the rows with empty pdf_url
df_data = df_data[df_data["pdf_url"] != ""]
df_data.shape



(26245, 4)

In [24]:
# remove "et al." from the author column
df_data["author"] = df_data["author"].str.replace("et al.", "")
df_data.head()

  


Unnamed: 0,title,author,pdf_url,year
1,Adjuvant CMF in breast cancer: comparative 5-y...,Tancini,https://ascopubs.org/doi/pdf/10.1200/JCO.1983....,1983
2,Histologic conversion in the non-Hodgkin's lym...,Acker,https://ascopubs.org/doi/pdf/10.1200/JCO.1983....,1983
3,Feasibility study of combining metronidazole w...,Stewart,https://ascopubs.org/doi/pdf/10.1200/JCO.1983....,1983
4,Constant infusion schedule for adriamycin: a p...,Lokich,https://ascopubs.org/doi/pdf/10.1200/JCO.1983....,1983
5,Serum cyclophosphamide activity in patients tr...,Sarpel,https://ascopubs.org/doi/pdf/10.1200/JCO.1983....,1983


In [25]:
# find the unique authors
unique_authors = df_data["author"].unique()
print(len(unique_authors)  )

# find the unique titles
unique_titles = df_data["title"].unique()
print(len(unique_titles))

# find the unique pdf_urls
unique_pdf_urls = df_data["pdf_url"].unique()
print(len(unique_pdf_urls))

# find the unique years
unique_years = df_data["year"].unique()
print(len(unique_years))



11493
25051
26114
40


In [26]:
# make a column for the title
df_data["title"] = df_data["title"].str.lower()
df_data.head()
# store tilte col in a list
title_list = df_data["title"].tolist()
title_list



['adjuvant cmf in breast cancer: comparative 5-year results of 12 versus 6 cycles.',
 "histologic conversion in the non-hodgkin's lymphomas.",
 'feasibility study of combining metronidazole with chemotherapy.',
 'constant infusion schedule for adriamycin: a phase i-ii clinical trial of a 30-day schedule by ambulatory pump delivery system.',
 'serum cyclophosphamide activity in patients treated for small cell carcinoma of the lung.',
 'scalp hypothermia: a comparison of ice packs and the kold kap in the prevention of doxorubicin-induced alopecia.',
 'evaluation of prognostic factors in chemotherapy of recurrent brain tumors.',
 "the response of ewing's sarcoma to sequential cyclophosphamide and adriamycin induction therapy.",
 'methylglyoxal-bis(guanylhydrazone) (methyl-gag): current status and future prospects.',
 'reflections on medical oncology: an appeal for better clinical trials and improved reporting of their results.',
 'karnofsky memorial lecture. breaking the cure barrier.',
 