<a href="https://colab.research.google.com/github/ajurberg/article-parser/blob/main/05_text_cleaning_gdf11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
############################# 
#@title Installation of libraries 
#############################
import os
from time import time
import re, string, unicodedata
from string import punctuation
from tqdm import tqdm
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [None]:
############################# 
#@title Mounting Google Drive
#############################
from google.colab import drive
# NOTE(review): `_mount` is an underscore-prefixed (private) helper; the message printed by
# this cell refers to the public `drive.mount(...)` - confirm the private call is intentional.
drive._mount('/content/drive')

import os
# Work from the papers folder so that the relative file names used in later cells resolve against it.
workdir_path = '/content/drive/My Drive/papers'
os.chdir(workdir_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
############################# 
#@title Read json file to dictionary
#############################
import json

# Load the {paper id: raw text} dictionary. The encoding is given explicitly so that
# the file is not decoded with a platform-dependent default.
with open("gdf11-dictionary.json", encoding="utf-8") as json_file:
  gdf11_dict = json.load(json_file)

len(gdf11_dict)

334

In [None]:
############################# 
#@title Text preprocessing
#############################

def find_between(s, first, last):
  """Return the part of `s` that lies between `first` and the next `last`.

  `last` is searched for only after the end of the first occurrence of
  `first`. An empty string is returned when either marker is not found.
  """
  head = s.find(first)
  if head == -1:
    return ""
  begin = head + len(first)
  finish = s.find(last, begin)
  if finish == -1:
    return ""
  return s[begin:finish]

def cleaning_text(string):
  """Lower-case `string` and remove extraction noise from it.

  In order: newlines, tabs and the literal two-character sequence backslash-u
  become spaces; e-mail addresses, "fig. N" mentions, a few bullet/symbol
  characters, digit-only [..] and (..) groups, URLs and words of at most two
  characters are removed; runs of spaces are collapsed; finally "- " is
  deleted, which re-joins words split by a hyphen followed by a space.

  All regular expressions are raw strings. In a normal string literal "\\b" is
  a backspace character rather than a word boundary, which previously made the
  URL and short-word substitutions match nothing.

  Args:
    string: text to clean (str).

  Returns:
    The cleaned, lower-cased text.
  """
  string = string.lower()
  string = string.strip().replace("\n", ' ')
  string = string.strip().replace('\t', ' ')
  string = string.strip().replace(r"\u", ' ')  # literal backslash followed by 'u'
  # removes e-mails ('|' dropped from the last character class: inside [...] it is a literal pipe)
  string = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', "", string)
  string = re.sub(r"(fig. \d+)", "", string)
  string = string.strip().replace('®', '')
  string = re.sub("•", "", string)
  string = re.sub(r"\*", "", string)
  string = re.sub("·", "", string)
  string = re.sub("�+", " ", string)
  string = re.sub("● ▶+", "", string)
  #string = re.sub(r"\[(.*?)\]+", "", string) # replaces text between brackets
  string = re.sub(r"\[(\d*?)\]+", "", string) # replaces digits between brackets
  string = re.sub(r"\((\d*?)\)+", "", string) # replaces digits between parentheses (and also the parentheses)
  # Removes urls.
  # NOTE(review): the scheme is optional in this pattern, so dotted tokens such as
  # "0.05" or "i.e" are removed as well - confirm that this is acceptable.
  string = re.sub(r"(https?://|)[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)", "", string)
  string = re.sub(r'\b\w{1,2}\b', '', string) # removes words =< 2 characters
  string = re.sub(' +', ' ', string) # replaces multiple consecutive white spaces
  string = string.strip().replace("- ", "")
  return string

In [None]:
# Quick check of an e-mail pattern on one sample line.
# The sample is not called `string`, so the `string` module imported at the top
# of the notebook is not overwritten at module level.
sample_text = "e-mail: sjlee@jhmi.edu"
#email = r"([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+"
#email = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
email = r"[^@\s]+@[^@\s]+\.[a-zA-Z0-9]+$"  # the trailing $ only matches an address at the end of the text

print(re.sub(email, "", sample_text))


e-mail: 


In [None]:
# Denoise text
def strip_html(text):
  """Parse `text` with BeautifulSoup's "html.parser" and return the result of get_text()."""
  return BeautifulSoup(text, "html.parser").get_text()

# Removing URLs
def remove_url(text):
  """Delete every run of non-whitespace characters that begins with "http"."""
  url_pattern = re.compile(r'http\S+')
  return url_pattern.sub('', text)

# Removing the noisy text
def denoise_text(text):
  """Apply strip_html, then remove_url, then trim the text and delete every "- " sequence."""
  without_markup = strip_html(text)
  without_links = remove_url(without_markup)
  return without_links.strip().replace("- ", "")

In [None]:
############################# 
#@title NLTK and stopwords
#############################
import nltk
# Download the two NLTK data packages this notebook asks for.
for nltk_resource in ('punkt', 'stopwords'):
  nltk.download(nltk_resource)
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from string import punctuation

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the article metadata table; its Authors and Journal/Book columns are
# turned into stopword lists in the following cells.
metadata_csv = "2022-01-05_csv-gdf11-set.csv"
df = pd.read_csv(metadata_csv)
df.head()

Unnamed: 0,PMID,Title,Authors,Citation,First Author,Journal/Book,Publication Year,Create Date,PMCID,NIHMS ID,DOI
0,34951250,[Preliminary study on cerebrospinal fluid prot...,"Li HZ, Zeng NX, Liu KG, Luo WL, Lu WJ, Wu LL.",Zhongguo Zhong Yao Za Zhi. 2021 Dec;46(23):623...,Li HZ,Zhongguo Zhong Yao Za Zhi,2021,2021/12/24,,,10.19540/j.cnki.cjcmm.20210918.401
1,34905649,Growth differentiation factor 11 accelerates l...,"Sun J, Li Y, Yang X, Dong W, Yang J, Hu Q, Zha...",Aging Cell. 2021 Dec 14:e13532. doi: 10.1111/a...,Sun J,Aging Cell,2021,2021/12/14,,,10.1111/acel.13532
2,34809067,Detection of GDF11 by using a Ti(3)C(2)-MXene-...,"Liu C, Wang R, Shao Y, Chen C, Wu P, Wei Y, Ga...",Opt Express. 2021 Oct 25;29(22):36598-36607. d...,Liu C,Opt Express,2021,2021/11/23,,,10.1364/OE.440585
3,34791251,Gonadal rejuvenation of mice by GDF11,"Zhou Y, Ni S, Li C, Song L, Zhang S.",J Gerontol A Biol Sci Med Sci. 2021 Nov 13:gla...,Zhou Y,J Gerontol A Biol Sci Med Sci,2021,2021/11/18,,,10.1093/gerona/glab343
4,34712387,GDF-11 Protects the Traumatically Injured Spin...,"Xu Y, Hu X, Li F, Zhang H, Lou J, Wang X, Wang...",Oxid Med Cell Longev. 2021 Oct 19;2021:8186877...,Xu Y,Oxid Med Cell Longev,2021,2021/10/29,PMC8548157,,10.1155/2021/8186877


In [None]:
# To generate a list of author names (will be used as stopwords later).
# Each "Authors" cell is lower-cased, split on commas and then on spaces; dots are
# removed and tokens of at most two characters (e.g. initials) are discarded.
authors = []
seen_author_tokens = set()  # constant-time duplicate check instead of rescanning the list
for author in df.Authors.dropna():  # rows with a missing Authors value are skipped
  for name in author.lower().strip().split(","):
    for n in name.split(" "):
      n = n.strip().replace(".", "")
      if len(n) <= 2 or n in seen_author_tokens:
        continue
      seen_author_tokens.add(n)
      authors.append(n)  # first-seen order is preserved

print(f"List size: {len(authors)}")

List size: 1600


In [None]:
# Collect the distinct, lower-cased Journal/Book names (will be used as stopwords later).
# 'development' is kept out of the list on purpose because it is also an important noun.
journals = []
for journal in df['Journal/Book']:
  journal = journal.lower().strip()
  if journal != "development" and journal not in journals:
    journals.append(journal)

print(f"List size: {len(journals)}")

List size: 256


In [None]:
# Stopwords from stopwords-json.
# The list literal is only ever broken after a comma; the entry "trying" is written as one
# string (it was previously split across two physical lines, which is not valid Python).
stopwords_json = {"en":["a","a's","able","about","above","according","accordingly","across","actually","after","afterwards","again","against","ain't","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","aside","ask","asking","associated","at","available","away","awfully","b","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by","c","c'mon","c's","came","can","can't","cannot","cant","cause","causes","certain","certainly","changes","clearly","co","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","currently","d","definitely","described","despite","did","didn't","different","do","does","doesn't","doing","don't","done","down","downwards","during","e","each","edu","eg","eight","either","else","elsewhere","enough","entirely","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","f","far","few","fifth","first","five","followed","following","follows","for","former","formerly","forth","four","from","further","furthermore","g","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't","have","haven't","having","he","he's","hello","help","hence","her","here","here's","hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither","hopefully","how","howbeit","however","i","i'd","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate","indicated","indicates","inner","insofar","instead","into",
"inward","is","isn't","it","it'd","it'll","it's","its","itself","j","just","k","keep","keeps","kept","know","known","knows","l","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","little","look","looking","looks","ltd","m","mainly","many","may","maybe","me","mean","meanwhile","merely","might","more","moreover","most","mostly","much","must","my","myself","n","name","namely","nd","near","nearly","necessary","need","needs","neither","never","nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing","novel","now","nowhere","o","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside","over","overall","own","p","particular","particularly","per","perhaps","placed","please","plus","possible","presumably","probably","provides","q","que","quite","qv","r","rather","rd","re","really","reasonably","regarding","regardless","regards","relatively","respectively","right","s","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","she","should","shouldn't","since","six","so","some","somebody","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying","still","sub","such","sup","sure","t","t's","take","taken","tell","tends","th","than","thank","thanks","thanx","that","that's","thats","the","their","theirs","them","themselves","then","thence","there","there's","thereafter","thereby","therefore","therein","theres","thereupon","these","they","they'd","they'll","they're","they've","think","third","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","to","together","too","took","toward","towards","tried","tries","truly","try","trying",
"twice","two","u","un","under","unfortunately","unless","unlikely","until","unto","up","upon","us","use","used","useful","uses","using","usually","uucp","v","value","various","very","via","viz","vs","w","want","wants","was","wasn't","way","we","we'd","we'll","we're","we've","welcome","well","went","were","weren't","what","what's","whatever","when","whence","whenever","where","where's","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","who's","whoever","whole","whom","whose","why","will","willing","wish","with","within","without","won't","wonder","would","wouldn't","x","y","yes","yet","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","z","zero"]}
stopwords_json_en = set(stopwords_json['en'])
# Hard-coded English stopword list (named as a modified NLTK list).
stopwords_nltk_en_mod = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
#stopwords_punct = set(punctuation)

# Combine the stopwords into one set (punctuation is currently not included)
stop = stopwords_json_en.union(stopwords_nltk_en_mod) # stopwords_punct

# Add new stopwords - just include new stopwords in the list as needed.
# NOTE(review): the stopword-removal cell compares single space-separated tokens against
# `stop`, so multi-word entries such as "et al." or "science journals" cannot match there.
new_stops = [
             "manuscript", "articles", "article", "author", "authors", "et al.", "al.", "doi", "doi:", "print", "online", "journal",
             "title", "abstract", "introduction", "materials", "methods", "results", "discussion", "acknowledgements",
             "funding", "references", "review", "letter", "commentary", "supplementary material",
             "figures", "figure", "fig", "page", "pages","volume", "vol.", "vol.:", "untitled", "tables", "table", 
             "january", "february","march", "april", "may", "june", "july", "august", "september","october","november","december",
             "suggest", "may", "within", "thus", "med", "whether", "transl", "also", "correspondence", "to",
             "science journals", "aaas", "eaat3504", "microsoft", "sci-hub", "e-mail:", "grant", "grants",
             # a comma was missing after "full peer", which silently produced the single entry "full peercopyediting"
             "copyright", "accepted", "publication", "full peer", "copyediting", "typesetting", "pagination", "proofreading",
             "©", "society", "rights", "data shown", "subscriptions", "reprints", "document", "permission", "permissions", "request", "requests", "requested",
             "web", "services", "office", "published", "click", "rightslink", "editorial", "protected copyright", "protected copyright reserved",
             "word", "307521dr1_pap_10_21_15", "10.1161/circresaha.115.307521", "addressed", "to", "id",
             "roche", "invitrogen"
             ]

# Add the new stop words, the author names and the journal names to stop
stop.update(new_stops)
stop.update(authors)
stop.update(journals)

print(f"List size: {len(stop)} words")

List size: 2549 words


In [None]:
# To record, for each paper, whether the words 'introduction' and 'references'
# occur in its cleaned text (1 = present, 0 = absent)
counter = {}
for key, text in tqdm(gdf11_dict.items()):
  text = cleaning_text(text)
  counter[key] = {
      "introduction": int("introduction" in text),
      "references": int("references" in text),
  }

# One row per paper, one column per marker word
counter_df = pd.DataFrame.from_dict(counter).T
counter_df.head()
#counter_df.to_excel("counter.xlsx")

# 232 papers have both words 'introduction' and 'references' (69.25%)
# 239 papers have the word 'introduction' (71.34%)
# 300 papers have the word 'references' (89.55%)
# Conclusion: it makes sense to use find_between() function, but I also have to consider the remaining papers

100%|██████████| 334/334 [00:03<00:00, 100.42it/s]


Unnamed: 0,introduction,references
10320,0,0
dbio.1998.9191,1,1
dbio.2000.9926,1,1
CIRCRESAHA.115.307521,1,1
AD.2019.0610,0,1


In [None]:
# To create a dictionary with cleaned texts.
# The three cases are mutually exclusive (if / elif / else). Previously the first branch was a
# separate `if`, so its result was always overwritten by the `else` of the second one.
cleaned_dict = {}
for key, text in tqdm(gdf11_dict.items()):
  text = cleaning_text(text)
  text = denoise_text(text)
  has_introduction = "introduction" in text
  has_references = "references" in text
  if has_introduction and has_references:
    body = find_between(text, "introduction", "references") # Get text between 'introduction' and 'references'
    # find_between returns "" when no 'references' follows the first 'introduction';
    # keep the whole text in that case instead of storing an empty document
    cleaned_dict[key] = body if body else text
  elif has_references:
    cleaned_dict[key] = text.partition("references")[0] # Get text before 'references'
  else:
    cleaned_dict[key] = text

100%|██████████| 334/334 [00:04<00:00, 75.59it/s]


In [None]:
# To remove stopwords: each text is split on single spaces and every token
# found in `stop` is dropped before the remaining tokens are re-joined
cleaned_dict_stop = {
    key: " ".join(token for token in text.split(" ") if token not in stop)
    for key, text in tqdm(cleaned_dict.items())
}

100%|██████████| 334/334 [00:00<00:00, 558.10it/s]


In [None]:
# To visualize a few examples: print the first five cleaned texts, then stop
count = 0
for count, (key, text) in enumerate(tqdm(cleaned_dict.items()), start=1):
  print(f"{key}: {text}")
  if count == 5:
    break

  1%|          | 4/334 [00:00<00:00, 7887.74it/s]

10320: letter 260 nature genetics volume 22 july 1999 regulation of anterior/posterior patterning of the axial skeleton by growth/differentiation factor 11 alexandra c. mcpherron1, ann m. lawler2 & se-jin lee1 departments of 1molecular biology and genetics and 2gynecology and obstetrics, johns hopkins university school of medicine, 725 n. wolfe street, baltimore, maryland 21205, usa. correspondence should be addressed to s.-j.l. (e-mail: ). the bones that comprise the axial skeleton have distinct morphological features characteristic of their positions along the anterior/posterior axis. we previously described a novel tgf-β family member, myostatin (encoded by the gene mstn, formerly gdf8), that has an essential role in regulating skeletal muscle mass1. we also identified a gene related to mstn by low-stringency screening1. while the work described here was being completed, the cloning of this gene, designated gdf11 (also called bmp11), was also reported by other groups2,3. here we sho




In [None]:
# To visualize a few examples - stopwords were removed (first five entries only)
count = 0
for count, (key, text) in enumerate(tqdm(cleaned_dict_stop.items()), start=1):
  print(f"{key}: {text}")
  if count == 5:
    break

  1%|          | 4/334 [00:00<00:00, 6091.94it/s]

10320: 260 genetics 22 1999 regulation anterior/posterior patterning axial skeleton growth/differentiation factor 11 alexandra c. mcpherron1, ann m. lawler2 & se-jin lee1 departments 1molecular biology genetics 2gynecology obstetrics, johns university school medicine, 725 n. wolfe street, baltimore, maryland 21205, usa. s.-j.l. (e-mail: ). bones comprise axial skeleton distinct morphological features characteristic positions anterior/posterior axis. previously tgf-β family member, myostatin (encoded mstn, gdf8), essential role regulating skeletal muscle mass1. identified related mstn low-stringency screening1. work completed, cloning gene, designated gdf11 (also called bmp11), reported groups2,3. show gdf11, transforming growth factor β (tgfβ) superfamily member, important role establishing skeletal pattern. early mouse embryogenesis, gdf11 expressed primitive streak tail bud regions, sites mesodermal cells generated. homozygous mutant mice carrying targeted deletion gdf11 exhibit ante




In [None]:
############################# 
#@title Sentence tokenization
#############################

# To create a dictionary with tokenized sentences
sent_dict = {}
for key, text in tqdm(cleaned_dict.items()):
  sent_dict[key] = sent_tokenize(text)

# Save dictionary to json file; the context manager closes the file even if json.dump raises
with open("gdf11-dictionary-sent-tokens.json", "w", encoding="utf-8") as gdf11_sent:
  json.dump(sent_dict, gdf11_sent)

100%|██████████| 334/334 [00:06<00:00, 48.45it/s]


In [None]:
# To create a dictionary with tokenized sentences without stopwords
sent_dict_stop = {}
for key, text in tqdm(cleaned_dict_stop.items()):
  sent_dict_stop[key] = sent_tokenize(text)

# Save dictionary to json file; the context manager closes the file even if json.dump raises
with open("gdf11-dictionary-sent-tokens-stop.json", "w", encoding="utf-8") as gdf11_sent_stop:
  json.dump(sent_dict_stop, gdf11_sent_stop)

100%|██████████| 334/334 [00:06<00:00, 49.64it/s]


In [None]:
############################# 
#@title Word tokenization
#############################

# To create a dictionary with tokenized words
word_dict = {}
for key, text in tqdm(cleaned_dict.items()):
  word_dict[key] = word_tokenize(text)

# Save dictionary to json file; the context manager closes the file even if json.dump raises
with open("gdf11-dictionary-word-tokens.json", "w", encoding="utf-8") as gdf11_word:
  json.dump(word_dict, gdf11_word)

100%|██████████| 334/334 [00:24<00:00, 13.85it/s]


In [None]:
# To create a dictionary with tokenized words without stopwords
word_dict_stop = {}
for key, text in tqdm(cleaned_dict_stop.items()):
  word_dict_stop[key] = word_tokenize(text)

# Save dictionary to json file; the context manager closes the file even if json.dump raises
with open("gdf11-dictionary-word-tokens-stop.json", "w", encoding="utf-8") as gdf11_word_stop:
  json.dump(word_dict_stop, gdf11_word_stop)

100%|██████████| 334/334 [00:20<00:00, 15.92it/s]


In [None]:
# count = 0
# for key, text in tqdm(word_dict_stop.items()):
#   print(f"{key}: {text}")
#   count += 1
#   if count == 5:
#     break