# PubMed Papers: Preprocessing Cleaned Data for NLP Analysis

In [14]:
import pandas as pd
import numpy as np


#nlp
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer, WhitespaceTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction import text
import re

import warnings
warnings.filterwarnings('ignore')

In [15]:
#nltk.download('stopwords')

In [16]:
#pull in the cleaned data with no outliers
filename = '../data/pubmed_cleaned_no_outliers.csv'

In [17]:
df = pd.read_csv(filename).drop(columns = 'Unnamed: 0')

In [18]:
df.set_index('pmid', inplace=True)

In [19]:
df.columns

Index(['title', 'text', 'date', 'citations', 'len_text', 'len_title',
       'days_live'],
      dtype='object')

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2809 entries, 22627698 to 33291844
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      2809 non-null   object
 1   text       2809 non-null   object
 2   date       2809 non-null   object
 3   citations  2809 non-null   int64 
 4   len_text   2809 non-null   int64 
 5   len_title  2809 non-null   int64 
 6   days_live  2809 non-null   int64 
dtypes: int64(4), object(3)
memory usage: 175.6+ KB


In [22]:
df['text'][22627698]

"Applying Fourier-transform infrared (FTIR) spectroscopy (or related technologies such as Raman spectroscopy) to biological questions (defined as biospectroscopy) is relatively novel. Potential fields of application include cytological, histological and microbial studies. This potentially provides a rapid and non-destructive approach to clinical diagnosis. Its increase in application is primarily a consequence of developing instrumentation along with computational techniques. In the coming decades, biospectroscopy is likely to become a common tool in the screening or diagnostic laboratory, or even in the general practitioner's clinic. Despite many advances in the biological application of FTIR spectroscopy, there remain challenges in sample preparation, instrumentation and data handling. We focus on the latter, where we identify in the reviewed literature, the existence of four main study goals: Pattern Finding; Biomarker Identification; Imaging; and, Diagnosis. These can be grouped in

#### Combine Title & Abstract

In [23]:
df['fulltext'] = (df['title'] + ' ' + df['text'])

In [24]:
df = df.drop(columns = ['title', 'text'])

In [25]:
df.head()

Unnamed: 0_level_0,date,citations,len_text,len_title,days_live,fulltext
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
22627698,2012-07-01,35,1780,165,3233,Extracting biological information with computa...
22952238,2012-09-01,4,793,81,3171,Uncovering transcription factor modules using ...
22944687,2012-09-01,19,1698,64,3171,Understanding the substrate specificity of con...
22075226,2012-09-01,12,1024,42,3171,Membrane protein structural bioinformatics Des...
23012584,2012-09-01,10,935,60,3171,The future of medical diagnostics: large digit...


#### Remove Punctuation & Make Lowercase

In [26]:
df['fulltext'] = df['fulltext'].map(lambda x: re.sub("[''·,\.!?'-]", '', x).lower())

#### Tokenize Text

In [27]:
w_tokenizer = WhitespaceTokenizer()

In [28]:
df['fulltext'] = df['fulltext'].apply(lambda x: w_tokenizer.tokenize(x))

In [29]:
df.head()

Unnamed: 0_level_0,date,citations,len_text,len_title,days_live,fulltext
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
22627698,2012-07-01,35,1780,165,3233,"[extracting, biological, information, with, co..."
22952238,2012-09-01,4,793,81,3171,"[uncovering, transcription, factor, modules, u..."
22944687,2012-09-01,19,1698,64,3171,"[understanding, the, substrate, specificity, o..."
22075226,2012-09-01,12,1024,42,3171,"[membrane, protein, structural, bioinformatics..."
23012584,2012-09-01,10,935,60,3171,"[the, future, of, medical, diagnostics:, large..."


#### Remove Special Characters 

In [30]:
df['fulltext'] = df['fulltext'].replace('[^\w]','',regex=True)

In [31]:
df.head()

Unnamed: 0_level_0,date,citations,len_text,len_title,days_live,fulltext
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
22627698,2012-07-01,35,1780,165,3233,"[extracting, biological, information, with, co..."
22952238,2012-09-01,4,793,81,3171,"[uncovering, transcription, factor, modules, u..."
22944687,2012-09-01,19,1698,64,3171,"[understanding, the, substrate, specificity, o..."
22075226,2012-09-01,12,1024,42,3171,"[membrane, protein, structural, bioinformatics..."
23012584,2012-09-01,10,935,60,3171,"[the, future, of, medical, diagnostics:, large..."


#### Remove English Stopwords

In [32]:
stop_words = set(stopwords.words('english')) 

In [33]:
def remove_stopwords(text):
    return [word for word in text if word not in stop_words]

In [34]:
df['fulltext'] = df['fulltext'].apply(remove_stopwords)

#### Lemmatize

Uncomment this section to save the file with lemmatized text.

In [35]:
# def lemmatize_text(text):
#     lemmatizer = WordNetLemmatizer()
#     return [lemmatizer.lemmatize(w) for w in text]

In [36]:
# for col in text_cols:
#     df[col] = df[col].apply(lemmatize_text)

In [37]:
# save as lemmatized
# output_filename = 'data/nlp_nltk_lemmatized_preproc.csv'

#### Porter Stemmer
Uncomment this section to save the file with stemmed text.

In [38]:
def stem_text(text):
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in text]

In [39]:
df['fulltext'] = df['fulltext'].apply(stem_text)

In [40]:
output_filename = '../data/pubmed_nltk_stemmed_preproc.csv'

In [42]:
print(df['fulltext'][22627698])

['extract', 'biolog', 'inform', 'comput', 'analysi', 'fouriertransform', 'infrar', '(ftir)', 'biospectroscopi', 'datasets:', 'current', 'practic', 'futur', 'perspect', 'appli', 'fouriertransform', 'infrar', '(ftir)', 'spectroscopi', '(or', 'relat', 'technolog', 'raman', 'spectroscopy)', 'biolog', 'question', '(defin', 'biospectroscopy)', 'rel', 'novel', 'potenti', 'field', 'applic', 'includ', 'cytolog', 'histolog', 'microbi', 'studi', 'potenti', 'provid', 'rapid', 'nondestruct', 'approach', 'clinic', 'diagnosi', 'increas', 'applic', 'primarili', 'consequ', 'develop', 'instrument', 'along', 'comput', 'techniqu', 'come', 'decad', 'biospectroscopi', 'like', 'becom', 'common', 'tool', 'screen', 'diagnost', 'laboratori', 'even', 'gener', 'practition', 'clinic', 'despit', 'mani', 'advanc', 'biolog', 'applic', 'ftir', 'spectroscopi', 'remain', 'challeng', 'sampl', 'prepar', 'instrument', 'data', 'handl', 'focu', 'latter', 'identifi', 'review', 'literatur', 'exist', 'four', 'main', 'studi', 'g

#### No Stemming
Uncomment this section to save the file without any lemmatization or stemmed. 

In [None]:
# #output filename for no lemmatizing or stemming
# output_filename = '../data/pubmed_nltk_nostem_preproc_strings.csv'

In [None]:
# df['fulltext'] = [' '.join(x) for x in df['fulltext']]

#### Save Pre-Processed File

In [57]:
df.head()

Unnamed: 0_level_0,date,citations,len_text,len_title,days_live,fulltext
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
22627698,2012-07-01,35,1780,165,3233,"[extract, biolog, inform, comput, analysi, fou..."
22952238,2012-09-01,4,793,81,3171,"[uncov, transcript, factor, modul, use, one, t..."
22944687,2012-09-01,19,1698,64,3171,"[understand, substrat, specif, convent, calpai..."
22075226,2012-09-01,12,1024,42,3171,"[membran, protein, structur, bioinformat, desp..."
23012584,2012-09-01,10,935,60,3171,"[futur, medic, diagnostics:, larg, digit, data..."


In [58]:
df.to_csv(output_filename)