# PDF text extraction tool
Author: Roald Teunissen

## Libraries

In [1]:
import PyPDF2
import pandas as pd
import os
import re, string
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

## Properties

In [2]:
# Root data folder, resolved relative to the directory the notebook kernel runs in
data_dir = os.path.join(os.getcwd(), '../../data/')

# Folder with one sub-folder per topic, each holding the source PDF files
PAPERS_DIR = os.path.join(data_dir, 'external', 'papers')
# NOTE: despite the *_DIR names, the two constants below are CSV *file* paths
PAPERS_DATA_DIR__RAW = os.path.join(data_dir, 'raw', 'papers_raw.csv')
PAPERS_DATA_DIR__PROCESSED = os.path.join(data_dir, 'processed', 'papers_processed.csv')

# 1. Extract content from PDF and save as raw

## Build dataset based on files in *directory*

In [3]:
raw_paper_data = []

# Loop through all topic folders (sorted so the row order is reproducible,
# os.listdir returns entries in arbitrary order)
for topic in sorted(os.listdir(PAPERS_DIR)):
    topic_dir = os.path.join(PAPERS_DIR, topic)

    # Skip stray files that sit next to the topic folders;
    # calling os.listdir on them would raise NotADirectoryError
    if not os.path.isdir(topic_dir):
        continue

    # Loop individual files
    for filename in sorted(os.listdir(topic_dir)):
        file_dir = os.path.join(topic_dir, filename)
        title, extension = os.path.splitext(filename)

        # Only handle regular PDF files; anything else cannot be parsed by PdfReader
        if not os.path.isfile(file_dir) or extension.lower() != '.pdf':
            continue

        # The context manager closes the handle even when parsing fails.
        # Text is extracted inside the block because the reader pulls page
        # data from the open stream.
        with open(file_dir, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # `or ''` guards against a page yielding no text object at all
            text = ''.join(page.extract_text() or '' for page in pdf_reader.pages)

        # The title is the file name without its extension
        raw_paper_data.append({'title': title, 'topic': topic, 'content': text})

df_papers = pd.DataFrame(raw_paper_data, columns=['title', 'topic', 'content'])

## Explore

In [4]:
# Per-column summary (count / unique / top / freq) as a quick sanity check of the extraction
df_papers.describe()

Unnamed: 0,title,topic,content
count,66,66,66
unique,66,2,65
top,1D N-type SnO2 nanofibers coexisted with P-typ...,chemistry,"\n 263 Psychiatria Danubina, 2022; Vol. 34, N..."
freq,1,33,2


In [7]:
# Distinct topic labels, i.e. the names of the sub-folders found in PAPERS_DIR
df_papers['topic'].unique()

array(['chemistry', 'life sciences and biomedicine'], dtype=object)

In [6]:
# Peek at the first rows to eyeball the extracted titles and content
df_papers.head()

Unnamed: 0,title,topic,content
0,1D N-type SnO2 nanofibers coexisted with P-typ...,chemistry,Content from this work may be used under the t...
1,A reconfigurable integrated electronic tongue ...,chemistry,arXiv:2205.15018v1 [cs.LG] 27 May 2022\n© 20...
2,AMixedMethods Research Agenda to Identify Unde...,chemistry,A Mixed Methods Research Agenda to Identify Un...
3,"Computational study of Cu2+, Fe2+, Fe3+, Mn2+ ...",chemistry,Content from this work may be used under the t...
4,Corrosion inhibitory properties of La0.5Ca0.5M...,chemistry,Content from this work may be used under the t...


## Save raw data

In [9]:
# to_csv does not create missing folders, so make sure the target folder exists first
os.makedirs(os.path.dirname(PAPERS_DATA_DIR__RAW), exist_ok=True)
df_papers.to_csv(PAPERS_DATA_DIR__RAW, index=False)

********

# 2. Data cleaning

In [11]:
# Reload the raw dataset from disk so this section also runs on a fresh kernel.
# Note: this always replaces any df_papers that is still in memory.
if not os.path.isfile(PAPERS_DATA_DIR__RAW):
    raise FileNotFoundError('Raw papers dataset is not found')

# title / topic / content are all text columns: read them as strings and keep
# empty cells as '' instead of NaN. NaN is a float and has no string methods,
# so a paper without extractable text would otherwise break the text clean-up.
df_papers = pd.read_csv(PAPERS_DATA_DIR__RAW, dtype=str, keep_default_na=False)

### Pre-processing

Common text preprocessing

In [12]:
# Normalise raw text: lowercase, drop markup, citation markers, punctuation and digits
def preprocess(text):
    """Return a lowercase, letters-only version of ``text`` separated by single spaces.

    Steps, in order:
      1. lowercase and strip surrounding whitespace
      2. remove ``<...>`` tag-like spans
      3. remove bracketed numeric citation markers such as ``[12]``
      4. replace ASCII punctuation with a space
      5. drop any remaining character that is neither a word character nor whitespace
      6. replace digits with a space
      7. collapse whitespace runs and strip the result

    Parameters
    ----------
    text : str, None or NaN
        Text to clean. ``None`` and NaN (what pandas yields for an empty CSV
        cell) are treated as empty text; any other value is converted with ``str``.

    Returns
    -------
    str
        The cleaned text, without leading or trailing whitespace.
    """
    # Missing values: NaN is the only float that is not equal to itself
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower().strip()

    # Remove HTML tags/markup. NOTE(review): on PDF text this also deletes
    # anything between a literal '<' and the next '>' on the same line
    # (e.g. inequalities) - confirm this is intended.
    text = re.sub(r'<.*?>', '', text)

    # Citation markers like [12]; must run BEFORE the punctuation step,
    # otherwise the brackets are already gone and the pattern never matches
    text = re.sub(r'\[[0-9]*\]', ' ', text)

    # Replace ASCII punctuation with a space so neighbouring words stay separated.
    # Careful: punctuation can sometimes carry meaning.
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)

    # Drop remaining symbols (e.g. non-ASCII punctuation) without inserting a space
    text = re.sub(r'[^\w\s]', '', text)

    # \d matches a single digit character; every digit becomes a space
    text = re.sub(r'\d', ' ', text)

    # \s+ matches a run of whitespace (spaces, tabs, newlines); collapse each run
    # to one space and trim the edges left behind by the replacements above
    text = re.sub(r'\s+', ' ', text).strip()

    return text

Lexicon-based text processing

In [13]:
# Stopword removal
def stopword(string):
    """Return ``string`` with English stopwords removed.

    The text is split on whitespace, every token that appears in NLTK's
    English stopword list is dropped, and the remaining tokens are joined
    with single spaces. Matching is case-sensitive, exactly as the tokens
    appear in ``string``.

    Note: the parameter name shadows the ``string`` module inside this
    function; it is kept so existing keyword callers keep working.
    """
    # Load the list once per call and use a set for O(1) membership tests,
    # instead of re-reading and linearly scanning the list for every token
    english_stopwords = set(stopwords.words('english'))
    kept_tokens = [token for token in string.split() if token not in english_stopwords]
    return ' '.join(kept_tokens)

# Stemming
snow = SnowballStemmer('english')
def stemming(string):
    """Tokenize ``string`` and return the Snowball stem of each token, space-joined."""
    stems = map(snow.stem, word_tokenize(string))
    return " ".join(stems)

# Lemmatization
wl = WordNetLemmatizer()

# Helper that maps NLTK part-of-speech tags onto WordNet POS constants
# Full tag list: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
def get_wordnet_pos(tag):
    """Return the WordNet POS constant matching the first letter of ``tag``.

    ``J`` -> adjective, ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb;
    every other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Unknown tag families are treated as nouns
    return wordnet.NOUN

# Lemmatize every token using its part-of-speech tag
def lemmatizer(string):
    """Tokenize ``string``, POS-tag the tokens and return their lemmas, space-joined."""
    tagged_tokens = nltk.pos_tag(word_tokenize(string))  # list of (token, tag) pairs
    lemmas = []
    for token, pos_tag in tagged_tokens:
        # Translate the tag to its WordNet equivalent before lemmatizing
        lemmas.append(wl.lemmatize(token, get_wordnet_pos(pos_tag)))
    return " ".join(lemmas)

# Final processing step
def finalpreprocess(string):
    """Run the full pipeline on ``string``: preprocess -> stopword -> lemmatizer.

    Stemming is not part of this pipeline.
    """
    cleaned = preprocess(string)
    without_stopwords = stopword(cleaned)
    return lemmatizer(without_stopwords)

In [15]:
# Run the full cleaning pipeline over the content of every paper
# (overwrites the 'content' column in place)
df_papers['content'] = df_papers['content'].apply(finalpreprocess)

In [16]:
# to_csv does not create missing folders, so make sure the target folder exists first
os.makedirs(os.path.dirname(PAPERS_DATA_DIR__PROCESSED), exist_ok=True)
df_papers.to_csv(PAPERS_DATA_DIR__PROCESSED, index=False)