# FIT5196 Assessment 2
## Text Pre-Processing & Feature Generation

#### Student Name: Akshatha Shivashankar Chindalur
#### Student ID: 29996503
#### Student Name: Pradnya Alchetti
#### Student ID: 29595916

Date: 02/09/2019

Version: 1.0

Environment: Python 3.7.11 and Jupyter notebook

Libraries used:
* pdfminer (for extracting the text from PDF files)
* nltk (for sentence segmentation, tokenisation, collocations and stemming)
* re (for regular expressions)
* requests (for downloading the papers from their URLs)
* pandas (for building the statistics dataframe and writing it to CSV)

## Task 1: Generating a sparse matrix for Paper Bodies

### Importing the libraries necessary for generation of the sparse matrix.

In [14]:
# The required libraries are imported below.

import os
from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from nltk.tokenize import RegexpTokenizer 
from nltk.tokenize import MWETokenizer
import re
import requests
import nltk
import nltk.data
import pandas as pd
nltk.download('punkt')
from itertools import chain
from functools import partial
from nltk.probability import *

[nltk_data] Downloading package punkt to /home/pradnya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Generating the corpus of papers as PDF files.

The given dataset containing 200 URLs of published papers is read from a PDF file. From these URLs, the corresponding papers are downloaded into the **papers** directory (folder).

In [2]:
#   This function downloads a paper from its URL and stores it as a PDF file in the
#   directory (folder) papers. The name of the file is taken from the
#   'content-disposition' header of the HTTP response.
#
#   :param url:    the URL (link) of the website from which the published paper needs to be downloaded.
#
#   :return:   nothing; the downloaded bytes are written to 'papers/<file name>'.
#
#   :raises requests.HTTPError:   when the server answers with an error status code.
#   :raises ValueError:           when the response does not provide a usable file name.

def write_to_pdf(url):
    # a timeout keeps a single unresponsive server from hanging the whole download loop.
    request_pdf = requests.get(url, timeout=60)

    # fail loudly on 4xx/5xx answers instead of saving an error page as a PDF.
    request_pdf.raise_for_status()

    # a missing header is treated like a header without a file name.
    name = request_pdf.headers.get('content-disposition', '')
    f_name = re.search("filename=\"(.*)\"", name)
    if f_name is None:
        raise ValueError('no file name found in the content-disposition header of ' + url)

    # the header is controlled by the remote server: only the last path component is kept
    # so that the file can never be written outside the papers directory.
    safe_name = os.path.basename(f_name.group(1))
    if not safe_name:
        raise ValueError('empty file name in the content-disposition header of ' + url)

    with open(os.path.join('papers', safe_name), 'wb') as f_pdf:
        f_pdf.write(request_pdf.content)

In [3]:
#   This function extracts the text contents from the given PDF file with pdfminer.
#
#   :param pdf_path:    the path of the PDF file (paper) from which the data needs to be extracted.
#
#   :return contents:   a string (contents of the paper) obtained after processing the PDF file.

def retrieve_from_pdf(pdf_path):

    resource_manager = PDFResourceManager()

    # in-memory buffer into which the converter writes the extracted text.
    text_buffer = StringIO()
    converter = TextConverter(resource_manager, text_buffer, codec='utf-8', laparams=LAParams())

    try:
        pdf_interpreter = PDFPageInterpreter(resource_manager, converter)

        # the 'with' block closes the PDF file even when a page fails to parse.
        with open(pdf_path, 'rb') as fp:
            # an empty page-number set and maxpages=0 are passed, i.e. no page
            # restriction is requested.
            pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True)
            for page in pages:
                pdf_interpreter.process_page(page)

        contents = text_buffer.getvalue()
    finally:
        # the converter and the buffer are released on success and on failure alike.
        converter.close()
        text_buffer.close()

    return contents

#### Step 1: Obtain 200 URLs from the given dataset.

In [None]:
# the text of the given dataset PDF is extracted; the paper URLs are searched for in
# this text in the next step.
dataset = retrieve_from_pdf('./Group113.pdf')

#### Step 2: Download papers from their respective URLs as PDF files.  

In [None]:
# make sure the download directory exists; exist_ok avoids the check-then-create race
# and makes the cell safe to run again.
os.makedirs('./papers', exist_ok=True)

# a URL starts with 'https:' and runs up to the next whitespace character, so trailing
# blanks or page-break characters from the PDF extraction do not end up in the URL.
urls = re.findall(r'https:\S+', dataset)

for each_url in urls:
    write_to_pdf(each_url)

### Sparse Feature Generation

#### 1. Obtain the bodies of the 200 papers in the corpus.

In [4]:
#   This function retrieves the part of the paper that is captured by the first group of the
#   given regular expression (for instance the body, ignoring the title, author and references).
#   It also performs certain pre-processing steps that clean up the text obtained from the
#   PDF extractor.
#
#   :param pdf_file:    the PDF file (or paper) that needs to be parsed.
#
#   :param regex:       a regular expression whose first group captures the wanted content.
#
#   :return paper_content:   a string of the captured content after the initial cleansing process.
#
#   :raises ValueError:      when the regular expression does not match the text of the PDF file.

def get_paper_content(pdf_file, regex):

    # the contents from the PDF file is obtained.
    pdf_text = retrieve_from_pdf(pdf_file)

    # from the processed PDF file, only the content that matches the regular
    # expression is retrieved. A paper that does not follow the expected layout is
    # reported by name instead of failing with an attribute error on None.
    section = re.search(regex, pdf_text)
    if section is None:
        raise ValueError('the pattern ' + repr(regex) + ' did not match the text of ' + str(pdf_file))
    paper_content = section.group(1)

    # some of the words continue onto the next line. These words contain a '-' between them,
    # which can be identified with the regular expression '-\n'. This is replaced with an
    # empty string such that the word is whole again. For instance, the word 'man-\nually'
    # becomes 'manually' and the word 'compar-\nisons' becomes 'comparisons'.
    paper_content = re.sub(r'-\n', '', paper_content)

    # every new page begins with the special character '\x0c', which is removed.
    paper_content = re.sub(r'\x0c', '', paper_content)

    # the page numbers can be identified with the regular expression '\n(\d+)\n\n'
    # and are removed.
    paper_content = re.sub(r'\n(\d+)\n\n', '', paper_content)

    # all the remaining new line characters are replaced with a space.
    paper_content = re.sub(r'\n', ' ', paper_content)

    # lastly, single or multiple inline references such as [1] or [22, 23] are identified
    # with the regular expression '(\s\[(\d+)(,\s*\d+)*\])' and removed.
    paper_content = re.sub(r'(\s\[(\d+)(,\s*\d+)*\])', '', paper_content)

    return paper_content

#### 2. Sentence Segmentation: the first token of every sentence is normalised to lower case, while tokens appearing in the middle of a sentence keep their case.

In [5]:
#   This function splits the given string of text into sentences with the English
#   punkt model shipped with NLTK.
#
#   :param text:    the pre-processed body of the paper from the downloaded PDF files.
#
#   :return sentences:   a list of strings - each a sentence from the given text.

def get_sentences(text):

    punkt_model = nltk.data.load('tokenizers/punkt/english.pickle')

    # leading and trailing whitespace is dropped before the text is segmented.
    return punkt_model.tokenize(text.strip())

#   This function segments the text into sentences and lower-cases the first character
#   of every sentence. All the other characters (the tokens in the middle of a
#   sentence) are left untouched.
#
#   :param raw_text:    the cleansed body of the paper after being extracted from
#                       its respective PDF file.
#
#   :return normalise_sentence:   a list of strings - one per sentence, each starting
#                                 with a lower case character.

def case_normalisation(raw_text):

    # the first occurrence of a sentence's first character is the character itself, so
    # the sentence is rebuilt from its lower-cased first character and the remainder.
    return [sentence[0].lower() + sentence[1:] for sentence in get_sentences(raw_text)]

#### 3. Word Tokenization: using the regular expression "[A-Za-z]\w+(?:[-'?]\w+)?"

In [6]:
#   This function splits the text into tokens that match the regular expression
#   "[A-Za-z]\w+(?:[-'?]\w+)?".
#
#   :param text:    the text which needs to be tokenized.
#
#   :return tokens:   a list of unigram tokens (strings).

def get_tokens(text):

    return RegexpTokenizer(r"[A-Za-z]\w+(?:[-'?]\w+)?").tokenize(text)

#   This function generates the list of unigram tokens of one document (or paper). Only
#   the tokens that consist of alphabetic characters alone are kept.
#
#   :param normalised_text:    the sentences of the document (an iterable of strings).
#
#   :return doc_tokens:   a list of alphabetic tokens for the respective document.

def tokenize(normalised_text):

    # the sentences are tokenised one after another; a token that contains anything but
    # letters (a digit, a hyphen, an apostrophe, ...) is dropped by the isalpha() check.
    return [
        token
        for sentence in normalised_text
        for token in get_tokens(sentence)
        if token.isalpha()
    ]

In order to generate the sparse matrix, each document in the corpus first has to be tokenised individually. This is done with a pool of 4 worker processes running in parallel, which speeds up the tokenization process.

In [7]:
#   This function extracts the body of a paper, segments it into sentences, normalises
#   the case of the sentence starts and generates the corresponding list of tokens.
#
#   :param file_path:    the path of the PDF file that needs to be processed.
#
#   :return filename:    the name of the file just processed (used as a key)
#           tokens_list: list of tokens obtained by processing this file.

def tokenize_paper_body_from_docs(file_path):

    # obtain the name of the document being parsed
    name_of_file = os.path.basename(file_path)

    # get only the body of the paper for feature extraction. The regular expression
    # captures the group between 'Paper Body' and '<digit> References'. It is written as
    # a raw string so that '\s', '\S' and '\d' reach the regex engine without relying on
    # invalid string escape sequences.
    paper_body = get_paper_content(file_path, r'Paper Body([\s\S]*)\d\sReferences')

    # the first character of every sentence is normalised to lower case and the
    # sentences are then turned into the list of tokens of the document.
    list_tokens = tokenize(case_normalisation(paper_body))

    return name_of_file, list_tokens
    
import multiprocessing as mp
import glob

# building a pool of 4 worker processes. The pool is bound to a notebook-level name
# because the later cells for the abstracts, titles and authors map over it again.
pool = mp.Pool(processes = 4) 

# obtaining the list of PDF file paths from the 'papers' directory.
filenames = glob.glob('./papers/*.pdf')

# dictionary of tokenised documents with key as the file name and value as the list
# of tokens. Every worker returns a (file name, tokens) pair, which dict() turns into
# one entry per paper.
tokenized_data = dict(pool.map(tokenize_paper_body_from_docs, filenames))

Process ForkPoolWorker-2:
Process ForkPoolWorker-3:
Process ForkPoolWorker-4:
Process ForkPoolWorker-1:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/

In [8]:
# generating one corpus of tokens from all the documents in the 'papers' directory:
# the token lists of the documents are concatenated in dictionary order.

combine_tokens = [token for doc_tokens in tokenized_data.values() for token in doc_tokens]

len(combine_tokens)

757220

In [9]:
# number of tokenised documents (one dictionary entry per paper).
len(tokenized_data)

200

#### 4. Bigrams: extracting 200 meaningful bigrams from the above generated token corpus.

In [None]:
# a collocation finder is built from the combined token corpus and the 300 best
# bigrams according to pointwise mutual information (PMI) are requested from it.
# NOTE(review): the heading of this step mentions 200 bigrams while 300 candidates are
# requested here; the candidates are filtered against the stop word list in a later
# cell - confirm the intended final number.
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_words(combine_tokens)
bigrams = finder.nbest(bigram_measures.pmi, 300)
bigrams

Generate context independent stop words

In [10]:
# This function reads the given stop word file 'stopwords_en.txt' (one entry per line)
# and returns its lines as a set of strings.
def get_stopwords():
    with open('stopwords_en.txt', 'r') as stop_word_file:
        stop_word_lines = stop_word_file.read().splitlines()

    return set(stop_word_lines)
    

In [None]:
stop = get_stopwords()

# a bigram is kept only when neither of its two words is a stop word.
bigrams = [pair for pair in bigrams if not (pair[0] in stop or pair[1] in stop)]

len(bigrams)

Generate a corpus of words by collecting the tokens of all the documents.

In [None]:
# the token lists of all the documents are concatenated into one flat list.
corpus_of_pdfs = [token for doc_tokens in tokenized_data.values() for token in doc_tokens]
len(corpus_of_pdfs)

In [None]:
#   This function removes the tokens with a length of less than 3 characters.
#
#   :param token_list:    the list of tokens (strings) to be filtered. The list is
#                         modified in place.
#
#   :return token_list:   the same list object, now holding only the tokens with
#                         3 or more characters in their original order.

def remove_tokens_less_than_len_3(token_list):
    # removing elements from a list while looping over it skips the element that follows
    # every removal, so consecutive short tokens used to survive. The filtered tokens
    # are built first and then written back through a slice assignment, which keeps the
    # in-place behaviour for the callers that hold a reference to the list.
    token_list[:] = [token for token in token_list if len(token) >= 3]
    return token_list

We compare the tokens of each document with the stopwords set given and remove them if present

In [11]:
#   This function removes the context independent stop words from a list of tokens.
#
#   :param tokens_list:    the tokens (strings) of a document.
#
#   :return filtered_context_independent:   a new list with the tokens that are not in
#                                           the given stop word list, in their original order.

def filter_context_independent_stopwords(tokens_list):
    stopwords = get_stopwords()

    return [w for w in tokens_list if w not in stopwords]

Re-tokenize the tokens so that the selected bigrams are kept together as single tokens.

In [None]:
# the multi-word-expression tokenizer is initialised with the selected bigrams.
mwe_tokenizer = MWETokenizer(bigrams)

# NOTE(review): 'filtered_context_independent' is not defined at notebook level in the
# visible code (it only exists as a local variable inside
# filter_context_independent_stopwords), so this cell presumably raises a NameError as
# written - confirm which token list was meant to be passed in.
# NOTE(review): the assignment rebinds 'tokenized_data', which until here is the
# dictionary of per-document token lists, while a later cell still calls
# tokenized_data.values() - confirm whether the commented-out per-document variant
# below is the intended one.
#for tokens in filtered_context_independent:
tokenized_data = mwe_tokenizer.tokenize(filtered_context_independent)
#for doc,tokens in tokenized_data.items():
#    tokenized_data[doc] = mwe_tokenizer.tokenize(tokens)
tokenized_data

In [None]:
# the tokens with less than 3 characters are removed from the token collection.
# NOTE(review): the result is named 'rare_tokens' although the visible code filters
# by token length only, not by how rarely a token occurs - confirm the naming.
rare_tokens = remove_tokens_less_than_len_3(tokenized_data)
rare_tokens

In [None]:
# Stemming using the Porter stemmer

from nltk.stem import PorterStemmer

ps = PorterStemmer()

# every remaining token is replaced by its stem; the order of the tokens is kept.
stem_tokens = [ps.stem(token) for token in rare_tokens]

stem_tokens

Generate context dependent stop words

In [None]:
# context dependent words
# every document contributes each of its distinct words once (set per document), so
# the frequency distribution holds the number of documents a word appears in. The 800
# words with the highest counts are retrieved.
words_2 = [word for value in tokenized_data.values() for word in set(value)]
fd_2 = FreqDist(words_2)
l = fd_2.most_common(800)
len(l)

## Task 2

In [12]:
#   This function extracts one part of a paper (its abstract or its title) with the
#   given regular expression and generates the corresponding list of tokens.
#
#   :param regex:          a regular expression whose first group captures the wanted content.
#
#   :param content_type:   either "abstract" or "title". An abstract is segmented into
#                          sentences, only the first character of each sentence is
#                          lower-cased and non-alphabetic tokens are dropped. A title is
#                          lower-cased entirely and tokenised without further filtering.
#
#   :param file_path:      the path of the PDF file that needs to be processed.
#
#   :return filename:    the name of the file just processed (used as a key)
#           tokens_list: list of tokens obtained by processing this file.
#
#   :raises ValueError:  when content_type is neither "abstract" nor "title".

def tokenize_content_from_docs(regex, content_type, file_path):

    # an unknown content type is rejected before the (slow) PDF parsing starts. It used
    # to fall through both branches and fail on the unbound name 'list_tokens'.
    if content_type not in ("abstract", "title"):
        raise ValueError("content_type must be 'abstract' or 'title', got " + repr(content_type))

    # obtain the name of the document being parsed
    name_of_file = os.path.basename(file_path)

    # get only the content of the paper for feature extraction
    content = get_paper_content(file_path, regex)

    if content_type == "abstract":
        # the first character of every sentence is normalised to lower case and the
        # sentences are turned into a list of alphabetic tokens.
        list_tokens = tokenize(case_normalisation(content))
    else:
        # the tokens of a title are all normalised to lower case.
        list_tokens = get_tokens(content.lower())

    return name_of_file, list_tokens
 
# you can remove this once the entire code is tested
# import multiprocessing as mp
# import glob
# from functools import partial
# # building a pool of 4 processes
# pool = mp.Pool(processes = 4) 

# # obtaining the list of file names from the 'papers' directory.
# filenames = glob.glob('./papers/*.pdf')



In [17]:
# get only the abstract of the paper for feature extraction.
# This is done with the help of a regular expression that captures the group between
# 'Abstract' and '<digit> Paper Body'. The expression is a raw string so that '\s',
# '\S' and '\d' are not treated as (invalid) string escape sequences.

# dictionary of tokenised documents with key as the file name and value as the list of tokens.

tokenized_data_abstract = dict(pool.map(partial(tokenize_content_from_docs, r'Abstract([\s\S]*)\d\sPaper Body', 'abstract'), filenames))

In [18]:
# get only the title of the paper for feature extraction.
# This is done with the help of a regular expression that captures the group which
# starts with a character in A-Z, a-z or '(' at the beginning of the text and ends
# before the blank line that precedes 'Authored by'. The expression is a raw string so
# that '\s', '\S' and '\n' reach the regex engine unchanged.

# dictionary of tokenised documents with key as the file name and value as the list
# of tokens.

tokenized_data_title = dict(pool.map(partial(tokenize_content_from_docs, r'(^[A-Za-z(][\s\S]*)\n\nAuthored by', 'title'), filenames))

In [19]:
#   This function retrieves the author names from the paper and returns them as a list,
#   one entry per line of the captured text.
#
#   :param regex:        the regular expression to be used for parsing the file; its first
#                        group captures the author section.
#
#   :param file_path:    the path of the PDF file that needs to be processed.
#
#   :return filename:    the name of the file just processed (used as a key)
#           authors:     list of the lines of the author section. Empty lines are not
#                        removed here.
#
#   :raises ValueError:  when the regular expression does not match the text of the PDF file.
def get_authors_list(regex, file_path):

    # obtain the name of the document being parsed
    name_of_file = os.path.basename(file_path)

    # the contents from the PDF file is obtained.
    pdf_text = retrieve_from_pdf(file_path)

    # from the processed PDF file, extract the content on the basis of the regular
    # expression. A paper without a matching author section is reported by name instead
    # of failing with an attribute error on None.
    author_section = re.search(regex, pdf_text)
    if author_section is None:
        raise ValueError('the pattern ' + repr(regex) + ' did not match the text of ' + str(file_path))

    authors = author_section.group(1).split('\n')

    return name_of_file, authors

In [21]:
# get only the authors of the paper for feature extraction.
# This is done with the help of a regular expression that captures the group between
# 'Authored by:' and the blank line that precedes 'Abstract'. The expression is a raw
# string so that '\s', '\S' and '\n' reach the regex engine unchanged.

# dictionary of documents with key as the file name and value as the list of the
# lines of the author section.

authors_data = dict(pool.map(partial(get_authors_list, r'Authored by:([\s\S]*)\n\nAbstract'), filenames))

In [22]:
# remove empty elements from the authors list of every document. Only the values of
# existing keys are replaced, so the dictionary can be updated while it is traversed.
for each, names in authors_data.items():
    authors_data[each] = [name for name in names if name != '']


In [23]:
#   This function removes the context independent stop words from every document.
#
#   :param tokenized_data:    dictionary with the file name as key and the list of
#                             tokens as value. The dictionary is updated in place.
#
#   :return tokenized_data:   the same dictionary with the filtered token lists.

def filter_stopwords(tokenized_data):
    for doc_name, doc_tokens in tokenized_data.items():
        tokenized_data[doc_name] = filter_context_independent_stopwords(doc_tokens)

    return tokenized_data

In [24]:
#   This function retrieves the most frequent entries over all the documents. For
#   abstracts and titles the context independent stop words are removed first.
#
#   :param tokenized_data:   dictionary with the file name as key and a list of strings
#                            (tokens or author names) as value. For the content types
#                            'abstract' and 'title' the dictionary is updated in place by
#                            the stop word filtering.
#
#   :param content_type:     'abstract' or 'title' to filter the stop words; any other
#                            value (e.g. '' for the authors) skips the filtering.
#
#   :param top_n:            the number of entries to return (10 by default).
#
#   :return top_words:       a list of at most top_n entries, most frequent first.

def get_most_common_words(tokenized_data, content_type, top_n=10):
    # Counter provides the same most_common() as nltk's FreqDist (which derives from
    # it) and keeps this function independent of the star import of nltk.probability.
    from collections import Counter

    # Check content type and filter stopwords for abstract and title
    if content_type in ('abstract', 'title'):
        tokenized_data = filter_stopwords(tokenized_data)

    # count every word over all the documents
    freq_dist = Counter(chain.from_iterable(tokenized_data.values()))

    # keep only the words of the top_n (word, count) pairs
    return [word for word, _ in freq_dist.most_common(top_n)]

In [25]:
# get top 10 most occurring terms in abstract (the context independent stop words are
# removed from the abstract tokens first)
top_abstract = get_most_common_words(tokenized_data_abstract,'abstract')

# get top 10 most occurring terms in title (the context independent stop words are
# removed from the title tokens first)
top_title = get_most_common_words(tokenized_data_title,'title')

# get top 10 authors; the empty content type skips the stop word filtering, so the
# author names are counted as they are.
top_author = get_most_common_words(authors_data,'')

In [26]:

# create a dataframe for the statistics: one column per top-10 list.
# NOTE(review): building a DataFrame from a dict of lists presumably needs all three
# lists to have the same length - confirm that each list really holds 10 entries.
stats_data = {'top10_terms_in_abstracts':top_abstract,'top10_terms_in_titles':top_title,'top10_authors':top_author}

data_frame = pd.DataFrame(stats_data)

# write to csv without the index column, encoded as UTF-8
data_frame.to_csv("Group113_stats.csv", encoding='utf-8', index=False)

## 3. Summary
Give a short summary of your work done above, such as your findings.