In [1]:
#Import libraries
import re # regex is imported as it is useful in getting the text that is required and also for tokinzation
import requests
import io
import pdfminer # pdfminer is imported as it is useful for extracting text from the pdf files
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
import nltk # nltk is the library for natural laguage processing
from nltk.collocations import * # collocations are used to identify the bigrams
from itertools import chain
import itertools
from nltk.tokenize import RegexpTokenizer # tokenize the text using a regular expression
from nltk.tokenize import MWETokenizer # MWEtokenizer is for Multi Word Expressions (bigrams)

In [2]:
#Get the file links from the pdf document
#define function for extracting text because we will be using it often
pdfminer_lap = pdfminer.layout.LAParams()
setattr(pdfminer_lap, 'all_texts', True)
# function to extract text from pdf
def extract_text_from_pdf(pdf_path):
    resource_manager = PDFResourceManager() # creating a resource manager object
    file_handle = io.StringIO() 
    text_converter = TextConverter(resource_manager, file_handle, laparams=pdfminer_lap) # creating a text convertor object
    interpreter = PDFPageInterpreter(resource_manager, text_converter) 
    # opening the pdf file
    with open(pdf_path, 'rb') as pdf:
        # for loop is used to loop over each page of the pdf to extract the text
        for page in PDFPage.get_pages(pdf, caching=True,check_extractable=True):
            interpreter.process_page(page)
        text = file_handle.getvalue() 
    # close open handles
    text_converter.close()
    file_handle.close()
 
    if text:
        return text
# Using the above function that uses pdfminer, the Group155.pdf file is extracted 
links_extract = extract_text_from_pdf('inp.pdf')
#Use regex to remove unwanted tags and column headers
links_extract = re.sub('<.*?>', '', links_extract)
links_extract = re.sub('\\x0c', '', links_extract)
links_extract = re.sub('filename', '', links_extract)
links_extract = re.sub('url', '', links_extract)
#Get the file names
file_name = re.findall('PP[0-9]+',links_extract)
#remove extracted and redundant info so that links can be extracted easily
links_extract = re.sub('PP[0-9]+.pdf', '', links_extract)
links = [] # creating a list
for link in links_extract.split('\n'):
    if link != '':
        links.append(link) # store all the links in a list

In [3]:
#Download the pdf files using the links we extracted
for index in range(len(links)):
    link = links[index] # getting the URL for the file saved in links dictionary
    r = requests.get(link, allow_redirects=True)
    open(file_name[index] + '.pdf', 'wb').write(r.content)

### Extracting the required text  
The required text i.e the text in the paper body section of the files is extracted and it is then cleaned by removing the  unnecessary characters like the hexcode, "\n", etc and replacing the ligatures with their ascii counter parts. Ligatures refer to 2 or more characters that are joined into one. The extracted text has some words that have ligatures. Examples of ligatures are 'fi' and 'ff'. Unidecode function is used to decode these ligatures.
After cleaning the text, sentence segmentation is applied. Sentence segmentation is the proceess to split text into sentences that end with a '!' or '.' or '?'. NLTK's Punk tokenizer contains a pre-trained sentence tokenizer for English. Hence, this is used to split the text into sentences.

In [4]:
#Extract text from pdf and store in dictionary with the filename as the key
#Use the unidecode package to remove the ligatures i.e to decode the unicode symbols to their closest ascii counter part
from unidecode import unidecode
body_text = {} # creating a dictionary to store the text from each file
# importing the data for nltk's punkt tokenizer for setence segmentation.
import nltk.data 
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')# nltk punk tokenizer data is loaded into a variable
# Using the for loop to extract the text from each of 200 files, clean the text and store it in a dictionary
for index in range(len(links)):
    text = extract_text_from_pdf(file_name[index] + '.pdf') 
    # Using regular expression to extract paper body section of the pdf file      
    text = re.search('(?s)Paper Body(.*?)2 References',text).group(1) 
    # Using Unidecode function from the unidecode library that is imported to remove the ligatures
    text = unidecode(text) 
    # Replacing the words that are within "<>" with a space as a part of cleaning the text
    text = re.sub('<.*?>', '', text) 
    # Replacing the hexcode with a space as a part of cleaning
    text = re.sub('\\x0c', '', text) 
    # Replacing the words that contain a hyphen and occur at the end of the line
    text = re.sub('-\n','',text) 
    # Replacing the words that contain a hyphen and have newline character at the end
    text = re.sub('-\n\n','',text) 
    temp = []  # creating an empty list
    sentences = sent_detector.tokenize(text.strip()) # punkt tokenizer is used on the text that is extracted
    # for loop to convert the first letter of the first word of the sentence into lower case    
    for sentence in sentences:
        temp.append(sentence[0].lower() + sentence[1:]) 
    text = ' '.join(temp) 
    # the final list is then stored in the dictionary of body_text for each file
    body_text[file_name[index]] = text 

### Tokenizing the text

We are going to break the text down to its words.

In [5]:
#Tokenize the text and create another dictinary to store them with the same keys
# Using the regular expression given in the assignment question, nltk's tokenizer is used to get the text into tokens
tokeniser = RegexpTokenizer(r"[A-Za-z]\w+(?:[-'?]\w+)?")
#create a function to tokenise the text
def tokenise(file):
    # tokenise the text for each file in stored in the dictionary in the previous cell
    tokenised_text = tokeniser.tokenize(body_text[file])
    #Return a tuple for saving as dictionary key-value pair
    return (file, tokenised_text)
# create a dictionary with the tokens for each file
text_tokenised = dict(tokenise(file) for file in file_name)
# create a list to store all the tokens from all tehe files
all_tokens = list(chain.from_iterable(text_tokenised.values()))

### Bigrams
Multiple words that occur together are collocations. Here, bigrams are extracted using nltk functions and then the PMI (Pointwise Mutual Information ) measure is used to identify the commonly occuring bigrams
First we extract the common 600 bigrams then we remove the bigrams having stopwords and then using PMI we extract the common 200 bigrams. These words are then added into the vocabulary using MWE tokenizer.

In [6]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_words(all_tokens)
token_bigram = finder.nbest(bigram_measures.pmi, 600)
# stopwords given in the text file are stored in a list
stopwords = open("stopwords_en.txt",'r')
stopwords_list = stopwords.readlines()
stop_words = []
# for loop is used to remove the "\n" character at the end of all stopwords
for stopword in stopwords_list:
        # stopwords given in the text file are stored in a list
        stopword = stopword.rstrip('\n')
        stop_words.append(stopword)
        stop_words.append(stopword.capitalize()) # The first letter of the stop word is capitalized and added to the list
bigrams = [] # creating a list to store the bigrams with all the stopwords removed
# for loop to loop over the bigram collocations that are extracted previously and take only the bigrams without any 
# stopwords into the bigrams list
for token in token_bigram:
    # "(?s)(\w+)(',)" regular Expression looks for the first word in the bigram 
    # and "(?s)(, ')(\w+)" looks for the second word in the bigram. 
    #The if condition checks if either the first word or the second word of the bigram is in the stopwords   
    if ((re.search("(?s)(\w+)(',)", str(token)).group(1)) or (re.search("(?s)(, ')(\w+)", str(token)).group(2))) in stop_words:
        pass
    else: 
        bigrams.append(token) # if there are no stopwrods in the bigram, add the bigram to the list of bigrams
# The above created list of bigrams is used to calculate the 200 most common bigrams using PMI measure       
tokens2 = list(chain.from_iterable(bigrams))
bigram_measures2 = nltk.collocations.BigramAssocMeasures()
finder2 = nltk.collocations.BigramCollocationFinder.from_words(tokens2)
token_bigram2 = finder.nbest(bigram_measures2.pmi, 200)
# Using MWE tokenizer, the 200 bigrams are added into the vocabulary
mwetokenizer = MWETokenizer(token_bigram2)
# a dictionary is created and using the MWE tokenizer, the bigrams from each file are stored in it
colloc_patents =  dict((pid, mwetokenizer.tokenize(text)) for pid,text in text_tokenised.items())
all_words_colloc = list(chain.from_iterable(colloc_patents.values()))
colloc_voc = list(set(all_words_colloc))
colloc_voc = [w for w in colloc_voc if w not in stop_words]

### Removing the stop words (context independent)
Removing the words that carry no significance

In [7]:
# Here the most common words are found using the FreqDist() method from a set of word tokens.
# It gives the frequency distribution based on term frequency i.e number of times the word occured in the entire corpus
from nltk.probability import *
words_2 = list(chain.from_iterable([set(value) for value in colloc_patents.values()])) # set of word tokens
words_2 = [w for w in words_2 if w not in stop_words] # taking the words that are not stop words
fd_2 = FreqDist(words_2) # Using freqdist() function to get the frequency of the words
y = fd_2.most_common() # most_common() method gives the most frequent words and their frequencies

### Removing rare words (3% threshold) and stop words- context dependent (95% threshold)
Context dependent stopwords with threshold 95% refers to the words that appear in more than 95% of the documents i.e 95% of 200 which is equal to 190 documents. Rare words with 3% threshold refers to the words that appear in more than 3% of the documents i.e 3% of 200 which is equal to 6. Hence the words that appear in greater than 6 documents and less than 190 documents are useful for us. The other words are not that useful as they may not provide much information about the topic.

In [8]:
rare_plenty = [] # creating an empty list
# for loop to loop over the most common words that are extracted in the previous cell
for tup in y:
# condition to check if the word frequency is within the desired range
    if tup[1] > 190 or tup[1] < 6: 
        rare_plenty.append(tup[0]) # taking all the rare words and stop words in a list
colloc_voc = [w for w in colloc_voc if w not in rare_plenty] # if the word is not in the above list, it is added to the vocab

### Removing words less than 3 in length


In [9]:
# if the word length is greater than 3, it is added to the vocabulary
colloc_voc = [w for w in colloc_voc if len(w) >= 3]

In [10]:
# vocabulary is then sorted in alphabetical order
colloc_voc.sort()

### Stemming the words
Stemming reduces the words in different lexical format to its base word. This helps us to get the words with the same root together. For this task, porter stemmer is used below. It is imported from nltk and it is applied on the vocabulary created above

In [11]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
cap_track = []
# for loop is used to loop over the words in the vocabulary and extract the words that start with a capital letter
for w in colloc_voc:
    if w[0].isupper():
        cap_track.append(True) # if the word starts with a capital letter, it is added to the list
    else:
        cap_track.append(False)
# for loop is used to loop over the vocabulary to check for bigrams. If the word is not a bigram, it is stemmed and added to
# the vocabulary
for i in range(len(colloc_voc)):
    if '_' in colloc_voc[i]:
        continue
    else:
        colloc_voc[i] = ps.stem(colloc_voc[i])
# stemmer lowers the word by default. Hence the words that were previously starting with a capital letter are also in 
# lowercase by default after stemming. The below code checks if the word was starting with a capital letter by checking the 
# list that was created above and capitalizes the stemmed word
    if cap_track[i] == True:
        colloc_voc[i] = colloc_voc[i].capitalize()

Below, stemming  is applied on the dictinary that contains tokens from each file so as to find the frequency of the words in the vocabuary within the documents. This helps in forming the sparse representation

In [12]:
# for loop is used to loop over all the files and stem the unigrams found in each file which are then saved in a dictionary
for file in file_name:
    tokens = colloc_patents[file]
    cap_track = []
    # for loop is used to loop over the words in the vocabulary and extract the words that start with a capital letter
    for w in tokens:
        if w[0].isupper():
            cap_track.append(True)
        else:
            cap_track.append(False)
    # for loop is used to loop over the vocabulary to check for bigrams. If the word is not a bigram, it is stemmed 
    #and added tothe vocabulary         
    for i in range(len(tokens)):
        if '_' in tokens[i]:
            continue
        else:
            tokens[i] = ps.stem(tokens[i]) 
# stemmer lowers the word by default. Hence the words that were previously starting with a capital letter are also in 
# lowercase by default after stemming. The below code checks if the word was starting with a capital letter by checking the 
# list that was created above and capitalizes the stemmed word
        if cap_track[i] == True:
            tokens[i] = tokens[i].capitalize()
    colloc_patents[file] = tokens

The preprocessing is now completed. Every document has to be converted into a numeric representation so that it can be used in the next steps of text mining algorithm. Below we use CountVectorizer that gives a matrix of token counts for a set of documents.

In [13]:
# A vocabulary list with all unique words is created below and it is sorted in alphabetical order
colloc_voc=list(set(colloc_voc))
colloc_voc.sort()

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
# CountVectorizer object is created and initialized with our vocabulary
vect = CountVectorizer(vocabulary=colloc_voc,lowercase=False) 
# data frame is created with the file names as index and the words from vocabulary as words
df = pd.DataFrame(index=file_name,columns=colloc_voc)
# fit_transform method fits the model and learns the vocabulary. It then transforms the text data into feature vectors.
# The input to fit_transform has to be a list of strings but we have stored the text as list of tokens, join is used to 
# concatenate the words and separate them with spaces. 
for file in file_name:
    dtm = vect.fit_transform([' '.join(colloc_patents[file])])
    df.loc[file] = dtm.toarray()

### Assignment Task 1
2 files have to be generated in task 1. They are the vocab_file and sparse count vectors file

In [14]:
# The vocab_file is created
vocab_file = open('Group155_vocab.txt', 'w+')
# for loop is used to loop over the words in the vocabulary to replace the single underscore in the bigrams by 
# double underscore as per the assignment specification
for i in range(len(colloc_voc)):
    if '_' in colloc_voc[i]:
        colloc_voc[i] = re.sub('_','__',colloc_voc[i])
    vocab_file.write(colloc_voc[i] + ':' + str(i) + '\n') # writing all the words in vocabulary to the file
vocab_file.flush() # drain the output buffer
vocab_file.close()

In [15]:
# The sparse count vector file is generated
count_vec = open('Group155_count_vectors.txt', 'w+')
for file in file_name:
    count_vec.write(file + ',')
    comma = False
    for i in range(len(colloc_voc)):
        if df.loc[file,colloc_voc[i]] > 0:
            count_vec.write(str(i) + ':' + str(df.loc[file,colloc_voc[i]]))
            if i != len(colloc_voc)-1:
                comma = True
        if comma == True:
            count_vec.write(',')
        comma = False
    count_vec.write('\n')
count_vec.flush() # drain the output buffer
count_vec.close()

### Assignment Task 2

In [16]:
# As per the second task, text for authors, abstracts and titles have to be extracted
abstracts ={}
titles = {}
authors = {}
# using for loop to loop over all the files and get the text about authors, abstracts and titles
for index in range(len(links)):
    text = extract_text_from_pdf(file_name[index] + '.pdf')
    text = unidecode(text) # ligatures are removed using the unidecode function
    title = re.search('(?s)(.*?)Authored by:',text).group(1) # text for authors is extracted 
    title = title.lower() # converted to lower case
    titles[file_name[index]] = title 
    abstract = re.search('(?s)Abstract(.*?)1 Paper Body',text).group(1) # text for abstract is extracted
    temp = []
    # punkt sentence tokenizer is used to convert the first letter of the starting word of a sentence to lower case
    sentences = sent_detector.tokenize(abstract.strip())
    for sentence in sentences:
        temp.append(sentence[0].lower() + sentence[1:])
    abstract = ' '.join(temp)
    abstracts[file_name[index]] = abstract
    author = re.search('(?s)Authored by:(.*?)Abstract',text).group(1)
    author = re.findall('[A-Za-z]+ [A-Za-z-]+',author)
    authors[file_name[index]] = author

In [17]:
# text has to be tokenized for each file and it is then saved in dictionaries for abstracts and titles 
def tokenise(file,dic):
    #tokenise the text
    tokenised_text = tokeniser.tokenize(dic[file])
    #Return a tuple for saving as dictionary key-value pair
    return (file, tokenised_text) 
abstract_tokenised = dict(tokenise(file,abstracts) for file in file_name) 
title_tokenised = dict(tokenise(file,titles) for file in file_name)
# lists are created to store the tokens from all the files 
abstract_tokens = list(chain.from_iterable(abstract_tokenised.values()))
title_tokens = list(chain.from_iterable(title_tokenised.values()))

#### 10 Frequently appearing terms in the abstract

In [18]:
# Top ten most frequent terms appearing in abstract
words_2 = list(chain.from_iterable([set(value) for value in abstract_tokenised.values()]))
words_2 = [w for w in words_2 if w not in stop_words] # stop words are removed
fd_2 = FreqDist(words_2) # FreqDist method used to get the term frequencies
freq_abs = fd_2.most_common() # words with their frequencies are extracted
freq_abs = freq_abs[0:10] # top 10 words are taken
top_10_abst_terms = []
for tup in freq_abs:
    top_10_abst_terms.append(tup[0]) #only extract the words
top_10_abst_terms #display results 
    

['show',
 'data',
 'model',
 'paper',
 'learning',
 'algorithm',
 'results',
 'based',
 'approach',
 'problem']

#### 10 Frequently appearing terms in the Titles

In [19]:
# Top ten most frequent terms appearing in abstract
words_2 = list(chain.from_iterable([set(value) for value in title_tokenised.values()]))
words_2 = [w for w in words_2 if w not in stop_words] # stop words are removed
fd_2 = FreqDist(words_2) # FreqDist method used to get the term frequencies
t_freq = fd_2.most_common() # words with their frequencies are extracted
t_freq = t_freq[0:10] # take the top ten
top_10_title_terms = []
for tup in t_freq:
    top_10_title_terms.append(tup[0]) #only extract the words
top_10_title_terms #display results 

['learning',
 'models',
 'inference',
 'latent',
 'gaussian',
 'neural',
 'optimization',
 'process',
 'variational',
 'networks']

#### 10 Most Frequent authors

In [20]:
# Top 10 frequent authors
author_tokens = list(chain.from_iterable(authors.values()))
from collections import Counter
author_count = Counter(author_tokens) 
top_authors = []
l = sorted(author_count.items(), key=lambda x:x[0]) # sort the results by name
l = sorted(l, key=lambda x:x[1], reverse = True) # and then by count
l = l [0:10]
for tup in l:
    top_authors.append(tup[0])
top_authors #display results 

['Eric P',
 'Francis Bach',
 'Alexander G',
 'Ambuj Tewari',
 'Charles Sutton',
 'Dale Schuurmans',
 'David Blei',
 'David M',
 'Devavrat Shah',
 'Dustin Tran']

### Converting to csv

In [21]:
# the lists of authors, abstracts and titles are written to a dataframe and then this dataframe is converted to a csv file
zippedList =  list(zip(top_10_abst_terms,top_10_title_terms,top_authors))
csv_df = pd.DataFrame(zippedList, columns = ['top10_terms_in_abstracts','top10_terms_in_titles','top10_authors'])
csv_df.to_csv('Group155_stats.csv', index = False)