# Problem 2: Scraping, Entropy and ICML papers.

Scraping all pdfs from the link - http://proceedings.mlr.press/v70/ with BeautifulSoup

In [1]:
import requests
from bs4 import BeautifulSoup
import io
from PyPDF2 import PdfFileReader

In [2]:
url = "http://proceedings.mlr.press/v70/"
read = requests.get(url)
html_content = read.content
soup = BeautifulSoup(html_content, "html.parser")

In [3]:
#Getting a list of all the pdfs from the link above
list_of_pdf = set()
for i in soup.find_all('a'):
    pdf_list = (i.get('href'))
    if "http" not in pdf_list or ".pdf" not in pdf_list:
        continue
    #print(pdf_list)
    list_of_pdf.add(pdf_list)
    
    

In [4]:
def info(pdf_path):
    # used get method to get the pdf file
    response = requests.get(pdf_path)

    # response.content generate binary code for
    # string function
    with io.BytesIO(response.content) as f:

        # initialized the pdf
        try:
            pdf = PdfFileReader(f, strict=False)
        except Exception:
            return ""
        # all info about pdf
        information = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()
        content = ""
        for page_number in range(number_of_pages): 
            page = pdf.pages[page_number]
            content = content + page.extract_text()
        #print(page.extract_text())

    return content


In [6]:
# Copying all the contents of these pdf files into pdf_content dict
j = 0
pdf_content = {}
for i in list_of_pdf:
    pdf_content[j] = info(i) 
    j = j+1
    

Multiple definitions in dictionary at byte 0x1836 for key /ExtGState
Multiple definitions in dictionary at byte 0x220c for key /ExtGState
Multiple definitions in dictionary at byte 0x1938 for key /ExtGState
Multiple definitions in dictionary at byte 0x1eda for key /ExtGState
Multiple definitions in dictionary at byte 0x275d for key /ExtGState
Multiple definitions in dictionary at byte 0x53b for key /ExtGState
Multiple definitions in dictionary at byte 0x1203 for key /ExtGState
Multiple definitions in dictionary at byte 0x1c43 for key /ExtGState
Multiple definitions in dictionary at byte 0x418 for key /ExtGState
Multiple definitions in dictionary at byte 0x89f for key /ExtGState
Multiple definitions in dictionary at byte 0xf3b for key /ExtGState
Multiple definitions in dictionary at byte 0x1642 for key /ExtGState
Multiple definitions in dictionary at byte 0x1b60 for key /ExtGState
Multiple definitions in dictionary at byte 0x430 for key /ExtGState
Multiple definitions in dictionary at b

ConnectionError: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))

In [7]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import string

#Converting this pdf_content into a dataframe to perform some data cleaning
pdf = pd.DataFrame(pdf_content.items(), columns=['id','content'])
pdf['content'] = pdf['content'].str.replace('[{}]'.format(string.punctuation), '')
pdf['content'] = pdf['content'].str.encode("ascii", "ignore")
pdf['content'] = pdf['content'].str.decode("ascii")
pdf['content'] = pdf['content'].str.encode("utf-8", "ignore")
pdf['content'] = pdf['content'].str.decode("utf-8")
pdf = pdf.replace('\n',' ', regex=True)
pdf = pdf.replace('\d+',' ', regex=True)


  pdf['content'] = pdf['content'].str.replace('[{}]'.format(string.punctuation), '')


In [8]:
pdf.head(2)

Unnamed: 0,id,content
0,0,Failures of GradientBased Deep Learning Shai S...
1,1,Meta Networks A Training Details To train and ...


In [9]:
#Writing this pdf content into a text file 
with open("pdf_content.txt", 'a') as f:
    dfAsString = pdf.to_string(header=False, index=False)
    f.write(dfAsString)

In [10]:
# Reading the text file back to perform some more pre-processing
import nltk
nltk.download('punkt')
raw = open('pdf_content.txt').read()


[nltk_data] Downloading package punkt to /Users/ri/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
from nltk import pos_tag, word_tokenize, wordpunct_tokenize
nltk.download('words')
from nltk.corpus import words

# Removing non english words from the pdfs
def removeNonEnglishWordsFunct(x):
    words = set(nltk.corpus.words.words())
    filteredSentence = " ".join(w for w in nltk.word_tokenize(x) \
                                if w.lower() in words)
    return filteredSentence

fresh_data = removeNonEnglishWordsFunct(raw)

[nltk_data] Downloading package words to /Users/ri/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [12]:
# Writing the content back to a sample file, which will be used further for all calculations
text_file = open("sample.txt", "wt")
n = text_file.write(fresh_data)
text_file.close()

# 1. What are the top 10 common words in the ICML papers?

In [13]:
from nltk.probability import FreqDist

raw = open('sample.txt').read()

tokens = word_tokenize(raw)
words = [w.lower() for w in tokens]

# Finding the frequency of all words
fdist = FreqDist(words)
word_freq_df = pd.DataFrame(fdist.items(), columns=['word', 'frequency'])

In [14]:
# Printing the top 10 most common words - stopwords are not removed
word_freq_df.sort_values(by='frequency', ascending=False).head(10)

Unnamed: 0,word,frequency
0,the,379536
3,of,184913
5,and,137716
47,a,131942
16,in,125415
34,to,115934
39,we,93804
46,for,87577
13,is,81574
18,that,65311


# 2. Let Zbe a randomly selected word in a randomly selected ICML paper. Estimate the entropy of Z.

In [15]:
# Entropy or H is the minimum no of bits that are needed to be sent across to derive all the other bits.
# H = - sum ((probability of i) * log (probability of i))
import math
# total number of words
total_words = word_freq_df["frequency"].sum()
word_freq_df["per_word"] = word_freq_df["frequency"]/total_words

word_freq_df["log_entropy"] = word_freq_df["per_word"].map(lambda x: math.log2(x))
word_freq_df["entropy"] = word_freq_df["per_word"] * word_freq_df["log_entropy"]

In [16]:
total_entropy = -1 * word_freq_df["entropy"].sum()
# Priting the entropy value
print("The entropy value of Z is ", total_entropy)

The entropy value of Z is  8.897936910106326


# 3. Synthesize a random paragraph using the marginal distribution over words.

In [17]:
para = np.random.choice(word_freq_df["word"], 100, p=word_freq_df["per_word"])
full_para = ' '.join([str(elem) for elem in para])
full_para

'oh and acceptor a competitive m of min a is speech are e complete end to physics s is a network that we a such obtain anomalous on of also van whose completion the and if depending as result the h their update discussion be amenable the effective we the the section this related setting to time we when model condition such root objective do it more that that purpose this can equilibria optimal u by used several the is problem the and fashion we practice the section re operator gradient general rule also despite optimization al chosen the test'

# 4. (Extra credit) Synthesize a random paragraph using an n-gram model on words. Synthesize a random paragraph using any model you want.

In [18]:
text_file = open ("sample.txt","r",encoding="utf-8")
pdf_text = text_file.read()
text_file.close()

In [19]:
#Creating the ngram dictionary
ngrams = {}
#Our n is 3 here
pair_words = 3

# Tokenize all the words in the sample
words_tokens = nltk.word_tokenize(pdf_text)

# 3 words are used as key and value consists of all words that are most likely to appear next
for i in range(len(words_tokens)-pair_words):
    seq = ' '.join(words_tokens[i:i+pair_words])
    if  seq not in ngrams.keys():
        ngrams[seq] = []
    ngrams[seq].append(words_tokens[i+pair_words])


In [26]:
import random

# Randomly generate a Paragraph by taking random words as first pair
number= random.randint(1,len(words_tokens)-pair_words)

first_pair = ' '.join(words_tokens[number:number+pair_words])
para = first_pair

for i in range(150):
    # Check if this pair is a key
    if first_pair not in ngrams.keys():
        break
    #If yes then take the value of this key
    list_of_words = ngrams[first_pair]
    # Randomly pick the next words
    nxt_words = list_of_words[random.randrange(len(list_of_words))]
    para += ' ' + nxt_words
    seq_words = nltk.word_tokenize(para)
    # Repeat this process with a new set of random pairs
    first_pair = ' '.join(seq_words[len(seq_words)-pair_words:len(seq_words)])

print(para,"\n")

Hence we have G V for any s According to Algorithm we immediately know we will not query again in that region of the image This set of convex to data Indeed our learning prob can be further by increasing the number of training The location and size of the sketch the min risk and in Section we use the optimization algorithm deterministic However a similar analysis to above min min and min fork respectively In addition we also show approximation for Time min Perplexity Full Sampling Ours Table Perplexity after for different as d d d a n Number of Sample complexity Figure Comparison of over time Reasoning Based on Structural Dependency The hidden layer for A three dissimilar classes t n and s m j and at p p n which that uniformly distributed respond to small We note here for future use that for strongly convex prob Input Initial point 

