# Format HTML Text to LaTeX
    

In [6]:
from lxml import html
#from urllib.request import Request, urlopen
import requests, bs4, re

### Load URL of Foundational Research Institute Blog Post and convert content to bs4

In [30]:
url = "https://foundational-research.org/artificial-intelligence-and-its-implications-for-future-suffering"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
page = requests.get(url, headers=headers)
page.raise_for_status()
soup = bs4.BeautifulSoup(page.text, 'lxml')

In [45]:
count = 1
for a in soup.find_all('a', href=True):
    print("Found the URL:", a['href'])
    count += 1
count

Found the URL: https://foundational-research.org
Found the URL: #
Found the URL: https://foundational-research.org/our-mission/
Found the URL: https://foundational-research.org/the-case-for-suffering-focused-ethics/
Found the URL: #
Found the URL: /research
Found the URL: /research#future-suffering
Found the URL: /research#cooperation
Found the URL: /research#ethics
Found the URL: /research#consciousness
Found the URL: #
Found the URL: https://foundational-research.org/work-with-us/
Found the URL: https://foundational-research.org/volunteer/
Found the URL: https://foundational-research.org/open-research-questions/
Found the URL: https://foundational-research.org/team/
Found the URL: https://foundational-research.org/donate/
Found the URL: https://www.facebook.com/FoundationalResearch
Found the URL: https://foundational-research.org/author/brian-tomasik/
Found the URL: https://foundational-research.org/files/RobotsAI.mp3
Found the URL: #introduction
Found the URL: #is-the-singularity-cr

341

### 1. Extract the Blog post ID from the website via Right click -> Inspect -> Copy -> Copy selector in Chrome

### 2. Update ID number (257 in example below) in `writeSoupToFile` and enter a `filename` -> content will be written into that file
    `#post-275 > div > div:nth-child(1) > table > tbody`

In [5]:
def writeSoupToFile(filename, selector):
    content = soup.select(selector)
    content = [tag.prettify('utf-8') for tag in content]#[i.prettify('utf-8') for i in soup.findAll(['p','li', 'h2'])]
    with open(filename, 'wb') as f:
        for tag in content:
            f.write(tag)
            
def writeToFile(filename, listOfStr):
    with open(filename, 'wb') as f:
        for item in listOfStr:
            f.write(item.encode('utf-8'))
            

#### Save HTML content to file

In [33]:
filename = 'AIthoughts'

In [34]:
### post-2869 > div > div:nth-child(1) > blockquote:nth-child(9) > p
writeSoupToFile(filename + '.txt', '#post-33 > div > div')

#### Format HTML content to LaTeX and write result to file. Remove ToC manually first.

In [37]:
formatToLatex(filename+'LaTeX.txt', readFile(filename+'.txt'))

In [38]:
writeToFile(filename+'LaTeX2.txt', formatMath(readFile(filename+'.txt')))

# Code Sections

In [1082]:
# content = page.content.decode()

### Remove ToC manually before reading the file back in. Couldn't find a solution how to remove it via RegEx.

In [39]:
def readFile(filename):
    '''
    Reads a file and returns its content
    @Param filename that needs to be read
    @return a list with file content
    '''
    with open(filename, 'r', encoding='utf-8') as f:
        return f.readlines()       

### Formatting, cleaning and stripping functions

    TODO: Automate Footnotes

In [15]:
def formatSections(listOfStr):
    '''
    Takes a list of Strings, extracts HTML sections
    and formats them to LaTeX `\section{}`
    @Param List of strings
    @return HTML `<h2>``</h2>` tags replaced by LaTeX `\subsection{``}`
    '''
    temp = [re.sub(r'\s+<span.*>\n', r'', i) for i in listOfStr]
    temp = [re.sub(r'\s+</span>\n', r'', i) for i in temp]
    temp = [re.sub(r'\s+</h2>\n', r'}', i) for i in temp]
    return [re.sub(r'\s+<h2.*>[.\s\n]*[\n\s]*', r'\section{', i) for i in temp]

def formatSubsections(listOfStr):
    '''
    Takes a list of Strings, extracts HTML subsections
    and formats them to LaTeX `\subsection{}`
    @Param List of strings
    @return HTML <h3></h3> tags replaced by LaTeX `\subsection{` `}`
    '''
    temp = [re.sub(r'\s+</h3>\n', r'}', i) for i in listOfStr]
    temp = [re.sub(r'\s+<h3.*>[.\s\n]*[\n\s]*', r'\subsection{', i) for i in temp]
    temp = [re.sub(r'\s+<h4.*>[.\s\n]*[\n\s]*', r'\subsubsection{', i) for i in temp]
    return [re.sub(r'\s+</h4>\n', r'}', i) for i in temp]

    
def formatLinks(listOfStr):
    '''
    Takes a list of Strings, extracts HTML Hyperlinktags
    and formats them to LaTeX `\href{}{}`
    @Param List of strings
    @return HTML `<a href="urlname">[text]</a>` tags replaced by LaTeX `\href{urlname}{text}`
    '''
    # Remember: links in LaTeX are of the style "\href{url}{text}"
    # replace "<a [more useless text here] href=" with "\href{" 
    cut_head = [re.sub(r'[<a\s.]+\s+href="', r'\\href{', line) for line in listOfStr]
    
    # close the url-braces and open the text-braces
    cut_tail = [re.sub(r'".*>', r'}{', head) for head in cut_head]
    
    # finally close the text-braces and return result
    return  [re.sub(r'</a>\n', r'}', a) for a in cut_tail]

def formatParagraphs(listOfStr):
    return [re.sub(r'<p.*>|<\/p>',r'',line) for line in listOfStr]

def formatItalics(listOfStr):
    '''
    Takes a list of Strings, extracts HTML italics tags
    and formats them to LaTeX `\textit{}`
    @Param List of strings
    @return HTML `<i></i>` and `<em></em>` tags replaced by LaTeX `\textit{}`
    '''
    head = [re.sub(r'<i>|<em>', r'\\textit{', line) for line in listOfStr]
    ## edit: add Bold text
    head = [re.sub(r'<b>', r'\\textbf{', line) for line in head]
    head = [re.sub(r'</b>', r'} ',i) for i in head]
    return [re.sub(r'</i>|</em>', r'} ',i) for i in head]

def formatLists(listOfStr):
    #ordered list
    temp = [re.sub(r'\s+<ol>\n',r'\\begin{enumerate}\n', i) for i in listOfStr]
    temp = [re.sub(r'\s+</ol>\n',r'\\end{enumerate}\n\\noindent', i) for i in temp]
    
    #unordered list
    temp = [re.sub(r'\s+<ul>\n',r'\\begin{itemize}\n', i) for i in temp]
    temp = [re.sub(r'\s+</ul>\n',r'\\end{itemize}\n\\noindent', i) for i in temp]
    
    temp = [re.sub(r'\s+</li>', r'', i) for i in temp]
    return [re.sub(r'\s+<li>', r'\\item ', i) for i in temp]
    
def formatBlockquotes(listOfStr):
    temp = [re.sub(r'\s*<blockquote>\n|\s*<q>\n', r'\\begin{quote}\n', i) for i in listOfStr]
    return [re.sub(r'\s*</blockquote>\n|\s*</q>\n', r'\\end{quote}\n\\noindent', i) for i in temp]

def formatFootnotes(listOfStr):
    return [re.sub(r'\s*\\href{#link_ajs-fn-id_\d+-2699}{', r'\\footnote{ENTER FOOTNOTE HERE}', i) for i in listOfStr]

def formatUnderline(listOfStr):
    temp = [re.sub(r'<u>',r'\\underline{', i) for i in listOfStr]
    return [re.sub(r'</u>',r'}', i) for i in temp]

def formatMath(listOfStr):
    
    temp = [re.sub(r'<sup>',r'\\textsuperscript{', i) for i in listOfStr]
    temp = [re.sub(r'</sup>',r'} ', i) for i in temp]
    temp = [re.sub(r'<sub>',r'\\textsubscript{', i) for i in temp]
    
    return [re.sub(r'</sub>',r'} ', i) for i in temp]




def Clean(listOfStr):
    temp = [re.sub(r'\s+<a id=}{', r'', i) for i in listOfStr]
    temp = [re.sub(r'\s*<img align=}{', r'', i) for i in temp]
    temp = [re.sub(r'&amp;', r'\\&', i) for i in temp]
    temp = [re.sub(r'&amp;', r'\\&', i) for i in temp]
    temp = [re.sub(r'\$', r'\\$', i) for i in temp]
    temp = [re.sub(r'%', r'\%',i) for i in temp]
    temp = [re.sub(r'_', r'\_',i) for i in temp]
    temp = [re.sub(r'#', r'\#',i) for i in temp]
    temp = [re.sub(r'<!--', r" ", i) for i in temp]
    temp = [re.sub(r'-->', r' ', i) for i in temp]
    temp = [re.sub(r'<br/>', r'', i) for i in temp]
    formatted = [re.sub(r'\s+<a name=}{', r'', i) for i in temp]
    
    return writeToFile('formatted.txt', formatted)
    

def Strip():
    read = readFile('formatted.txt')
    temp = [re.sub(r'{\s+', r'{', i.strip(' ')) for i in read]
    temp = [re.sub(r'}\s+\n', r'} ',i) for i in temp]
    
    return "\n".join([re.match(r'(.*)', line).group() for line in temp])

def FixWhitespace(filename, content):
	temp = re.sub(r'\n{4}', r'REMOVE', content)
	temp = re.sub(r'\n{2,3}', r'NEWLINE', temp)
	temp = re.sub(r'\n\\href', r' \\href', temp)
	temp = re.sub(r'\n\\textit', r' \\textit', temp)
	temp = re.sub(r'\n\\s', r'\\s', temp)
	temp = re.sub(r'\n', r'', temp)
	temp = re.sub(r'NEWLINE|REMOVE}', r'\n\n', temp)
	temp = re.sub(r'}\s+,', r'},', temp)
	temp = re.sub(r'}\s+}', r'}}', temp)
	temp = re.sub(r'}\s+\.', r'}.',temp)
	temp = re.sub(r'}\s+:', r'}:',temp)
	temp = re.sub(r'{\s+\\', r'{\\', temp)
	temp = re.sub(r'\s+\)', r')', temp)
	temp = re.sub(r'}\s+\?', r'}?', temp)
	temp = re.sub(r'\(\s+\\', r'(\\', temp)
	temp = re.sub(r'"\s+\\', r'"\\', temp) 
	temp = re.sub(r'}\s+"', r'}"', temp) # link "[...]" -> link"[...] !!! fix manually
	
	return writeToFile(filename, temp)

def formatToLatex(filename, listOfStr):
    temp = formatSections(listOfStr)
    temp = formatSubsections(temp)
    temp = formatParagraphs(temp)
    temp = formatLinks(temp)
    temp = formatLists(temp)
    temp = formatBlockquotes(temp)
    temp = formatItalics(temp)
    temp = formatFootnotes(temp)
    temp = formatMath(temp)
    temp = Clean(temp)
    
    return FixWhitespace(filename, Strip())

In [4]:
def formatBolds(listOfStr):
     ## edit: add Bold text
    head = [re.sub(r'<b>', r'\\textbf{', line) for line in listOfStr]
    return [re.sub(r'</b>', r'} ',i) for i in head]

### Wordpress References to LaTeX

In [1030]:
url = "http://dev.foundational-research.org/the-asymmetry-and-extinction-thought-experiments/"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
page = requests.get(url, headers=headers)
page.raise_for_status()
soup = bs4.BeautifulSoup(page.text, 'lxml')

#### Extract Blog Post ID via Chrome as Explained Above: 
    `#post-263 > div > div:nth-child(1) > ul > li:nth-child(1)`

In [1068]:
#post-263 > div > div:nth-child(1) > ul > li:nth-child(1)
references = soup.select('#post-263 > div > div > ul > li')[1:]

html = [i.prettify('utf-8') for i in references]
filename = 'AsymRef.txt'
with open(filename, 'wb') as f:
    for i in html:
        f.write(i)

### Format HTML References to LaTeX via RegEx


In [1074]:
import re

box = []
with open(filename, 'r', encoding='utf-8') as f:
    box.append(f.read())

box = box[0].replace('\n', '')

tags = {'<ul>': '\\renewcommand{\\refname}{References}\\n\\addcontentsline{toc}{section}{References}\\n\\begin{thebibliography}{99}\\n\\raggedright % Fixes whitespace within reference \\n',
        '</ul>': '\\end{thebibliography}',
        '<sup>': '',
        '</sup>': '',
        '<li>': '\\bibitem{}\\n',
        '</li>': '\\n\\n',
        '<i>':'\\textit{',
        '</i>': '}',
        '<em>':'\\textit{',
        '</em>': '}'}

def specialLetters(listOfStr):
    temp = [re.sub(r'&amp;', r'\\&', i) for i in listOfStr]
    temp = [re.sub(r'&amp;', r'\\&', i) for i in temp]
    temp = [re.sub(r'\$', r'\\$', i) for i in temp]
    temp = [re.sub(r'%', r'\%',i) for i in temp]
    temp = [re.sub(r'#', r'\#',i) for i in temp]
    return [re.sub(r'\s+<a name=}{', r'', i) for i in temp]
    

def formatReferences(ref,tags):
    '''
    Function that takes a list of the HTML content which is supposed to be formatted and a dict
    that contains the tags and their replacement as a key,value pair.
    
    @Param: ref: List of strings, containing the HTML content that is supposed to be formatted
            tags: dict, containing tags as keys and replacement as value
    
    @return: returns the formatted list as pure string
    '''
    
    
    
    for key,val in tags.items():
        ref = ref.replace(key,val)
    #remove additional whitespace after an opening brace
    temp = re.sub(r'\\textit{\s+', r'\\textit{', ref) 
    
    #remove additional whitespace before a closing brace
    temp = re.sub(r'\s+}',r'}', temp) 
    
    #Fix whitespace issue before comma
    temp = re.sub(r'\s+,', r',',temp)  
    
    #Fix whitespace issue before comma
    temp = re.sub(r'\s+\.', r'.',temp)
    
    
    #ref = re.sub(r'[<a.+]+', r'',ref) #remove hyperlinks text at the end of the bibitem
    temp = temp.split('\\n')
    
    #Format hyperlinks
    temp = formatLinks(temp)
    temp = specialLetters(temp)
    
    
    
    return [print(i) for i in temp]


head = "\\renewcommand{\\refname}{References}\n\
    \\addcontentsline{toc}{section}{References}\
    \n\\begin{thebibliography}{99}\n\\raggedright\
    % Fixes whitespace within reference"
    
tail = "\\end{thebibliography}'"

print(head)
formatReferences(box,tags)
print(tail)

\renewcommand{\refname}{References}
    \addcontentsline{toc}{section}{References}    
\begin{thebibliography}{99}
\raggedright    % Fixes whitespace within reference
\bibitem{}
 Holtug, Nils. “Person-affecting Moralities.” In \textit{The Repugnant Conclusion}, edited by Jesper Ryberg and Torbjörn Tännsjö, 129–61. Dordrecht: Kluwer, 2004.

\bibitem{}
 Narveson, Jan. “Utilitarianism and New Generations.” \textit{Mind} 76 (1967): 62–72\href{http://www.colorado.edu/philosophy/heathwood/6100/Narveson\%20-\%20Utilitarianism\%20and\%20New\%20Generations.pdf}{.

\bibitem{}
 McMahan, Jefferson. “Problems of Population Policy.” \textit{Ethics} 92 (1981): 96–127.

\bibitem{}
 Meacham, Christopher J. G. “Person-affecting Views and Saturating Counterpart Relations.” \textit{Philosophical Studies} 158 (2012): 257–87\href{http://philpapers.org/archive/MEAPVA.pdf}{.

\bibitem{}
 Sikora, R. I. “Is it Wrong to Prevent the Existence of Future Generations?” In \textit{Obligations to Future Generations}, 

## Old Dummy Code. Ignore 

In [959]:
formatted = formatToLatex(content, t)
writeToFile('formatted.txt', formatted)

In [990]:
read = readFile('formatted.txt')
read = [re.sub(r'{\s+', r'{', i.strip(' ')) for i in read]
read = [re.sub(r'}\s+\n', r'} ',i) for i in read]
read = "\n".join([re.match(r'(.*)', line).group() for line in read])
read

'<div class=}{\n\nBased on a piece from 2006; major additions: Oct. 2013; last update: 14 Jan. 2016\n\n\\section{\\section{Summary\n}} \nIt\'s a classic debate among utilitarians: Should we care about an organism\'s happiness and suffering (hedonic wellbeing), or should we ultimately value fulfilling what it wants, whatever that may be (preferences)? In this piece, I discuss intuitions on both sides and explore a hybrid view that gives greater weight to the hedonic subsystems of brains than to other overriding subsystems. I also discuss how seeming infinite preferences against suffering could lead to a negative-leaning utilitarian perspective. While I have strong intuitions on both sides of the dispute, in the end I may side more with idealized-preference utilitarianism. But even if so, there remain many questions, such as Which entities count as agents? How should we weigh them? And how do we assess the relative strengths of their preferences? In using preference utilitarianism to res

In [991]:
test = re.sub(r'\n{4}', r'REMOVE', read)
test = re.sub(r'\n{2,3}', r'NEWLINE', test)
test = re.sub(r'\n\\href', r' \\href', test)
test = re.sub(r'\n\\textit', r' \\textit', test)
test = re.sub(r'\n\\s', r'\\s', test)
test = re.sub(r'\n', r'', test)
test = re.sub(r'NEWLINE|REMOVE}', r'\n\n', test)
test = re.sub(r'}\s+,', r'},', test)
test = re.sub(r'}\s+}', r'}}', test)
test = re.sub(r'}\s+\.', r'}.',test)
test = re.sub(r'}\s+:', r'}:',test)
test = re.sub(r'{\s+\\', r'{\\', test)
test = re.sub(r'\s+\)', r')', test)
test = re.sub(r'}\s+\?', r'}?', test)
test = re.sub(r'\(\s+\\', r'(\\', test)
test = re.sub(r'"\s+\\', r'"\\', test)
test = re.sub(r'}\s+"', r'}"', test)
test = test.replace('<!--', " ")
test = test.replace('-->', '')

#writeToFile('final.txt', test)
print(test)

<div class=}{

Based on a piece from 2006; major additions: Oct. 2013; last update: 14 Jan. 2016

\section{\section{Summary}} It's a classic debate among utilitarians: Should we care about an organism's happiness and suffering (hedonic wellbeing), or should we ultimately value fulfilling what it wants, whatever that may be (preferences)? In this piece, I discuss intuitions on both sides and explore a hybrid view that gives greater weight to the hedonic subsystems of brains than to other overriding subsystems. I also discuss how seeming infinite preferences against suffering could lead to a negative-leaning utilitarian perspective. While I have strong intuitions on both sides of the dispute, in the end I may side more with idealized-preference utilitarianism. But even if so, there remain many questions, such as Which entities count as agents? How should we weigh them? And how do we assess the relative strengths of their preferences? In using preference utilitarianism to resolve moral di

In [None]:
\footnote{I came up with the "all hands on deck" simile in 2014, and then a few months later, I discovered that Daniel Dennett proffered the same simile in \textit{Consciousness Explained}.}

\footnote{I also have sympathy with the opposite view: Yes, the older self does violate the preference of the younger self. A new "ruling coalition" in the person's brain \href{https://www.facebook.com/yudkowsky/posts/10152908949454228}{has overridden} the old one, just as strong groups of humans sometimes mercilessly destroy weak ones.}