# Format Spanish data to word lists
Format the Spanish data into the following formats:

* all data text spanish: "word", "type", "english", "frequency_rank"
* 2 column: word, type, definition

In [4]:
import pandas as pd
import os
import re

In [5]:
# Dataset name; several cells below build their output file names from it.
DATASET = 'spanish'
# Word list with the columns word, type, english and frequency_rank.
df = pd.read_pickle("./data/spanish.pkl")
df.head()

Unnamed: 0,word,type,english,frequency_rank
0,a,prep,"to, at",8
0,abajo,adv,"down, below, downward",788
0,abandonado,adj,abandoned,2896
0,abandonar,v,"to abandon, leave (a place)",680
0,abandono,nm,"abandonment, desertion",3463


In [22]:
# Second word list: same columns as df plus a "spanish" column
# (see the preview printed below this cell).
df2 = pd.read_pickle("./data/spanish2.pkl")
df2.head()

Unnamed: 0,word,type,english,frequency_rank,spanish
0,el,art,the dictionary also had useful phrases,1,el diccionario tena tambin frases tiles
0,de,prep,he is the son of a friend of mine,2,es el hijo de un amigo mo
0,que,conj,he says that he doesnt want to study,3,dice que no quiere estudiar
0,y,conj,they know how to read and write,4,saben leer y escribir
0,en,prep,I live on the second floor,5,vivo en el segundo piso


In [25]:
# "word": word, "type": type, "english": english, "frequency_rank": frequency_rank
def load_data():
    """Return word/type/english/frequency_rank from the global ``df``.

    The ``english`` and ``frequency_rank`` columns are relabelled
    ``English`` and ``frequency rank`` for display.
    """
    display_headers = {
        'english': 'English',
        'frequency_rank': 'frequency rank',
    }
    selected = df[["word", "type", "english", "frequency_rank"]]
    return selected.rename(columns=display_headers)

# "word": word, "type": type, "english": english, "frequency_rank": frequency_rank, "spanish": spanish
def load_data2():
    """Return word/type/spanish/english/frequency_rank from the global ``df2``.

    The ``english``, ``spanish`` and ``frequency_rank`` columns are
    relabelled ``English``, ``Spanish`` and ``frequency rank`` for display.
    """
    display_headers = {
        'english': 'English',
        'spanish': 'Spanish',
        'frequency_rank': 'frequency rank',
    }
    selected = df2[["word", "type", "spanish", "english", "frequency_rank"]]
    return selected.rename(columns=display_headers)

In [10]:
# Replaces given word in field with '_'
def replace_word_in_field_with_underscore(word, field):
    """Blank out `word` in the tokens of `field` that start with it.

    `field` is split on single spaces. Every token that begins with `word`
    has all occurrences of `word` inside that token replaced by ``'_'``;
    other tokens are kept as they are. The tokens are then re-joined with
    single spaces.

    Args:
        word: The literal text to hide. It is matched literally, so regex
            metacharacters in it are safe.
        field: The sentence to process.

    Returns:
        The processed sentence. An empty `word` returns `field` unchanged.
    """
    if not word:
        # str.replace('', '_') would interleave '_' between every character.
        return field

    # re.escape keeps characters such as '(', '[', '+' or '?' in `word`
    # from being interpreted as regex syntax.
    starts_with_word = re.compile(f"^{re.escape(word)}.*?$")

    def _replace(token):
        if not starts_with_word.match(token):
            return token
        return token.replace(word, '_')

    return ' '.join(_replace(token) for token in field.split(' '))

## HTML+PDF all columns alphabetical

In [9]:
# Complete to HTML
data = load_data2()
# Hide the headword inside its own example sentence.
data["Spanish"] = data.apply(lambda row: replace_word_in_field_with_underscore(row.word, row.Spanish) , axis=1)
# sort_values returns a new frame; the result must be kept, otherwise the
# "alphabetical" output stays in the order the data was loaded in.
data = data.sort_values('word') # alphabetical

style = data.style.format(
    escape="html",
    )
style = style.hide(axis='index')

html = style.to_html()
filename = DATASET + '_alphabetical2'
# Explicit UTF-8 (as in the other HTML cells) so the written bytes do not
# depend on the platform's default encoding.
with open(f'output/{filename}.html', 'w', encoding='utf-8') as f:
    f.write(html)

# Convert the HTML file to PDF with pandoc.
cmd = f'pandoc -f html -t pdf output/{filename}.html -t html5 -o output/{filename}.pdf --metadata pagetitle="{filename}" -V margin-top=2 -V margin-bottom=2 -V margin-left=2 -V margin-right=2 -c format/table.css --pdf-engine-opt=--enable-local-file-access'
os.system(cmd)

Loading pages (1/6)
Counting pages (2/6)                                               
Resolving links (4/6)                                                       
Loading headers and footers (5/6)                                           
Printing pages (6/6)
Done                                                                          


0

## HTML+PDF all columns grouped by CEFR

In [27]:
# By Ranking / pseudo-cefr
data = load_data2()
# Hide the headword inside its own example sentence.
data["Spanish"] = data.apply(
    lambda row: replace_word_in_field_with_underscore(row.word, row.Spanish),
    axis=1,
)

# One label per positional slice of 1000 rows; the last slice takes every
# row from position 4000 onwards.
cefrs = ['A1', 'A2', 'B1', 'B2', 'C1']
slice_starts = [0, 1000, 2000, 3000, 4000]
slice_stops = [1000, 2000, 3000, 4000, None]
data_by_cefr = [
    data.iloc[start:stop] for start, stop in zip(slice_starts, slice_stops)
]

# Complete to HTML: one <h2> heading plus one table per non-empty slice.
html_parts = []
for cefr, data_slice in zip(cefrs, data_by_cefr):
    if data_slice.empty:
        continue
    html_parts.append(f'<h2>{cefr}</h2>')
    # Put the level label into the header of the word column.
    labelled = data_slice.rename(columns={'word' : f'word ({cefr})'})
    styled = labelled.style.format(escape="html").hide(axis='index')
    html_parts.append(styled.to_html())
html_out = ''.join(html_parts)

filename = DATASET+'_underscore_by_cefr'
with open(f'output/{filename}.html', 'w', encoding='utf-8') as f:
    f.write(html_out)

# to pdf
cmd = f"""pandoc -f html -t pdf output/{filename}.html -t html5 -o output/{filename}.pdf --metadata pagetitle="{filename}" -V margin-top=2 -V margin-bottom=2 -V margin-left=2 -V margin-right=2 -c format/table.css --pdf-engine-opt=--enable-local-file-access --title '{filename}'"""
os.system(cmd)


Loading pages (1/6)
Counting pages (2/6)                                               
Resolving links (4/6)                                                       
Loading headers and footers (5/6)                                           
Printing pages (6/6)
Done                                                                          


0

In [None]:
#data = load_data()
#cefrs = ['A1', 'A2', 'B1', 'B2', 'C1']
#data_by_cefr = list(map(lambda c : data[data['cefr'] == c], cefrs))
#
## Complete to HTML
#html_out = ''
#for data in data_by_cefr:
#    if data.empty:
#        continue
#    cefr = data['cefr'].iloc[0]
#    html_out += f'<h2>{cefr}</h2>'
#    data = data.drop(['cefr'], axis=1)
#    print()
#    data = data.rename(columns={'word' : f'word ({cefr})'})
#
#    style = data.style.format(
#        escape="html",
#        )
#    style = style.hide(axis='index')
#    html_out += style.to_html()
#
#
#filename = DATASET+'_by_cefr'
#with open(f'output/{filename}.html', 'w', encoding='utf-8') as f:
#    f.write(html_out)
#
## to pdf
#cmd = f"""pandoc -f html -t pdf output/{filename}.html -t html5 -o output/{filename}.pdf --metadata pagetitle="{filename}" -V margin-top=2 -V margin-bottom=2 -V margin-left=2 -V margin-right=2 -c format/table.css --pdf-engine-opt=--enable-local-file-access --title '{filename}'"""
#os.system(cmd)


## HTML+PDF all columns grouped by CEFR shuffle

In [None]:
## Complete to HTML
#html_out = ''
#for data in data_by_cefr:
#    if data.empty:
#        continue
#    data = load_data()
#    cefr = data['cefr'].iloc[0]
#    html_out += f'<h2>{cefr}</h2>'
#    data = data.drop(['cefr'], axis=1)
#    print()
#    data = data.rename(columns={'word' : f'word ({cefr})'})
#    data = data.sample(frac=1)
#
#    style = data.style.format(
#        escape="html",
#        )
#    style = style.hide(axis='index')
#    html_out += style.to_html()
#
#
#filename = DATASET+'_by_cefr_shuffle'
#with open(f'output/{filename}.html', 'w', encoding='utf-8') as f:
#    f.write(html_out)
#
## to pdf
#cmd = f"""pandoc -f html -t pdf output/{filename}.html -t html5 -o output/{filename}.pdf --metadata pagetitle="{filename}" -V margin-top=2 -V margin-bottom=2 -V margin-left=2 -V margin-right=2 -c format/table.css --pdf-engine-opt=--enable-local-file-access --title '{filename}'"""
#os.system(cmd)
#

In [None]:
# Complete to HTML
# Uses `data_by_cefr` and `cefrs` as built by the "grouped by CEFR" cell.
# The previous version replaced every slice with the full load_data() frame
# and read a 'cefr' column that this dataset does not have (KeyError).
# NOTE(review): the slices come from load_data2(), so the Spanish example
# column is part of this output — confirm that is wanted here.
html_parts = []
for data, cefr in zip(data_by_cefr, cefrs):
    if data.empty:
        continue
    html_parts.append(f'<h2>{cefr}</h2>')
    data = data.rename(columns={'word' : f'word ({cefr})'})
    # Shuffle the rows inside the level; the order differs on every run.
    data = data.sample(frac=1)

    style = data.style.format(
        escape="html",
        )
    style = style.hide(axis='index')
    html_parts.append(style.to_html())
html_out = ''.join(html_parts)


filename = DATASET+'_by_cefr_shuffle'
with open(f'output/{filename}.html', 'w', encoding='utf-8') as f:
    f.write(html_out)

# to pdf
cmd = f"""pandoc -f html -t pdf output/{filename}.html -t html5 -o output/{filename}.pdf --metadata pagetitle="{filename}" -V margin-top=2 -V margin-bottom=2 -V margin-left=2 -V margin-right=2 -c format/table.css --pdf-engine-opt=--enable-local-file-access --title '{filename}'"""
os.system(cmd)


## 2 Column LaTeX word, type and definition

In [None]:
# 2 column 5000 not by rank
# shuffle and alphabetical

import re
# Fix supertabular and add \textit to type
def fix_latex_line(line):
    r"""Patch one line of the LaTeX text produced by ``Styler.to_latex``.

    - A line starting with ``\begin{supertabular}`` is replaced by that
      opener followed by the module-level ``column_format`` in braces.
    - A line starting with a backslash and containing ``{tabular}`` is
      replaced by an empty string.
    - A row line of the form ``word (type) & ...`` gets its ``(type)``
      wrapped in ``\textit{...}``.
    - Any other line is returned unchanged.

    Args:
        line: A single line of LaTeX source, without its line ending.

    Returns:
        The patched line.
    """
    if re.match(r"^\\begin{supertabular}", line):
        # Add column_format to \begin{supertabular}
        return '\\begin{supertabular}'+'{'+column_format+'}'
    if re.match(r"^\\.*{tabular}", line):
        # Remove {tabular}
        return ''
    # Italics: target the parenthetical that sits directly before the first
    # column separator, i.e. the "(type)" closing the word cell. The lazy
    # prefix keeps a later parenthetical in the definition (for example
    # "(a place)") from being picked, and [^()]+ also accepts one-letter
    # types such as "(v)". Lines without such a match come back unchanged.
    return re.sub(
        r"^(\w+\s.*?)(\([^()]+\))(\s*&)",
        r"\1\\textit{\2}\3",
        line,
        count=1,
    )
# columns = ["word", "type", "english", "frequency_rank"]
# fix_latex_line reads this as a global, so it has to stay a module-level name.
column_format = 'p{1.2in}p{2.3in}p{1.2in}p{2.3in}'


def _write_two_column_tex(frame, tex_path):
    """Render `frame` as a supertabular without index or header row, run each
    line through fix_latex_line and write the result to `tex_path` as UTF-8."""
    style = frame.style.format(
        escape="latex",
        )
    style = style.hide(axis='index')
    style = style.hide(axis='columns')
    latex = style.to_latex(
        environment='supertabular',
        column_format=column_format
    )
    latex = '\n'.join(map(fix_latex_line, latex.splitlines()))
    # Explicit UTF-8 so the written bytes do not depend on the platform's
    # default encoding.
    with open(tex_path, 'w', encoding='utf-8') as f:
        f.write(latex)


data = load_data()
# First column becomes "word (type)"; the second column is the English text.
data["word"] = data.apply(lambda row: f"{row.word.strip()} ({row.type.strip()})" , axis=1)
data = data[["word", "English"]]

# NOTE(review): the .tex files are written to build/ while latexmk is pointed
# at format/ — presumably format/ holds wrapper documents that pull in the
# build/ files; confirm.
_write_two_column_tex(data, 'build/spanish_5000_two_column_alphabetical.tex')

cmd_build = "latexmk -pdf -cd format/spanish_5000_two_column_alphabetical.tex -outdir=../output"
os.system(cmd_build)

#### Same for shuffle

data = data.sample(frac=1) # shuffle
_write_two_column_tex(data, 'build/spanish_5000_two_column_shuffle.tex')

cmd_build = "latexmk -pdf -cd format/spanish_5000_two_column_shuffle.tex -outdir=../output"
os.system(cmd_build)

In [None]:
# By Ranking / pseudo-cefr
# Rebind `data` to the frame returned by load_data() and preview its first rows.
data = load_data()
data.head()
#cefrs = [1000, 2000, 3000, 4000, 5000] # Data goes up to 5010, always above 0
#def is_frequency_in_freq_bound(freq, upper_bound):
#    lower_bound = upper_bound - 1000
#    print(freq)
#    freq_int = int(freq)
#    return freq if lower_bound <= freq_int and freq_int < upper_bound else False
#data_by_cefr = list(map(lambda f : data[is_frequency_in_freq_bound(data['frequency rank'], f)], cefrs))

# %%
#data_by_cefr[1].head()

In [None]:
import re
# Fix supertabular and add \textit to type
def fix_latex_line(line):
    r"""Patch one line of the LaTeX text produced by ``Styler.to_latex``.

    - A line starting with ``\begin{supertabular}`` is replaced by that
      opener followed by the module-level ``column_format`` in braces.
    - A line starting with a backslash and containing ``{tabular}`` is
      replaced by an empty string.
    - A row line of the form ``word (type) & ...`` gets its ``(type)``
      wrapped in ``\textit{...}``.
    - Any other line is returned unchanged.

    Args:
        line: A single line of LaTeX source, without its line ending.

    Returns:
        The patched line.
    """
    if re.match(r"^\\begin{supertabular}", line):
        # Add column_format to \begin{supertabular}
        return '\\begin{supertabular}'+'{'+column_format+'}'
    if re.match(r"^\\.*{tabular}", line):
        # Remove {tabular}
        return ''
    # Italics: target the parenthetical that sits directly before the first
    # column separator, i.e. the "(type)" closing the word cell. The lazy
    # prefix keeps a later parenthetical in the definition (for example
    # "(a place)") from being picked, and [^()]+ also accepts one-letter
    # types such as "(v)". Lines without such a match come back unchanged.
    return re.sub(
        r"^(\w+\s.*?)(\([^()]+\))(\s*&)",
        r"\1\\textit{\2}\3",
        line,
        count=1,
    )

In [None]:
# Write one two-column .tex file per level, using `data_by_cefr` and `cefrs`
# as built by the "grouped by CEFR" cell.
# The slices have the columns word, type, Spanish, English and
# frequency rank; the previous version selected "definition" and "cefr",
# which do not exist in this dataset (KeyError).
for data, cefr in zip(data_by_cefr, cefrs):
    if data.empty:
        continue
    # assign() builds a new frame, so the shared slice in data_by_cefr is left
    # untouched and no assignment into a slice takes place.
    data = data.assign(
        word=data.apply(lambda row: f"{row.word.strip()} ({row.type.strip()})" , axis=1)
    )
    data = data[["word", "English"]]

    style = data.style.format(
        escape="latex",
        )
    style = style.hide(axis='index')
    style = style.hide(axis='columns')

    # Module-level name on purpose: fix_latex_line reads it as a global.
    column_format = 'p{1.2in}p{2.3in}p{1.2in}p{2.3in}'
    latex = style.to_latex(
        environment='supertabular',
        column_format=column_format
    )

    latex_lines = latex.splitlines()

    latex = '\n'.join((map(fix_latex_line, latex_lines)))

    filename = f'{DATASET}_{cefr}'
    # Explicit UTF-8 so the written bytes do not depend on the platform's
    # default encoding.
    with open(f'build/{filename}.tex', 'w', encoding='utf-8') as f:
        f.write(latex)

## 2 Column LaTeX word, type and definition by CEFR shuffle

In [None]:
# Write one shuffled two-column .tex file per level, using `data_by_cefr` and
# `cefrs` as built by the "grouped by CEFR" cell.
# The slices have the columns word, type, Spanish, English and
# frequency rank; the previous version selected "definition" and "cefr",
# which do not exist in this dataset (KeyError).
for data, cefr in zip(data_by_cefr, cefrs):
    if data.empty:
        continue
    # assign() builds a new frame, so the shared slice in data_by_cefr is left
    # untouched and no assignment into a slice takes place.
    data = data.assign(
        word=data.apply(lambda row: f"{row.word.strip()} ({row.type.strip()})" , axis=1)
    )
    data = data[["word", "English"]]

    # Shuffle the rows inside the level; the order differs on every run.
    data = data.sample(frac = 1)

    style = data.style.format(
        escape="latex",
        )
    style = style.hide(axis='index')
    style = style.hide(axis='columns')

    # Module-level name on purpose: fix_latex_line reads it as a global.
    column_format = 'p{1.2in}p{2.3in}p{1.2in}p{2.3in}'
    latex = style.to_latex(
        environment='supertabular',
        column_format=column_format
    )

    latex_lines = latex.splitlines()
    latex = '\n'.join((map(fix_latex_line, latex_lines)))

    filename = f'{DATASET}_shuffle_{cefr}'
    # Explicit UTF-8 so the written bytes do not depend on the platform's
    # default encoding.
    with open(f'build/{filename}.tex', 'w', encoding='utf-8') as f:
        f.write(latex)

In [None]:
# Finish building the PDFs: run latexmk on every format/{DATASET}*.tex file.
latexmk_target = f"format/{DATASET}*.tex"
cmd_build = f"latexmk -pdf -cd {latexmk_target} -outdir=../output"
os.system(cmd_build)

In [None]:
# Remove build by-products from format/, the working directory and output/.
# Note that "*.html" is on the list, so the generated HTML files in output/
# are deleted as well; format/ additionally loses its "*.pdf" files.
files_to_remove = ["*.fls", "*.log", "*.html", "*.toc", "*.synctex*", "*.fdb*", "*.aux"]
cleanup_targets = {
    "format": files_to_remove + ["*.pdf"],
    ".": files_to_remove,
    "output": files_to_remove,
}
# One "rm" command per directory, run in the order listed above.
cleanup_commands = [
    "rm " + " ".join(f"{directory}/{pattern}" for pattern in patterns)
    for directory, patterns in cleanup_targets.items()
]
os.system(";".join(cleanup_commands))