In [107]:
import warnings
warnings.filterwarnings("ignore")
import os
import requests
from IPython.display import Markdown, display

# Meta language detection model 
import fasttext
from huggingface_hub import hf_hub_download


# Transformers datasets module
from datasets import load_dataset
from datasets import Dataset
from datasets import concatenate_datasets

import torch
if torch.backends.mps.is_available():
    print(f"GPU: True")

GPU: True


# 1. Sourcing dataset for pre-training

In this notebook, we will be using two ways to source dataset for training:

1. Downloading an existing dataset from huggingface datasets.
2. Create a dataset of python scripts sourced from github. 


In both the cases, the result will be a Hugging Face dataset object.

In [108]:
# The Red Pajama dataset is of about 1.2 Trillion Tokens we are not going to load the entire dataset into memory 
# So, we will be loading the dataset in streaming mode (doesn't download the full data immediately)

pretraining_dataset = load_dataset("math-ai/AutoMathText", 
                                    streaming = True, 
                                    split= "train")

# Taking first 100k samples from the dataset
pretraining_dataset = list(pretraining_dataset.take(1000))

# Sample dataset
pretraining_dataset = Dataset.from_list(pretraining_dataset)

Downloading readme: 0.00B [00:00, ?B/s]

In [109]:
print("Columns in pretraining dataset:", pretraining_dataset.column_names)

# We only need the text column
pretraining_dataset = pretraining_dataset.select_columns(["text"])

Columns in pretraining dataset: ['url', 'text', 'date', 'meta']


In [110]:
pretraining_dataset[0:5]

{'text': ['Commutative Property Of Addition 2. If A is an n×m matrix and O is a m×k zero-matrix, then we have: AO = O Note that AO is the n×k zero-matrix. Matrix Matrix Multiplication 11:09. We have 1. To understand the properties of transpose matrix, we will take two matrices A and B which have equal order. The identity matrix is a square matrix that has 1’s along the main diagonal and 0’s for all other entries. In a triangular matrix, the determinant is equal to the product of the diagonal elements. This matrix is often written simply as $$I$$, and is special in that it acts like 1 in matrix multiplication. Is the Inverse Property of Matrix Addition similar to the Inverse Property of Addition? The identity matrices (which are the square matrices whose entries are zero outside of the main diagonal and 1 on the main diagonal) are identity elements of the matrix product. Learning Objectives. In fact, this tutorial uses the Inverse Property of Addition and shows how it can be expanded to

## 1.1. Scraping the Math Python code from the web

In [111]:
# Path to directory to store the python scripts
code_dir = "./scripts_data"

In [112]:
urls = ["https://github.com/TheAlgorithms/Python/blob/master/boolean_algebra/and_gate.py",
"https://github.com/TheAlgorithms/Python/blob/master/boolean_algebra/imply_gate.py",
"https://github.com/TheAlgorithms/Python/blob/master/boolean_algebra/karnaugh_map_simplification.py",
"https://github.com/TheAlgorithms/Python/blob/master/boolean_algebra/multiplexer.py",
"https://github.com/TheAlgorithms/Python/blob/master/boolean_algebra/nand_gate.py", 
"https://github.com/TheAlgorithms/Python/blob/master/boolean_algebra/nor_gate.py", 
"https://github.com/TheAlgorithms/Python/blob/master/boolean_algebra/not_gate.py", 
"https://github.com/TheAlgorithms/Python/blob/master/boolean_algebra/or_gate.py", 
"https://github.com/TheAlgorithms/Python/blob/master/boolean_algebra/quine_mc_cluskey.py", 
"https://github.com/TheAlgorithms/Python/blob/master/boolean_algebra/xnor_gate.py", 
]

# Retrieve the python scripts from the urls
def retrieve_python_scripts(urls):
    scripts = []
    for url in urls:
        response = requests.get(url)
        filename = os.path.basename(url)
        file_path = os.path.join(code_dir, filename)

        with open(file_path, "wb") as file:
            file.write(response.content)
    
    return True

retrieve_python_scripts(urls)

True

In [113]:
# Lets concatenate the scripts from the list
code_dataset = []
for file in os.listdir(code_dir):
    code_dataset.append(
        {"text": open(os.path.join(code_dir, file), 'r').read()}
    )

#Creating a huggingface dataset object
code_dataset = Dataset.from_list(code_dataset) 
print(code_dataset)

Dataset({
    features: ['text'],
    num_rows: 10
})


## 1.2. Combine the python code dataset with the pretraining dataset


In [114]:
dataset = concatenate_datasets(
    [pretraining_dataset, code_dataset]
)

print(dataset)
# We can see the number of records are 1010

Dataset({
    features: ['text'],
    num_rows: 1010
})


# 2. Data Cleaning

1. Filter out samples that are too short
2. Remove repetitions within a single text example
3. Remove duplicated documents
4. Quality filter to remove non-English texts

In [115]:
import heapq

def paragraph_length_filter(x):
    """Returns False iff a page has too few lines or lines are too short."""
    lines = x['text'].split('\n')
    if (
        len(lines) < 3
        or min(heapq.nlargest(3, [len(line) for line in lines])) < 3
    ):
        return False
    return True

In [116]:
# We have filtered out the documents that are too short or have few lines
dataset = pretraining_dataset.filter(
    paragraph_length_filter, 
    load_from_cache_file = False
)

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [117]:
dataset

Dataset({
    features: ['text'],
    num_rows: 891
})

In [118]:
display(Markdown(dataset[0]["text"]))

# Comparing the magnitudes of expressions of surds

I recently tackled some questions on maths-challenge / maths-aptitude papers where the task was to order various expressions made up of surds (without a calculator, obviously).

I found myself wondering whether I was relying too much on knowing the numerical value of some common surds, when a more robust method was available (and would work in more difficult cases).

For example, one question asked which is the largest of:

(a) $\sqrt{10}$
(b) $\sqrt2+\sqrt3$
(c) $5-\sqrt3$

In this case, I relied on my knowledge that $\sqrt{10} \approx 3.16$ and $\sqrt2\approx 1.41$ and $\sqrt3 \approx 1.73$ to find (a) $\approx 3.16$, (b) $\approx ~3.14$ and (c) $\approx ~3.27$ so that the required answer is (c).

But this seemed inelegant: I felt there might be some way to manipulate the surd expressions to make the ordering more explicit. I can't see what that might be, however (squaring all the expressions didn't really help).

I'd appreciate some views: am I missing a trick, or was this particular question simply testing knowledge of some common values?

EDIT: after the very helpful answers, which certainly showed that there was a much satisfying and general way of approaching the original question, can I also ask about another version of the question which included (d) $\sqrt[4]{101}$.

When approaching the question by approximation, I simply observed that $\sqrt[4]{101}$ is only a tiny bit greater than $\sqrt{10}$, and hence it still was clear to choose (c) as the answer. Is there any elegant way to extend the more robust methods to handle this case?

• +1 for providing context (your first two sentences), something that nearly all questions at this level fail to do, and for providing a nice explanation of your concern. Incidentally, for math aptitude and other tests, it has always been my understanding that the questions are NOT testing whether you know the approximations, but whether you can perform the type of analysis in the answer by @Lord Shark the Unknown. Of course, unless the question writer puts some effort behind writing such questions, such questions can often be solved by your method. Sep 19 '18 at 10:41
• Thank you for all the comments and answers. I am pleased I chose to ask the question at MSE -- there was indeed something to learn here! Sep 20 '18 at 10:05

Comparing $$\sqrt{10}$$ and $$\sqrt2+\sqrt3$$ is the same as comparing $$10$$ and $$(\sqrt2+\sqrt3)^2=5+2\sqrt6$$. That's the same as comparing $$5$$ and $$2\sqrt6$$. Which of these is bigger?

Likewise comparing $$\sqrt{10}$$ and $$5-\sqrt3$$ is the same as comparing $$10$$ and $$(5-\sqrt3)^2=28-10\sqrt3$$. That's the same as comparing $$10\sqrt3$$ and $$18$$.

Which of these is bigger?

• Ah .... yes of course ... $5=\sqrt{25}>\sqrt{24}=2\sqrt6$ Sep 19 '18 at 10:30
• Thank you for the hint! Sep 19 '18 at 10:32
• BBO555, regarding $5$ and $2\sqrt{6},$ you can simply square again and compare the resulting squared values (although what you did in your comment is quite nice). This relies on the property that, when $a$ and $b$ are positive (or even when they are nonnegative), then we have: $a < b$ if and only if $a^2 < b^2$ (this can be "seen" by considering the graph of $y = x^2$ for $x\geq0).$ Incidentally, the analogous result for cubing also is true and the result for cubing doesn't require the numbers to be positive (consider the graph of $y = x^3).$ Sep 19 '18 at 10:47

You can use:

(1) the fact that $f(x)=x^2$ is a monotonically increasing function when $x\geq0$ and

(2) the arithmetic-geometric mean inequality $\sqrt{ab}\leq\frac{a+b}{2}$, when $a, b\geq0$. Hence, $$(\sqrt{2}+\sqrt{3})^2=5+2\sqrt{2\cdot3}\leq5+2\frac{2+3}{2}=5+5=10=(\sqrt{10})^2$$ Therefore, using (1), we obtain $\sqrt{2}+\sqrt{3}\leq 10$. I forgot about this: $$5-\sqrt{3}=3+2-\sqrt{3}=3+\frac{1}{2+\sqrt{3}}\geq3+\frac{1}{2+2}=3.25$$ One can easily verify that $(3+1/4)^2>10.5>10$. One also finds that $10.5^2>110>101$.

Then, performing argument (1) twice, one finds that $5-\sqrt{3}>(101)^{1/4}$.

## Consequently, $5-\sqrt{3}$ is the bigger number.

• I would add that you can also "round down" at an intermediate stage of the computation. If you are trying to prove $a \gt b$ sometimes you can find an expression $c$ that is simpler than $a$ where $a \gt c$, do some more manipulation, and show $c \gt b$. The numerical estimates are useful for this because they tell you how much room you have. You might find that rough approximations work, or you might need to be quite careful. Sep 19 '18 at 14:01
• Thanks for including a route to handling case (d) ! Sep 20 '18 at 8:07

## 2.2 We will remove the duplicate examples from the entire dataset

In [119]:
def deduplication(ds):
    
    def dedup_func(x):
        """Use this function to remove duplicate entries"""
        if x["text"] in unique_text:
            return False
        else:
            unique_text.add(x["text"])
            return True

    unique_text = set()

    ds = ds.filter(dedup_func, 
            load_from_cache_file= False, 
            num_proc = 1)
    return ds 


pretrained_dataset=deduplication(dataset) 


Filter:   0%|          | 0/891 [00:00<?, ? examples/s]

In [120]:
# No records are dropped we can state the dataset is not having any duplicates 
pretrained_dataset

Dataset({
    features: ['text'],
    num_rows: 891
})

## 2.3 Quality filter 

Here we will be removing any text examples that are in language other than English. The code here uses a language detection model called fastText

In [121]:
# load the model path 
model_path =  hf_hub_download(repo_id = "facebook/fasttext-language-identification", filename="model.bin")
model = fasttext.load_model(model_path) # load fasttext language identification model

# text out the model is working is fine or not
model.predict("Hello World!!")

(('__label__nob_Latn',), array([0.35037538]))

In [124]:
def getEnglishlanguageDetectionscore(x):
    # english language score
    language, score = model.predict(x["text"].replace("\n", ""))
    language = language[0].split("__")[2]
    
    if score[0] >= 0.40 and language == "eng_Latn":
        return True
    else:
        return False

pretraine_dataset = pretrained_dataset.filter(
    getEnglishlanguageDetectionscore, 
    load_from_cache_file = False
)

Filter:   0%|          | 0/891 [00:00<?, ? examples/s]

In [None]:
# Most of over text contains language as english, so we haven't excluded any information
pretrained_dataset

Dataset({
    features: ['text'],
    num_rows: 891
})

# Save the dataset

In [127]:
file_path = "./data/preprocessed_dataset.parquet"
pretrained_dataset.to_parquet(file_path)

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

5256301