In [1]:
import torch
import os
from torch.utils.data.datapipes.iter import LoadFilesFromDisk, Zip

from torchdata.datapipes.iter import (
    HttpReader,
    IterableWrapper,
    SampleMultiplexer,
)

# Directory handed to CC100() as `root`; the caching branch of CC100 joins its
# on-disk file paths under this directory. `~` is expanded to the user's home.
ROOT_DIR = os.path.expanduser('~/.torchdata/CC100')

In [2]:
# CC100 support (http://data.statmt.org/cc-100/)

# Download URL template; the single %s placeholder receives a language code.
URL = "http://data.statmt.org/cc-100/%s.txt.xz"

# Language codes accepted by CC100(), one row per leading letter.
VALID_CODES = [
    "am", "ar", "as", "az",
    "be", "bg", "bn", "bn_rom", "br", "bs",
    "ca", "cs", "cy",
    "da", "de",
    "el", "en", "eo", "es", "et", "eu",
    "fa", "ff", "fi", "fr", "fy",
    "ga", "gd", "gl", "gn", "gu",
    "ha", "he", "hi", "hi_rom", "hr", "ht", "hu", "hy",
    "id", "ig", "is", "it",
    "ja", "jv",
    "ka", "kk", "km", "kn", "ko", "ku", "ky",
    "la", "lg", "li", "ln", "lo", "lt", "lv",
    "mg", "mk", "ml", "mn", "mr", "ms", "my", "my_zaw",
    "ne", "nl", "no", "ns",
    "om", "or",
    "pa", "pl", "ps", "pt",
    "qu",
    "rm", "ro", "ru",
    "sa", "si", "sc", "sd", "sk", "sl", "so", "sq", "sr", "ss", "su", "sv", "sw",
    "ta", "ta_rom", "te", "te_rom", "th", "tl", "tn", "tr",
    "ug", "uk", "ur", "ur_rom", "uz",
    "vi",
    "wo",
    "xh",
    "yi", "yo",
    "zh-Hans", "zh-Hant", "zu",
]

def _strip_xz_suffix(path):
    """Return ``path`` without a trailing ``.xz`` extension, if it has one."""
    # str.rstrip(".xz") would strip any run of the characters '.', 'x', 'z'
    # from the right, not the literal suffix, so compare the extension instead.
    stem, ext = os.path.splitext(path)
    return stem if ext == ".xz" else path


def CC100(root, language_code, use_caching=True):
    """Build a datapipe over the text lines of one CC100 language file.

    Args:
        root: Directory under which the caching branch places its files
            (the downloaded ``.xz`` file and its decompressed counterpart).
            Not used when ``use_caching`` is False.
        language_code: Must be a member of ``VALID_CODES``; it is substituted
            into ``URL`` to form the download address.
        use_caching: If True, chain two ``on_disk_cache`` stages (download,
            then xz decompression). If False, stream through
            ``HttpReader([url]).read_from_xz()`` with nothing written under
            ``root`` by this function.

    Returns:
        A datapipe yielding ``(language_code, line)`` tuples, where ``line`` is
        the ``.decode()``-d second element of each ``readlines()`` item.

    Raises:
        ValueError: If ``language_code`` is not in ``VALID_CODES``.
    """
    if language_code not in VALID_CODES:
        raise ValueError(f"Invalid language code {language_code}")
    url = URL % language_code
    if use_caching:
        # Stage 1: fetch the compressed file; the cache path is the URL's
        # basename placed under `root`.
        # NOTE(review): assumes on_disk_cache persists op_map's output at
        # filepath_fn's path -- confirm against the installed torchdata version.
        cache_compressed_dp = IterableWrapper([url]).on_disk_cache(
            HttpReader,
            op_map=lambda x: (x[0], x[1].read()),
            filepath_fn=lambda x: os.path.join(root, os.path.basename(x)))

        # Stage 2: keep only the first element of each stage-1 item and
        # decompress; the cache path is the same basename minus ".xz".
        cache_decompressed_dp = cache_compressed_dp.map(lambda x: x[0]).on_disk_cache(
            lambda x: LoadFilesFromDisk(x).read_from_xz(),
            op_map=lambda x: (x[0], x[1].read()),
            filepath_fn=lambda x: os.path.join(root, _strip_xz_suffix(os.path.basename(x))))

        data_dp = cache_decompressed_dp
    else:
        data_dp = HttpReader([url]).read_from_xz()
    # Swap the first element of each item for the language code and decode the
    # raw line in a single pass.
    return data_dp.readlines().map(lambda x: (language_code, x[1].decode()))


In [3]:
# Stream a handful of records straight from the multi-gigabyte compressed file;
# use_caching=False means nothing is downloaded in full first.
# This executes very fast
import time

start_time = time.time()
for index, sample in enumerate(CC100(ROOT_DIR, 'en', use_caching=False)):
    print(sample)
    # Stop once the record at index 6 has been printed.
    if index >= 6:
        break
elapsed = time.time() - start_time
print(f"Execution time {elapsed:.2f} secs")



('en', 'Belmont Estate is on the market for $63 million and boasts roughly 22,000 square feet of luxurious finishes and elaborate architecture on 1.28 acres. Listed on Thursday, the home is being sold by high-end real estate firm Sotheby’s International Realty Canada.')
('en', '“Within the city we’ve had homes that have sold for $56 million, $33 million, $31 million but this will be the record of the offering price,” listing agent Christa Frosch of Sotheby’s tells BuzzBuzzNews.')
('en', 'The three-storey home has five bedrooms, twelve bathrooms and an elevator in the west wing. Built to entertain, two main gallery halls can seat up to 100 guests. The Italian-inspired kitchen includes a fireplace and walls and ceilings throughout the home feature murals and artwork. Lavish amenities include an indoor pool and sauna, a six-car garage and a private entrance in-law’s suite.')
('en', 'Surrounding the property is a Versailles-inspired garden with a variety of trees, plants and an orchard. In

In [4]:
# cache
# This cell is very slow to run the first time as it downloads a dataset from a very slow server
# Displays only the first (language_code, text) record of the 'ha' pipeline,
# built with use_caching=True.
next(iter(CC100(ROOT_DIR, 'ha', use_caching=True)))

('ha',
 'Dangantaka tsakanin kasashen Masar da Turkiya ta yi tsami a cikin yan kwanakin nan, saboda sanin iyakokin da kowanne daga cikin yake mallaka a tekun Mediterranean .')

In [5]:
# cache
# This cell is very slow to run the first time as it downloads a dataset from a very slow server
# Displays only the first (language_code, text) record of the 'yi' pipeline,
# built with use_caching=True.
next(iter(CC100(ROOT_DIR, 'yi', use_caching=True)))

('yi', 'קאַטעגאָריע:cs-- – װיקיװערטערבוך')

In [6]:
import itertools

# Two of the three datasets go through the on-disk cache. The backend
# rate-limits connections to 1 per ip, so at most one dataset can be read
# without caching at a time.

# With "run all" this cell may fail because HTTP connections opened by earlier
# cells might still be alive.

z1 = CC100(ROOT_DIR, 'li', use_caching=False).cycle()
z2 = CC100(ROOT_DIR, 'ha', use_caching=True).cycle()
z3 = CC100(ROOT_DIR, 'yi', use_caching=True).cycle()

# Sampling weight for each (endlessly cycled) source.
weights = {z1: 0.7, z2: 0.2, z3: 0.1}
z = SampleMultiplexer(weights)

# Draw the first 500000 samples and show the first 20.
l = list(itertools.islice(z, 500000))
print(l[:20])

# Fraction of drawn samples tagged 'li', to compare against its 0.7 weight.
li_count = len([lang for lang, _ in l if lang == 'li'])
ratio = li_count / len(l)
print(f"Expected ratio: 0.7, actual {ratio}")


[('li', "Kop van 't Ende - Wikipedia"), ('li', ''), ('li', "Coos is 'n in 1853 gestiech graofsjap in Oregon, VS. Coos is verneump nao de Cook-koo-oose, 'n inheims Amerikaans stam, die allewijl neet mie besteit. De hoofplaots vaan 't graofsjap is Coquille."), ('ha', 'Dangantaka tsakanin kasashen Masar da Turkiya ta yi tsami a cikin yan kwanakin nan, saboda sanin iyakokin da kowanne daga cikin yake mallaka a tekun Mediterranean .'), ('yi', 'קאַטעגאָריע:cs-- – װיקיװערטערבוך'), ('li', "'t Graofsjap heet 'n totaal oppervlak vaan 4.678 km² boevaan 4.145 km² land is en 533 km² water."), ('ha', "Kamfanin dillancin labaran IRNA na kasar Iran ya nakalto Ahmad Abu-Zaid kakakin ma'aikatar harkokin wajen kasar Masar yarjejeniyar da kasar Masar ta cimma da kasar Cyprus kan iyakokin da kowanne daga cikinsu yake mallaka daga gabacin tekun Mediterranean ta zama doka ce, kuma duk wanda yayi kokarin taka ta Masar zata kalubalance shi."), ('ha', 'Abu-Zaid ya kara da cewa yarjejeniyar rabon kan iyaka a cik

In [8]:
# Displays the first item produced after chaining lines_to_paragraphs() onto
# the uncached 'ha' pipeline.
# NOTE(review): lines_to_paragraphs is not defined or registered in any cell
# visible here, and the execution counter skips In [7] -- presumably it was
# defined in a cell that is missing; confirm this survives Restart & Run All.
next(iter(CC100(ROOT_DIR, 'ha', use_caching=False).lines_to_paragraphs()))

('ha',
 "Dangantaka tsakanin kasashen Masar da Turkiya ta yi tsami a cikin yan kwanakin nan, saboda sanin iyakokin da kowanne daga cikin yake mallaka a tekun Mediterranean .\nKamfanin dillancin labaran IRNA na kasar Iran ya nakalto Ahmad Abu-Zaid kakakin ma'aikatar harkokin wajen kasar Masar yarjejeniyar da kasar Masar ta cimma da kasar Cyprus kan iyakokin da kowanne daga cikinsu yake mallaka daga gabacin tekun Mediterranean ta zama doka ce, kuma duk wanda yayi kokarin taka ta Masar zata kalubalance shi.\nAbu-Zaid ya kara da cewa yarjejeniyar rabon kan iyaka a cikin tekun Mediterranean , yarjejjeniya ce ta kasa da kasa wacce Majalisar dinkin duniya ta amince da ita.\nAmma ministan harkokin wajen kasar Turkiya Maulud Chavis-Uglu, a ranar litinin da ta gabata ce ya bada sanarwan cewa kasar Turkiya ba ta amince da yarjejeniyar da kasashen Masar ta Cyprus suka cimma kan rabon kan iyaka da kuma amfani da tekun Mediterranean a shekara ta 2013 ba.\nWani Sabon Sabani Ya Kunno kai Tsakanin Kasa