In [1]:
from google_ngrams import google_ngram, TimeSeries

The `google_ngram` function supports different varieties of English (e.g., British, American) and allows aggregation by year or decade. Word forms (even a single word form) must be passed as a list.

The following would return counts for the word x-ray in US English by year:

In [2]:
# Yearly counts for "x-ray" in the US English corpus.
xray_year = google_ngram(
    word_forms=["x-ray"],
    variety="us",
    by="year",
)


Accessing repository. For larger ones
(e.g., ngrams containing 2 or more words).
This may take a few minutes...



In [276]:


def google_ngram(word_forms,
                 variety="eng",
                 by="decade",
                 data_dir=None):
    """Fetch Google Books Ngram frequencies for one or more forms of a word.

    Rows matching any of the word forms (case-insensitively) are read from
    the 20120701 Google Books ngram file the forms fall into, summed across
    forms per year, joined with the per-year corpus totals and normalized
    to a relative frequency per million tokens.

    Parameters
    ----------
    word_forms : list of str or str
        Forms of a single word or phrase (e.g. ``["walk", "walks"]``).
        All forms must have the same number of tokens and share the same
        leading character(s). A bare string is treated as a one-item list.
        Hyphens between alphanumerics are split into separate tokens, so
        ``"x-ray"`` counts as 3 tokens.
    variety : {"eng", "gb", "us", "fiction"}
        Which corpus to query.
    by : {"year", "decade"}
        Aggregation level of the returned frame.
    data_dir : str or path-like, optional
        Directory holding the
        ``googlebooks_<variety>_all_totalcounts_20120701.parquet`` files.
        Defaults to the machine-specific directory this notebook was
        developed against; pass it explicitly on any other machine.

    Returns
    -------
    polars.DataFrame
        Columns ``Year`` (renamed ``Decade`` when ``by="decade"``),
        ``Token`` (the queried forms), ``AF`` (summed absolute frequency),
        ``Total`` (corpus size) and ``RF`` (``AF / Total * 1e6``).

    Raises
    ------
    ValueError
        On an unknown ``variety``/``by``, empty or blank word forms, forms
        with differing token counts or differing leading characters, or
        more than 5 tokens.
    FileNotFoundError
        If the totals file for ``variety`` is not present in ``data_dir``.
    """
    from pathlib import Path  # local import: only needed for the totals path

    variety_types = ["eng", "gb", "us", "fiction"]
    if variety not in variety_types:
        raise ValueError(
            "Invalid variety type. Expected one of: %s" % variety_types)
    by_types = ["year", "decade"]
    if by not in by_types:
        raise ValueError(
            "Invalid by type. Expected one of: %s" % by_types)

    # A bare string would otherwise be iterated character by character.
    if isinstance(word_forms, str):
        word_forms = [word_forms]
    word_forms = list(word_forms)
    if not word_forms:
        raise ValueError("word_forms must contain at least one word form.")

    # Split intra-word hyphens into their own token. Lookarounds (rather
    # than consuming the neighbouring characters) make sure every hyphen in
    # a form like 'a-b-c' is split, not just every other one.
    word_forms = [
        re.sub(r'(?<=[a-zA-Z0-9])-(?=[a-zA-Z0-9])', ' - ', wf).strip()
        for wf in word_forms
    ]

    token_counts = {len(re.findall(r'\S+', wf)) for wf in word_forms}
    if len(token_counts) > 1:
        raise ValueError("""Check spelling.
                         Word forms should be lemmas of the same word
                         (e.g. 'teenager' and 'teenagers'
                         or 'walk', 'walks' and 'walked')
                         """)
    n_tokens = next(iter(token_counts))
    if n_tokens < 1:
        raise ValueError("Word forms must not be empty or blank.")
    if n_tokens > 5:
        raise ValueError("""Ngrams can be a maximum of 5 tokens.
                         Hyphenated words are split and include the hyphen,
                         so 'x-ray' would count as 3 tokens.
                         """)

    # The ngram files are split by leading character(s): one character for
    # 1-grams, two for longer ngrams. All forms must land in the same file.
    prefix_len = 2 if n_tokens > 1 else 1
    prefixes = {wf[:prefix_len].lower() for wf in word_forms}
    if len(prefixes) > 1:
        raise ValueError("""Check spelling.
                         Word forms should be lemmas of the same word
                         (e.g. 'teenager' and 'teenagers'
                         or 'walk', 'walks' and 'walked')
                         """)

    # Map the prefix onto the file-name suffix used by the repository.
    shard = next(iter(prefixes))
    if re.match(r'^[a-z][^a-z]', shard):
        shard = re.sub(r'[^a-z]', '_', shard)
    if re.match(r'^[0-9]', shard):
        shard = shard[:1]
    if re.match(r'^[\W]', shard):
        shard = "punctuation"
    if re.match(r'^[ßæðøłœıƒþȥəħŋªºɣđĳɔȝⅰʊʌʔɛȡɋⅱʃɇɑⅲ]', shard):
        shard = "other"
    shard = shard.encode('latin-1', 'replace').decode('latin-1')

    corpus = "eng-all" if variety == "eng" else f"eng-{variety}-all"
    repo = (
        "http://storage.googleapis.com/books/ngrams/books/"
        f"googlebooks-{corpus}-{n_tokens}gram-20120701-{shard}.gz"
    )

    # Resolve the totals file up front so a missing file fails before the
    # (potentially slow) download instead of after it.
    # NOTE(review): the default below is a machine-specific location; pass
    # data_dir on any other machine.
    # NOTE(review): the 'fiction' totals file is assumed to follow the same
    # naming pattern as eng/gb/us - confirm it ships with the data.
    default_data_dir = "/Users/davidwestbrown/Documents/GitHub/google_ngrams/google_ngrams/data"  # noqa: E501
    totals_path = (
        Path(default_data_dir if data_dir is None else data_dir)
        / f"googlebooks_{variety}_all_totalcounts_20120701.parquet"
    )
    if not totals_path.is_file():
        raise FileNotFoundError(
            f"Totals file not found: {totals_path}. "
            "Pass data_dir pointing at the directory that holds the "
            "totalcounts parquet files."
        )

    print(dedent(
        """
        Accessing repository. For larger ones
        (e.g., ngrams containing 2 or more words).
        This may take a few minutes. A progress bar should appear shortly...
        """
    ))

    # Escape regex metacharacters so each form is matched literally. The
    # escaped copies are used only for matching; the Token column keeps the
    # readable forms.
    escaped_forms = [
        re.sub(r'([.?$^(){}\[\]*+|\\])', r'\\\1', wf) for wf in word_forms
    ]
    grep_words = "|".join(f"^{wf}$" for wf in escaped_forms)

    # Read the data from the google repository and format
    df = pl.scan_csv(repo, separator='\t', has_header=False)
    filtered_df = df.filter(
        pl.col("column_1").str.contains(r"(?i)" + grep_words)
        )
    all_grams = filtered_df.collect()

    all_grams = (
        all_grams
        .rename(
            {"column_1": "Token",
             "column_2": "Year",
             "column_3": "AF"}
             )
        ).drop("column_4")

    # read totals, fix dtypes, fill missing years with zeros, and sum
    total_counts = pl.read_parquet(str(totals_path))
    total_counts = total_counts.cast({
        "Year": pl.UInt32,
        "Total": pl.UInt64,
        "Pages": pl.UInt64,
        "Volumes": pl.UInt64
        })

    total_counts = (
        total_counts
        .with_columns(
            pl.col("Year")
            .cast(pl.String).str.to_datetime("%Y")
            )
        .sort("Year")
        .upsample(time_column="Year", every="1y")
        .with_columns(
            pl.col(["Total", "Pages", "Volumes"])
            .fill_null(strategy="zero")
            )
            )
    total_counts = (
        total_counts
        .group_by_dynamic(
            "Year", every="1y"
        ).agg(pl.col("Total").sum())
    )

    # sum token totals, convert to datetime and fill in missing years
    sum_tokens = all_grams.group_by("Year", maintain_order=True).sum()
    sum_tokens = (
        sum_tokens
        .with_columns(
            pl.col("Year")
            .cast(pl.String).str.to_datetime("%Y")
            )
        .sort("Year")
        .upsample(time_column="Year", every="1y")
        .with_columns(
                pl.col("AF")
                .fill_null(strategy="zero")
                )
        ).drop("Token")
    # join with totals
    sum_tokens = sum_tokens.join(total_counts, on="Year")

    if by == "decade":
        sum_tokens = (
            sum_tokens
            .group_by_dynamic("Year", every="10y")
            .agg(pl.col(["AF", "Total"]).sum())
        )
    # normalize RF per million tokens; 0/0 yields NaN, which is reported as 0
    sum_tokens = (
        sum_tokens
        .with_columns(
            RF=pl.col("AF").truediv("Total").mul(1000000)
            )
        .with_columns(
            pl.col("RF").fill_nan(0)
            )
    )
    # label every row with the (unescaped) word forms that were queried
    sum_tokens.insert_column(1, (pl.lit(word_forms)).alias("Token"))

    if by == "decade":
        sum_tokens = sum_tokens.rename({"Year": "Decade"})

    return sum_tokens


In [279]:
# Smoke-test the notebook's google_ngram with two forms of one lemma,
# relying on the defaults (variety="eng", by="decade").
test = ["x-ray", "x-rays"]
df = google_ngram(word_forms=test)


Accessing repository. For larger ones
(e.g., ngrams containing 2 or more words).
This may take a few minutes. A progress bar should appear shortly...



In [282]:
# Display the frame returned by the google_ngram call above.
df

Decade,Token,AF,Total,RF
datetime[μs],list[str],i64,u64,f64
1710-01-01 00:00:00,"[""x - ray"", ""x - rays""]",2,4957704,0.403413
1720-01-01 00:00:00,"[""x - ray"", ""x - rays""]",0,100055757,0.0
1730-01-01 00:00:00,"[""x - ray"", ""x - rays""]",0,106672595,0.0
1740-01-01 00:00:00,"[""x - ray"", ""x - rays""]",0,114333194,0.0
1750-01-01 00:00:00,"[""x - ray"", ""x - rays""]",3,174238442,0.017218
…,…,…,…,…
1960-01-01 00:00:00,"[""x - ray"", ""x - rays""]",642661,31951305299,20.113764
1970-01-01 00:00:00,"[""x - ray"", ""x - rays""]",818220,41978793014,19.49127
1980-01-01 00:00:00,"[""x - ray"", ""x - rays""]",1101002,54316937799,20.269957
1990-01-01 00:00:00,"[""x - ray"", ""x - rays""]",1351957,82491900993,16.388966


In [91]:
# Trim surrounding whitespace from every word form.
word_forms = list(map(str.strip, word_forms))

In [92]:
# Inspect the stripped word forms.
word_forms

['quiz', 'quizzes']

In [93]:
# Distinct whitespace-delimited token counts across the word forms;
# a single value means every form has the same length.
n = list({len(re.findall(r'\S+', wf)) for wf in word_forms})

In [94]:
# Inspect the distinct token counts.
n

[1]

In [95]:
# Leading characters of each form: two for multi-token ngrams, one otherwise.
gram = [wf[:(2 if n[0] > 1 else 1)] for wf in word_forms]

In [96]:
# Lower-case the prefixes and drop duplicates.
gram = list({g.lower() for g in gram})

In [97]:
# Inspect the de-duplicated, lower-cased prefixes.
gram

['q']

In [98]:
# URL of the all-English ngram file selected by token count and prefix.
repo = (
    "http://storage.googleapis.com/books/ngrams/books/"
    f"googlebooks-eng-all-{n[0]}gram-20120701-{gram[0]}.gz"
)

In [99]:
# Inspect the assembled download URL.
repo

'http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-1gram-20120701-q.gz'

In [100]:
# Backslash-escape the listed regex metacharacters in every word form.
word_forms = [
    re.sub(r'(\.|\?|\$|\^|\)|\(|\}|\{|\]|\[|\*)', r'\\\1', wf)
    for wf in word_forms
]
# One anchored alternative per form, so only whole-token matches count.
grep_words = "|".join(f"^{wf}$" for wf in word_forms)

In [101]:
# Lazily scan the tab-separated, header-less ngram file at `repo`.
# NOTE(review): this rebinds `df`, which an earlier cell used for the
# google_ngram result.
df = pl.scan_csv(repo, separator='\t', has_header=False)

In [102]:
# Keep only rows whose first column matches one of the word forms,
# ignoring case.
filtered_df = df.filter(
    pl.col("column_1").str.contains(r"(?i)" + grep_words)
)

In [205]:
# Execute the lazy query and materialize the matching rows.
all_grams = filtered_df.collect()

In [206]:
# Give the first three columns meaningful names and discard the fourth.
all_grams = (
    all_grams
    .rename({"column_1": "Token", "column_2": "Year", "column_3": "AF"})
    .drop("column_4")
)

In [229]:
# Disabled experiment, kept for reference: lower-casing Token before summing.
# all_grams = all_grams.with_columns(pl.col("Token").str.to_lowercase())
# Sum the columns within each Year, keeping years in first-seen order.
sum_tokens = all_grams.group_by("Year", maintain_order=True).sum()

In [230]:
# Turn Year into a datetime, add a row for every missing calendar year,
# and report those gap years as zero occurrences.
sum_tokens = (
    sum_tokens
    .with_columns(
        # Year -> string -> datetime parsed with the "%Y" format
        pl.col("Year")
        .cast(pl.String).str.to_datetime("%Y")
        )
    # upsample is applied to the frame sorted by the time column
    .sort("Year")
    .upsample(time_column="Year", every="1y")
    .with_columns(
        # years introduced by the upsample have null AF; make them 0
        pl.col("AF")
        .fill_null(strategy="zero")
        )
# Token is not needed once the forms have been summed together
).drop("Token")

In [231]:
# Inspect the per-year token counts after gap filling.
sum_tokens

Year,AF
datetime[μs],i64
1515-01-01 00:00:00,1
1516-01-01 00:00:00,0
1517-01-01 00:00:00,0
1518-01-01 00:00:00,0
1519-01-01 00:00:00,0
…,…
2004-01-01 00:00:00,63106
2005-01-01 00:00:00,60038
2006-01-01 00:00:00,57231
2007-01-01 00:00:00,58844


In [216]:
# Load the corpus totals from a local parquet file (judging by the file
# name, the all-English totals).
# NOTE(review): absolute, machine-specific path - this cell fails on any
# other machine; consider a configurable data directory.
total_counts = pl.read_parquet("/Users/davidwestbrown/Documents/GitHub/google_ngrams/google_ngrams/data/googlebooks_eng_all_totalcounts_20120701.parquet")

In [217]:
# Pin the column dtypes before any date handling.
total_counts = total_counts.cast({
    "Year": pl.UInt32,
    "Total": pl.UInt64,
    "Pages": pl.UInt64,
    "Volumes": pl.UInt64,
})
# Year -> datetime, then one row per calendar year with zero-filled counts.
total_counts = (
    total_counts
    .with_columns(pl.col("Year").cast(pl.String).str.to_datetime("%Y"))
    .sort("Year")
    .upsample(time_column="Year", every="1y")
    .with_columns(
        pl.col(["Total", "Pages", "Volumes"]).fill_null(strategy="zero")
    )
)

In [218]:
# Collapse to one row per year, keeping only the summed Total.
# NOTE(review): the saved output under this cell still lists Pages and
# Volumes, so it appears to predate this aggregation.
total_counts = (
    total_counts
    .group_by_dynamic("Year", every="1y")
    .agg(pl.col("Total").sum())
)

Year,Total,Pages,Volumes
datetime[μs],u64,u64,u64
1505-01-01 00:00:00,32059,231,1
1506-01-01 00:00:00,0,0,0
1507-01-01 00:00:00,49586,477,1
1508-01-01 00:00:00,0,0,0
1509-01-01 00:00:00,0,0,0
…,…,…,…
2004-01-01 00:00:00,14705541576,73346714,139616
2005-01-01 00:00:00,14425183957,72756812,138132
2006-01-01 00:00:00,15310495914,77883896,148342
2007-01-01 00:00:00,16206118071,82969746,155472


In [220]:
# Inspect the yearly corpus totals.
total_counts

Year,Total
datetime[μs],u64
1505-01-01 00:00:00,32059
1506-01-01 00:00:00,0
1507-01-01 00:00:00,49586
1508-01-01 00:00:00,0
1509-01-01 00:00:00,0
…,…
2004-01-01 00:00:00,14705541576
2005-01-01 00:00:00,14425183957
2006-01-01 00:00:00,15310495914
2007-01-01 00:00:00,16206118071


In [232]:
# Attach each year's corpus Total to the token counts, matching rows on Year.
sum_tokens = sum_tokens.join(total_counts, on="Year")

In [234]:
# Inspect the joined token counts and corpus totals.
sum_tokens

Year,AF,Total
datetime[μs],i64,u64
1515-01-01 00:00:00,1,289011
1516-01-01 00:00:00,0,0
1517-01-01 00:00:00,0,0
1518-01-01 00:00:00,0,0
1519-01-01 00:00:00,0,0
…,…,…
2004-01-01 00:00:00,63106,14705541576
2005-01-01 00:00:00,60038,14425183957
2006-01-01 00:00:00,57231,15310495914
2007-01-01 00:00:00,58844,16206118071


In [None]:
# Roll the yearly rows up into 10-year windows, summing AF and Total.
sum_tokens = (
    sum_tokens
    .group_by_dynamic("Year", every="10y")
    .agg(pl.col(["AF", "Total"]).sum())
)

In [240]:
# Relative frequency per million tokens; NaN (from 0/0) is replaced by 0.
sum_tokens = (
    sum_tokens
    .with_columns(RF=pl.col("AF").truediv("Total").mul(1000000))
    .with_columns(pl.col("RF").fill_nan(0))
)

# Label every row with the queried word forms; as the cell's last
# expression, the resulting frame is also displayed.
sum_tokens.insert_column(1, pl.lit(word_forms).alias("Token"))

Year,Token,AF,Total,RF
datetime[μs],list[str],i64,u64,f64
1510-01-01 00:00:00,"[""quiz"", ""quizzes""]",1,289011,3.460076
1520-01-01 00:00:00,"[""quiz"", ""quizzes""]",0,346894,0.0
1530-01-01 00:00:00,"[""quiz"", ""quizzes""]",0,0,0.0
1540-01-01 00:00:00,"[""quiz"", ""quizzes""]",0,5272,0.0
1550-01-01 00:00:00,"[""quiz"", ""quizzes""]",0,0,0.0
…,…,…,…,…
1960-01-01 00:00:00,"[""quiz"", ""quizzes""]",40030,31951305299,1.252844
1970-01-01 00:00:00,"[""quiz"", ""quizzes""]",60364,41978793014,1.437964
1980-01-01 00:00:00,"[""quiz"", ""quizzes""]",111125,54316937799,2.045863
1990-01-01 00:00:00,"[""quiz"", ""quizzes""]",253610,82491900993,3.074362


In [241]:
# Inspect the final frame with Token and RF columns.
sum_tokens

Year,Token,AF,Total,RF
datetime[μs],list[str],i64,u64,f64
1510-01-01 00:00:00,"[""quiz"", ""quizzes""]",1,289011,3.460076
1520-01-01 00:00:00,"[""quiz"", ""quizzes""]",0,346894,0.0
1530-01-01 00:00:00,"[""quiz"", ""quizzes""]",0,0,0.0
1540-01-01 00:00:00,"[""quiz"", ""quizzes""]",0,5272,0.0
1550-01-01 00:00:00,"[""quiz"", ""quizzes""]",0,0,0.0
…,…,…,…,…
1960-01-01 00:00:00,"[""quiz"", ""quizzes""]",40030,31951305299,1.252844
1970-01-01 00:00:00,"[""quiz"", ""quizzes""]",60364,41978793014,1.437964
1980-01-01 00:00:00,"[""quiz"", ""quizzes""]",111125,54316937799,2.045863
1990-01-01 00:00:00,"[""quiz"", ""quizzes""]",253610,82491900993,3.074362


In [140]:
# Derive a Decade label by replacing the last digit of the Year string
# with 0.
total_counts = total_counts.with_columns(
    pl.col("Year")
    .cast(pl.String)
    .str.replace(r'\d$', '0')
    .alias("Decade")
)

In [143]:
# Sum the totals within each Decade label and drop the summed Year column.
(
    total_counts
    .group_by("Decade", maintain_order=True)
    .sum()
    .drop("Year")
)

Decade,Total,Pages,Volumes
str,u64,u64,u64
"""1500""",81645,708,2
"""1510""",289011,2197,1
"""1520""",346894,1606,4
"""1540""",5272,59,1
"""1560""",437693,2442,4
…,…,…,…
"""1960""",31951305299,165417309,324447
"""1970""",41978793014,216540849,438891
"""1980""",54316937799,271632048,535854
"""1990""",82491900993,406725509,781581
