## Python vs pandas vs pyspark

### single file

In [5]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F


spark = SparkSession.builder.appName(
    "Counting word occurences from a book."
).getOrCreate()

spark.sparkContext.setLogLevel("WARN")

In [15]:
single_file = "../../data/gutenberg_books/1342-0.txt"

In [16]:
def count_words_with_pyspark():
    # If you need to read multiple text files, replace `1342-0` by `*`.
    results = (
        spark.read.text(single_file)
        .select(F.split(F.col("value"), " ").alias("line"))
        .select(F.explode(F.col("line")).alias("word"))
        .select(F.lower(F.col("word")).alias("word"))
        .select(F.regexp_extract(F.col("word"), "[a-z']*", 0).alias("word"))
        .where(F.col("word") != "")
        .groupby(F.col("word"))
        .count()
    )

    return results.orderBy("count", ascending=False)


In [17]:
results = count_words_with_pyspark()
results.show()

+----+-----+
|word|count|
+----+-----+
| the| 4480|
|  to| 4218|
|  of| 3711|
| and| 3504|
| her| 2199|
|   a| 1982|
|  in| 1909|
| was| 1838|
|   i| 1749|
| she| 1668|
|that| 1487|
|  it| 1482|
| not| 1427|
| you| 1300|
|  he| 1296|
|  be| 1257|
| his| 1247|
|  as| 1174|
| had| 1170|
|with| 1092|
+----+-----+
only showing top 20 rows



In [18]:
%%timeit 

results = count_words_with_pyspark()
results.collect()

536 ms ± 9.71 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [43]:
from collections import Counter
import re

def count_words_with_python():
    word_counter = Counter()
    with open(single_file, "r", encoding="utf-8") as fh:
        for line in fh:
            words = [re.search("([a-z]*)", w.lower()).group(1) for w in line.split()]
            words = [w for w in words if w]
            word_counter.update(Counter(words))
    
    return word_counter
    
    

In [46]:
results = count_words_with_python()
results.most_common(20)

[('the', 4479),
 ('to', 4218),
 ('of', 3711),
 ('and', 3504),
 ('her', 2199),
 ('a', 1982),
 ('in', 1909),
 ('was', 1838),
 ('i', 1750),
 ('she', 1668),
 ('that', 1487),
 ('it', 1482),
 ('not', 1427),
 ('you', 1301),
 ('he', 1296),
 ('be', 1257),
 ('his', 1247),
 ('as', 1174),
 ('had', 1170),
 ('with', 1092)]

In [47]:
%%timeit

results = count_words_with_python()
results.most_common(20)

180 ms ± 3.34 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [161]:
import pandas as pd

def count_words_with_pandas():
    df = pd.read_csv(single_file, sep=r"\n", header=None, engine="python")
    s = df.iloc[:,0].str.split().explode().str.lower()
    df = s.str.extract("([a-z]*)").reset_index(drop=True)
    df = df[(df != "")].dropna()
    return df

In [162]:
df = count_words_with_pandas()
df.value_counts()[:20]

the     4480
to      4218
of      3711
and     3504
her     2199
a       1982
in      1909
was     1838
i       1750
she     1668
that    1487
it      1482
not     1427
you     1301
he      1296
be      1257
his     1247
as      1174
had     1170
with    1092
dtype: int64

In [163]:
%%timeit

df = count_words_with_pandas()
df.value_counts()[:20]

200 ms ± 19.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### multiple files

In [147]:
dir = "../../data/gutenberg_books/"

In [148]:
def count_words_with_pyspark():
    # If you need to read multiple text files, replace `1342-0` by `*`.
    results = (
        spark.read.text(dir)
        .select(F.split(F.col("value"), " ").alias("line"))
        .select(F.explode(F.col("line")).alias("word"))
        .select(F.lower(F.col("word")).alias("word"))
        .select(F.regexp_extract(F.col("word"), "[a-z']*", 0).alias("word"))
        .where(F.col("word") != "")
        .groupby(F.col("word"))
        .count()
    )

    return results.orderBy("count", ascending=False)


In [149]:
results = count_words_with_pyspark()
results.show()

+----+-----+
|word|count|
+----+-----+
| the|38895|
| and|23919|
|  of|21199|
|  to|20526|
|   a|14464|
|   i|13973|
|  in|12777|
|that| 9623|
|  it| 9099|
| was| 8920|
| her| 7923|
|  my| 7385|
| his| 6642|
|with| 6575|
|  he| 6444|
|  as| 6439|
| you| 6295|
| had| 5718|
| she| 5617|
| for| 5425|
+----+-----+
only showing top 20 rows



In [152]:
%%timeit

results = count_words_with_pyspark()
results.collect()

1.18 s ± 68 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [153]:
from collections import Counter
from pathlib import Path
import re

def count_words_with_python():
    word_counter = Counter()
    
    for file in Path(dir).glob("*.txt"):    
        with open(file, "r", encoding="utf-8") as fh:
            for line in fh:
                words = [re.search("([a-z]*)", w.lower()).group(1) for w in line.split()]
                words = [w for w in words if w]
                word_counter.update(Counter(words))
    
    return word_counter
    
    

In [154]:
results = count_words_with_python()
results.most_common(20)

[('the', 38894),
 ('and', 23919),
 ('of', 21199),
 ('to', 20526),
 ('a', 14464),
 ('i', 13974),
 ('in', 12777),
 ('that', 9623),
 ('it', 9099),
 ('was', 8920),
 ('her', 7923),
 ('my', 7385),
 ('his', 6642),
 ('with', 6575),
 ('he', 6444),
 ('as', 6439),
 ('you', 6297),
 ('had', 5718),
 ('she', 5617),
 ('for', 5425)]

In [155]:
%%timeit 

results = count_words_with_python()
results.most_common(20)

1.08 s ± 3.71 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [164]:
from pathlib import Path
import pandas as pd

def count_words_with_pandas():
    df = pd.concat([pd.read_csv(file, sep=r"\n", header=None, engine="python") for file in Path(dir).glob("*.txt")])
    s = df.iloc[:,0].str.split().explode().str.lower()
    df = s.str.extract("([a-z]*)").reset_index(drop=True)
    df = df[(df != "")].dropna()
    return df

In [165]:
df = count_words_with_pandas()
df.value_counts()[:20]

the     38895
and     23919
of      21199
to      20526
a       14464
i       13974
in      12777
that     9623
it       9099
was      8920
her      7923
my       7385
his      6642
with     6575
he       6444
as       6439
you      6297
had      5718
she      5617
for      5425
dtype: int64

In [166]:
%%timeit

df = count_words_with_pandas()
df.value_counts()[:20]

1.18 s ± 50.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
