In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession.builder.appName('reading-text-file').getOrCreate()

In [6]:
input_path = 'data/text-file.txt'

text_file_df = spark.read.text(paths=input_path)

text_file_df.show()

                                                                                

+--------------------+
|               value|
+--------------------+
|people are not as...|
|as they walk or a...|
|they are only as ...|
|as they care as t...|
|                    |
|people shine not ...|
|or the way they c...|
|true beauty is mo...|
|it's in the kindn...|
|                    |
|in every smile, i...|
|in moments of joy...|
|people are beauti...|
|when they dare to...|
|                    |
|so look beyond th...|
|the beauty in you...|
|for we are all be...|
|as we live, as we...|
+--------------------+


In [7]:
# Split each line into words array
array_of_words = split(str=text_file_df['value'], pattern=' ').alias('array_of_words')

array_of_words_df = text_file_df.select(array_of_words)

array_of_words_df.show()

word_count = array_of_words_df.count()

print(f'Words count is {word_count}')

+--------------------+
|      array_of_words|
+--------------------+
|[people, are, not...|
|[as, they, walk, ...|
|[they, are, only,...|
|[as, they, care, ...|
|                  []|
|[people, shine, n...|
|[or, the, way, th...|
|[true, beauty, is...|
|[it's, in, the, k...|
|                  []|
|[in, every, smile...|
|[in, moments, of,...|
|[people, are, bea...|
|[when, they, dare...|
|                  []|
|[so, look, beyond...|
|[the, beauty, in,...|
|[for, we, are, al...|
|[as, we, live,, a...|
+--------------------+
Words count is 19


In [18]:
# Split each line into words array, then explode the array to create a new row for each word
array_of_words = split(str=text_file_df['value'], pattern=' ')

exploded_array_of_words = explode(col=array_of_words).alias('exploded_array_of_words')

array_of_words_df = text_file_df.select(exploded_array_of_words)

array_of_words_df.show()

word_count = array_of_words_df.count()

print(f'Words count is {word_count}')

txt_output_path = 'output/output1'
csv_output_path = 'output/output2'

array_of_words_df.write.mode('overwrite').text(path=txt_output_path)
array_of_words_df.write.csv(path=csv_output_path, mode='overwrite', header=True)

+-----------------------+
|exploded_array_of_words|
+-----------------------+
|                 people|
|                    are|
|                    not|
|                     as|
|              beautiful|
|                     as|
|                   they|
|                  look,|
|                     as|
|                   they|
|                   walk|
|                     or|
|                     as|
|                   they|
|                  talk.|
|                   they|
|                    are|
|                   only|
|                     as|
|              beautiful|
+-----------------------+
Words count is 122


In [25]:
# Split each line into an array of words
array_of_words = split(str=text_file_df['value'], pattern='\\s+')
array_of_words_df = text_file_df.select(array_of_words.alias('array_of_words'))
array_of_words_df.show()

# Convert the array of words back to a string
string_of_words_df = array_of_words_df.select(concat_ws(" ", "array_of_words").alias("words"))
# string_of_words_df.show()

rows = string_of_words_df.collect()

for row in rows:
    print(f'Line: {row["words"]}')

# Save the DataFrame to a text file
text_output_path = 'output/output3'
string_of_words_df.write.mode('overwrite').text(path=text_output_path)

+--------------------+
|      array_of_words|
+--------------------+
|[people, are, not...|
|[as, they, walk, ...|
|[they, are, only,...|
|[as, they, care, ...|
|                  []|
|[people, shine, n...|
|[or, the, way, th...|
|[true, beauty, is...|
|[it's, in, the, k...|
|                  []|
|[in, every, smile...|
|[in, moments, of,...|
|[people, are, bea...|
|[when, they, dare...|
|                  []|
|[so, look, beyond...|
|[the, beauty, in,...|
|[for, we, are, al...|
|[as, we, live,, a...|
+--------------------+
Line: people are not as beautiful as they look,
Line: as they walk or as they talk.
Line: they are only as beautiful as they love,
Line: as they care as they share.
Line: 
Line: people shine not by the clothes they wear,
Line: or the way they comb their hair.
Line: true beauty is more than skin deep,
Line: it's in the kindness that they keep.
Line: 
Line: in every smile, in every tear,
Line: in moments of joy and when they fear,
Line: people are beautiful when they a

In [36]:
array_of_words = split(str=text_file_df['value'], pattern=" ")

words_df = text_file_df.select(explode(array_of_words).alias('word'))

words_by_count_df = words_df.groupBy('word').count().orderBy('count', ascending=False)

In [37]:
words_by_count_df.show()

+----------+-----+
|      word|count|
+----------+-----+
|      they|   13|
|        as|   11|
|        in|    7|
|       the|    6|
|       are|    5|
|        we|    4|
| beautiful|    4|
|    beauty|    3|
|      when|    3|
|    people|    3|
|          |    3|
|       not|    2|
|       and|    2|
|     every|    2|
|        or|    2|
|vulnerable|    1|
|     deep,|    1|
|   moments|    1|
|       joy|    1|
|     fear,|    1|
+----------+-----+


In [66]:
words_by_count_df.filter(words_by_count_df['count'] > 4).show()

+----+-----+
|word|count|
+----+-----+
|they|   13|
|  as|   11|
|  in|    7|
| the|    6|
| are|    5|
+----+-----+


In [55]:
words_df.groupBy('word').agg(count('*').alias('count')).orderBy('count', ascending=False).show()

+----+-----+
|word|count|
+----+-----+
|they|   13|
|  as|   11|
|  in|    7|
| the|    6|
| are|    5|
+----+-----+


In [120]:
words_by_count_df = words_df.groupBy('word') \
    .agg(expr('count(*) as count')) \
    .orderBy('count', ascending=False)

words_by_count_df.show()

json_output_path = 'output/words-by-count'
words_by_count_df.write.mode('overwrite').json(path=json_output_path)

+----------+-----+
|      word|count|
+----------+-----+
|      they|   13|
|        as|   11|
|        in|    7|
|       the|    6|
|       are|    5|
|        we|    4|
| beautiful|    4|
|    beauty|    3|
|      when|    3|
|    people|    3|
|          |    3|
|       not|    2|
|       and|    2|
|     every|    2|
|        or|    2|
|vulnerable|    1|
|     deep,|    1|
|   moments|    1|
|       joy|    1|
|     fear,|    1|
+----------+-----+


In [116]:
words_by_count_df = words_df.groupBy('word').agg(expr('count(*) as count')).filter(expr('count > 4')).orderBy('count',
                                                                                                              ascending=False)

for row in words_by_count_df.collect():
    print(f'Word: {row["word"]}, count: {row["count"]}')

json_output_path = 'output/words-by-count-bigger-than-four'
words_by_count_df.write.mode('overwrite').json(path=json_output_path)

Word: they, count: 13
Word: as, count: 11
Word: in, count: 7
Word: the, count: 6
Word: are, count: 5


In [71]:
words_df.createOrReplaceTempView("words_table")
spark.sql("SELECT word, COUNT(*) as count FROM words_table GROUP BY word ORDER BY count DESC").show()

+----------+-----+
|      word|count|
+----------+-----+
|      they|   13|
|        as|   11|
|        in|    7|
|       the|    6|
|       are|    5|
|        we|    4|
| beautiful|    4|
|    beauty|    3|
|      when|    3|
|    people|    3|
|          |    3|
|       not|    2|
|       and|    2|
|     every|    2|
|        or|    2|
|vulnerable|    1|
|     deep,|    1|
|   moments|    1|
|       joy|    1|
|     fear,|    1|
+----------+-----+


In [42]:
top_five_words = words_by_count_df.take(5)

for row in top_five_words:
    print(f'Word: {row["word"]}, count: {row["count"]}')

Word: they, count: 13
Word: as, count: 11
Word: in, count: 7
Word: the, count: 6
Word: are, count: 5
