In [6]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=d9f144f34d7e1cc234daafd60bd2a9ec1fb6de38866de86dbd645c9b3e6f04be
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [7]:
from pyspark.sql import SparkSession

In [45]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover


In [46]:
from pyspark.ml import Pipeline

In [47]:
spark = SparkSession.builder.appName("TF-IDF Example").getOrCreate()

In [116]:
# Load the dataset
df = spark.read.csv("fb_sentiment.csv", header=True, inferSchema=True)

In [117]:
df.show()

+---+--------------------+--------------------+
|_c0|              FBPost|               Label|
+---+--------------------+--------------------+
|  0|Drug Runners and ...|                   O|
|  1|"Heres a single, ...| from Mackinac Is...|
|  2|If you tire of No...|                   O|
|  3|Ghost of Round Is...|                   O|
|  4|Why is Barnes and...|                   N|
|  5|@Maria:  Do you m...|                   P|
|  6|kindle is awesome...|                   P|
|  7|     I love mine!!!!|                   P|
|  8|Meh. I think Sing...|                   N|
|  9|My daugjhter love...|                   P|
| 10|I am not sure if ...|                   N|
| 11|Got a Kindle for ...|                   P|
| 12|I dont have the p...|                   P|
| 13|Love the new sing...|                   P|
| 14|Not a fan of Kind...|                   N|
| 15|Best thing since ...|                   P|
| 16|we love our kindl...|                   P|
| 17|can anybody tell ...|              

In [118]:
#Dropping Null Values
df = df.dropna(subset=["FBPost"])

In [119]:
# Tokenize the text data
tokenizer = Tokenizer(inputCol="FBPost", outputCol="words")
df_tokenized = tokenizer.transform(df)

In [120]:
df_tokenized.show()

+---+--------------------+--------------------+--------------------+
|_c0|              FBPost|               Label|               words|
+---+--------------------+--------------------+--------------------+
|  0|Drug Runners and ...|                   O|[drug, runners, a...|
|  1|"Heres a single, ...| from Mackinac Is...|["heres, a, singl...|
|  2|If you tire of No...|                   O|[if, you, tire, o...|
|  3|Ghost of Round Is...|                   O|[ghost, of, round...|
|  4|Why is Barnes and...|                   N|[why, is, barnes,...|
|  5|@Maria:  Do you m...|                   P|[@maria:, , do, y...|
|  6|kindle is awesome...|                   P|[kindle, is, awes...|
|  7|     I love mine!!!!|                   P| [i, love, mine!!!!]|
|  8|Meh. I think Sing...|                   N|[meh., i, think, ...|
|  9|My daugjhter love...|                   P|[my, daugjhter, l...|
| 10|I am not sure if ...|                   N|[i, am, not, sure...|
| 11|Got a Kindle for ...|        

In [121]:
#Dropping Label Columns
df_tokenized = df_tokenized.drop('Label')

In [122]:
df_tokenized.show()

+---+--------------------+--------------------+
|_c0|              FBPost|               words|
+---+--------------------+--------------------+
|  0|Drug Runners and ...|[drug, runners, a...|
|  1|"Heres a single, ...|["heres, a, singl...|
|  2|If you tire of No...|[if, you, tire, o...|
|  3|Ghost of Round Is...|[ghost, of, round...|
|  4|Why is Barnes and...|[why, is, barnes,...|
|  5|@Maria:  Do you m...|[@maria:, , do, y...|
|  6|kindle is awesome...|[kindle, is, awes...|
|  7|     I love mine!!!!| [i, love, mine!!!!]|
|  8|Meh. I think Sing...|[meh., i, think, ...|
|  9|My daugjhter love...|[my, daugjhter, l...|
| 10|I am not sure if ...|[i, am, not, sure...|
| 11|Got a Kindle for ...|[got, a, kindle, ...|
| 12|I dont have the p...|[i, dont, have, t...|
| 13|Love the new sing...|[love, the, new, ...|
| 14|Not a fan of Kind...|[not, a, fan, of,...|
| 15|Best thing since ...|[best, thing, sin...|
| 16|we love our kindl...|[we, love, our, k...|
| 17|can anybody tell ...|[can, anybody,

In [123]:
#Renaming _c0 column with PostID
df_tokenized = df_tokenized.withColumnRenamed("_c0", 'PostID')

In [124]:
df_tokenized.show()

+------+--------------------+--------------------+
|PostID|              FBPost|               words|
+------+--------------------+--------------------+
|     0|Drug Runners and ...|[drug, runners, a...|
|     1|"Heres a single, ...|["heres, a, singl...|
|     2|If you tire of No...|[if, you, tire, o...|
|     3|Ghost of Round Is...|[ghost, of, round...|
|     4|Why is Barnes and...|[why, is, barnes,...|
|     5|@Maria:  Do you m...|[@maria:, , do, y...|
|     6|kindle is awesome...|[kindle, is, awes...|
|     7|     I love mine!!!!| [i, love, mine!!!!]|
|     8|Meh. I think Sing...|[meh., i, think, ...|
|     9|My daugjhter love...|[my, daugjhter, l...|
|    10|I am not sure if ...|[i, am, not, sure...|
|    11|Got a Kindle for ...|[got, a, kindle, ...|
|    12|I dont have the p...|[i, dont, have, t...|
|    13|Love the new sing...|[love, the, new, ...|
|    14|Not a fan of Kind...|[not, a, fan, of,...|
|    15|Best thing since ...|[best, thing, sin...|
|    16|we love our kindl...|[w

In [125]:
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
df_filtered = remover.transform(df_tokenized)


In [126]:
df_filtered.show()

+------+--------------------+--------------------+--------------------+
|PostID|              FBPost|               words|      filtered_words|
+------+--------------------+--------------------+--------------------+
|     0|Drug Runners and ...|[drug, runners, a...|[drug, runners, ,...|
|     1|"Heres a single, ...|["heres, a, singl...|["heres, single,,...|
|     2|If you tire of No...|[if, you, tire, o...|[tire, non-fictio...|
|     3|Ghost of Round Is...|[ghost, of, round...|[ghost, round, is...|
|     4|Why is Barnes and...|[why, is, barnes,...|[barnes, nobles, ...|
|     5|@Maria:  Do you m...|[@maria:, , do, y...|[@maria:, , mean,...|
|     6|kindle is awesome...|[kindle, is, awes...|[kindle, awesome!...|
|     7|     I love mine!!!!| [i, love, mine!!!!]|    [love, mine!!!!]|
|     8|Meh. I think Sing...|[meh., i, think, ...|[meh., think, sin...|
|     9|My daugjhter love...|[my, daugjhter, l...|[daugjhter, loves...|
|    10|I am not sure if ...|[i, am, not, sure...|[sure, got, up

In [128]:
from pyspark.sql.functions import explode, col

In [157]:
df_exploded = df_filtered.select(explode("filtered_words").alias("word"))

# Count the distinct words
num_unique_words = df_exploded.select("word").distinct().count()

# Print the number of unique words
print("Number of unique words in the dataset:", num_unique_words)

Number of unique words in the dataset: 4063


In [158]:
hashingTF = HashingTF(inputCol="filtered_words", outputCol="rawFeatures", numFeatures=8400)
df_tf = hashingTF.transform(df_filtered)

In [131]:
idf = IDF(inputCol="rawFeatures", outputCol="tfidf_features")
idfModel = idf.fit(df_tf)
df_tfidf = idfModel.transform(df_tf)

In [159]:
df_tfidf.show()

+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|PostID|              FBPost|               words|      filtered_words|         rawFeatures|      tfidf_features|
+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|     0|Drug Runners and ...|[drug, runners, a...|[drug, runners, ,...|(8400,[260,1253,2...|(8400,[260,1253,2...|
|     1|"Heres a single, ...|["heres, a, singl...|["heres, single,,...|(8400,[287,674,13...|(8400,[287,674,13...|
|     2|If you tire of No...|[if, you, tire, o...|[tire, non-fictio...|(8400,[1749,2025,...|(8400,[1749,2025,...|
|     3|Ghost of Round Is...|[ghost, of, round...|[ghost, round, is...|(8400,[287,1476,1...|(8400,[287,1476,1...|
|     4|Why is Barnes and...|[why, is, barnes,...|[barnes, nobles, ...|(8400,[357,1130,4...|(8400,[357,1130,4...|
|     5|@Maria:  Do you m...|[@maria:, , do, y...|[@maria:, , mean,...|(8400,[183,357,95

In [160]:
# Increase maximum column width and row count to display
spark.conf.set("spark.sql.repl.eagerEval.maxNumRows", 200)
spark.conf.set("spark.sql.repl.eagerEval.maxNumColumns", 200)

# Show the DataFrame with all content
df_tfidf.show(truncate=False)

+------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------