In [1]:
from pyspark.sql.types import *

In [2]:
customSchema = StructType([
    StructField("Primary_Index", IntegerType(), True), 
    StructField("Tweet_Id", IntegerType(), True), 
    StructField("Date_Text", StringType(), True),
    StructField("Flag", StringType(), True), 
    StructField("User", StringType(), True),
    StructField("Tweet_Text", StringType(), True)])

df = spark.read.load('hdfs://localhost:9000/CA2/ProjectTweets.csv', format="csv", header="False", sep=',', schema=customSchema)


In [3]:
from pyspark.sql.functions import regexp_replace
# Remove commas from the "Tweet_Text" column

filtered_df = df.filter(df["Tweet_Id"] == '1467811594')

filtered_df = filtered_df.withColumn("Tweet_Text", regexp_replace(filtered_df["Tweet_Text"], ",", ""))

# Select only the "Tweet_Text" column from the filtered DataFrame
result = filtered_df.select("Tweet_Text")

# Show the content of column "Tweet_Text"
result.show(truncate=False)

[Stage 1:>                                                          (0 + 1) / 1]

+------------------------------------------------------------------------------------------------+
|Tweet_Text                                                                                      |
+------------------------------------------------------------------------------------------------+
|@LOLTrish hey  long time no see! Yes.. Rains a bit only a bit  LOL  I'm fine thanks  how's you ?|
+------------------------------------------------------------------------------------------------+



                                                                                

In [4]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.sql import SparkSession

# Tokenize the "Tweet_Text" column
tokenizer = Tokenizer(inputCol="Tweet_Text", outputCol="words")
filtered_df = tokenizer.transform(filtered_df)

# Remove stopwords from the "words" column
stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_text")
filtered_df = stopwords_remover.transform(filtered_df)

# Show the modified DataFrame
filtered_df.show()

[Stage 3:>                                                          (0 + 1) / 1]

+-------------+----------+--------------------+--------+----+--------------------+--------------------+--------------------+
|Primary_Index|  Tweet_Id|           Date_Text|    Flag|User|          Tweet_Text|               words|       filtered_text|
+-------------+----------+--------------------+--------+----+--------------------+--------------------+--------------------+
|            7|1467811594|Mon Apr 06 22:20:...|NO_QUERY|coZZ|@LOLTrish hey  lo...|[@loltrish, hey, ...|[@loltrish, hey, ...|
+-------------+----------+--------------------+--------+----+--------------------+--------------------+--------------------+



                                                                                

In [8]:
result = filtered_df.select("filtered_text")

# Show the content of column "Tweet_Text"
result.show(truncate=False)

[Stage 9:>                                                          (0 + 1) / 1]

+----------------------------------------------------------------------------------------+
|filtered_text                                                                           |
+----------------------------------------------------------------------------------------+
|[@loltrish, hey, , long, time, see!, yes.., rains, bit, bit, , lol, , fine, thanks, , ?]|
+----------------------------------------------------------------------------------------+



                                                                                