## Open file and select text column

In [2]:
from pyspark.sql import SparkSession
# Get the active Spark session, or create one named "BasicDFOperationsApp"
# that connects to the master at spark://spark-master:7077.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("BasicDFOperationsApp")
    .getOrCreate()
)

In [3]:
# Read the tweets CSV, taking column names from the header row. No schema
# inference is requested, so every column is loaded as a string.
# NOTE(review): tweet text may contain embedded newlines or quoted commas;
# confirm whether the default single-line CSV parsing is adequate here.
df = spark.read.options(header="true").csv("/home/jovyan/data/vaccination_tweets.csv")
df.schema

StructType(List(StructField(id,StringType,true),StructField(user_name,StringType,true),StructField(user_location,StringType,true),StructField(user_description,StringType,true),StructField(user_created,StringType,true),StructField(user_followers,StringType,true),StructField(user_friends,StringType,true),StructField(user_favourites,StringType,true),StructField(user_verified,StringType,true),StructField(date,StringType,true),StructField(text,StringType,true),StructField(hashtags,StringType,true),StructField(source,StringType,true),StructField(retweets,StringType,true),StructField(favorites,StringType,true),StructField(is_retweet,StringType,true)))

In [12]:
# Project the raw frame down to the tweet body column.
df_text = df.select('text')

# Preview the first 10 rows without truncating long tweets.
df_text.show(n=10, truncate=False)

# Report the frame's shape as (row count, column count).
n_rows, n_cols = df_text.count(), len(df_text.columns)
print((n_rows, n_cols))

+--------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                        |
+--------------------------------------------------------------------------------------------------------------------------------------------+
|Same folks said daikon paste could treat a cytokine storm #PfizerBioNTech https://t.co/xeHhIMg1kF                                           |
|While the world has been on the wrong side of history this year, hopefully, the biggest vaccination effort we've ev… https://t.co/dlCHrZjkhm|
|#coronavirus #SputnikV #AstraZeneca #PfizerBioNTech #Moderna #Covid_19 Russian vaccine is created to last 2-4 years… https://t.co/ieYlCKBr8P|
|Facts are immutable, Senator, even when you're not ethically sturdy enough to acknowledge them. (1) You were born i… https://t.co/jqgV18kch4|

### Remove rows with null text

In [15]:
# Discard rows that contain a null value.
df_text = df_text.dropna()

# Preview the first 5 remaining rows without truncation.
df_text.show(n=5, truncate=False)

# Report the shape after the drop as (row count, column count).
n_rows, n_cols = df_text.count(), len(df_text.columns)
print((n_rows, n_cols))

+--------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                        |
+--------------------------------------------------------------------------------------------------------------------------------------------+
|Same folks said daikon paste could treat a cytokine storm #PfizerBioNTech https://t.co/xeHhIMg1kF                                           |
|While the world has been on the wrong side of history this year, hopefully, the biggest vaccination effort we've ev… https://t.co/dlCHrZjkhm|
|#coronavirus #SputnikV #AstraZeneca #PfizerBioNTech #Moderna #Covid_19 Russian vaccine is created to last 2-4 years… https://t.co/ieYlCKBr8P|
|Facts are immutable, Senator, even when you're not ethically sturdy enough to acknowledge them. (1) You were born i… https://t.co/jqgV18kch4|

### Cleaning

In [97]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, lower, regexp_replace
from pyspark.ml.feature import Tokenizer, StopWordsRemover

# Ordered (pattern, replacement) cleaning steps, applied one after another.
# Every pattern is a raw string so that backslash sequences such as \s reach
# the regex engine unchanged instead of being parsed as (invalid) Python
# string escapes.
cleaning_steps = [
    # Remove twitter handles (@user)
    (r'@[^\s]+', ""),
    # Remove hashtags
    (r'\B#\S+', ""),
    # Remove URLs
    (r"http\S+", ""),
    # Replace special characters with a space.
    # NOTE(review): the '+' is inside the character class, so literal '+'
    # characters are kept rather than replaced — confirm this is intended.
    (r'[^\w+]', " "),
    # Remove single ASCII letters that have whitespace on both sides
    (r'\s+[a-zA-Z]\s+', " "),
    # Collapse runs of whitespace into a single space
    (r'\s+', " "),
]

# Step k reads column text<k-1> (plain 'text' for the first step) and writes
# text<k>. Each select keeps only the step's input and output columns, so
# df_clean ends up with exactly the columns text6 and text7.
df_clean = df_text
for step_no, (pattern, replacement) in enumerate(cleaning_steps, start=2):
    source_col = 'text' if step_no == 2 else 'text' + str(step_no - 1)
    target_col = 'text' + str(step_no)
    df_clean = df_clean.select(
        source_col,
        regexp_replace(source_col, pattern, replacement).alias(target_col),
    )

In [100]:
# Display the fully cleaned column (default row limit) without truncation.
cleaned_text = df_clean.select('text7')
cleaned_text.show(truncate=False)

+------------------------------------------------------------------------------------------------------------------+
|text7                                                                                                             |
+------------------------------------------------------------------------------------------------------------------+
|Same folks said daikon paste could treat cytokine storm                                                           |
|While the world has been on the wrong side of history this year hopefully the biggest vaccination effort we ve ev |
| Russian vaccine is created to last 2 4 years                                                                     |
|Facts are immutable Senator even when you re not ethically sturdy enough to acknowledge them 1 You were born      |
|Explain to me again why we need vaccine                                                                           |
|Does anyone have any useful advice guidance for whether the COV