In [31]:
from pyspark.sql import SparkSession, functions as f

In [4]:
spark = SparkSession.builder.getOrCreate()

In [6]:
df = spark.read.csv("dataset_Vaccine_Pfizer.csv", header=True, inferSchema=True)

In [7]:
df.createOrReplaceTempView("pfizer")

##### SCOPE

1. Schema of the dataset.
2. Number of tweets.
3. Polarity of users.
4. Count of contain pfizer.
5. Tallest tweet.
6. Most used word.
7. Subjectivity of targets.
8. Data cleaning.
9. Relationship between Subjectivity and Polarity. <br><br><br>
***  
<br>

- [x] Schema of the dataset.
- [x] Number of tweets.
- [x] Polarity of users.
- [x] Count of contain pfizer.
- [x] Tallest tweet.
- [ ] Most used word.
- [ ] Subjectivity of targets.
- [ ] Data cleaning.
- [ ] Relationship between Subjectivity and Polarity.

<br>

In [6]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- Text: string (nullable = true)
 |-- Subjectivity: string (nullable = true)
 |-- Polarity: string (nullable = true)
 |-- Target: string (nullable = true)



In [12]:
spark.sql(
    """
    DESCRIBE pfizer;
    """
).show()

+------------+---------+-------+
|    col_name|data_type|comment|
+------------+---------+-------+
|          id|   string|   NULL|
|        Text|   string|   NULL|
|Subjectivity|   string|   NULL|
|    Polarity|   string|   NULL|
|      Target|   string|   NULL|
+------------+---------+-------+



In [15]:
number_of_tweets = df.filter(df.Text.isNotNull()).count()
print(number_of_tweets)

1188


In [17]:
spark.sql(
    """
    SELECT COUNT(Text) AS number_of_tweets FROM pfizer WHERE Text IS NOT NULL;
    """
).show()

+----------------+
|number_of_tweets|
+----------------+
|            1188|
+----------------+



In [30]:
df.describe("Polarity").show()

+-------+-------------------+
|summary|           Polarity|
+-------+-------------------+
|  count|               1128|
|   mean|0.16325622765526784|
| stddev|0.25291704571613854|
|    min|       -0.008333333|
|    max|           Positive|
+-------+-------------------+



In [52]:
tolerance = 0.0001
df.withColumn("polarity_of_users", f.when(df.Polarity<-tolerance,"Negative")\
                                    .when((df.Polarity>-tolerance)&(df.Polarity<tolerance),"Neutral")\
                                    .when(df.Polarity>tolerance,"Positive")\
                                    .otherwise(None)).show(5)

+---------+--------------------+------------+--------+--------+-----------------+
|       id|                Text|Subjectivity|Polarity|  Target|polarity_of_users|
+---------+--------------------+------------+--------+--------+-----------------+
|        0|Historically ther...|        0.45|    0.35|Positive|         Positive|
|        1|Honored and Pfize...| 0.066666667|       0| Neutral|          Neutral|
|        2|COVID19 illuminat...|        NULL|    NULL|    NULL|             NULL|
|Next week| our Chief Develo...|           0|       0| Neutral|          Neutral|
|        3|Today we publishe...|           0|       0| Neutral|          Neutral|
+---------+--------------------+------------+--------+--------+-----------------+
only showing top 5 rows



In [57]:
spark.sql(
    """
    SELECT *,CASE 
            WHEN Polarity<-0.0001 THEN 'Negative'
            WHEN (Polarity>-0.0001)AND(Polarity<0.0001) THEN 'Neutral'
            WHEN Polarity>0.0001 THEN 'Positive'
            ELSE NULL
        END AS polarity_of_users 
    FROM pfizer;
    """
).show(5)

+---------+--------------------+------------+--------+--------+-----------------+
|       id|                Text|Subjectivity|Polarity|  Target|polarity_of_users|
+---------+--------------------+------------+--------+--------+-----------------+
|        0|Historically ther...|        0.45|    0.35|Positive|         Positive|
|        1|Honored and Pfize...| 0.066666667|       0| Neutral|          Neutral|
|        2|COVID19 illuminat...|        NULL|    NULL|    NULL|             NULL|
|Next week| our Chief Develo...|           0|       0| Neutral|          Neutral|
|        3|Today we publishe...|           0|       0| Neutral|          Neutral|
+---------+--------------------+------------+--------+--------+-----------------+
only showing top 5 rows



In [65]:
count_of_contain_pfizer = df.filter((df.Text.contains("pfizer"))|(df.Text.contains("Pfizer"))).count()
print(count_of_contain_pfizer)

174


In [73]:
spark.sql(
    """
    SELECT COUNT(Text) AS count FROM pfizer WHERE Text ILIKE '%pfizer%';
    """
).show(5)

+-----+
|count|
+-----+
|  174|
+-----+



In [120]:
char_count = df.select(f.max(f.length(df.Text))).alias("largest_text").collect()[0][0]
df.select("Text").filter(f.length(df.Text) == char_count).orderBy("Text").show(1, False)

+-----------------------------------------------------------------------------------------------------------------------------------------+
|Text                                                                                                                                     |
+-----------------------------------------------------------------------------------------------------------------------------------------+
|KnowTheFacts: Answers to your biggest questions about the progress of our COVID19 vaccine candidate &amp; more information on next steps.|
+-----------------------------------------------------------------------------------------------------------------------------------------+
only showing top 1 row



In [130]:
spark.sql(
    """
    SELECT Text FROM pfizer WHERE length(Text) = (SELECT MAX(LENGTH(Text)) FROM pfizer) ORDER BY Text LIMIT 1;
    """
).show()

+--------------------+
|                Text|
+--------------------+
|KnowTheFacts: Ans...|
+--------------------+



In [158]:
df_without_emoji = df.withColumn("without_emoji", f.regexp_replace(df.Text, r"[^\w\s]", ""))
df_lower_texts = df_without_emoji.withColumn("lower_texts", f.lower(df_without_emoji.without_emoji))
df_splitted = df_lower_texts.withColumn("splitted", f.split(df_lower_texts.lower_texts, " "))
df_exploded = df_splitted.withColumn("words", f.explode(df_splitted.splitted)).na.drop()
word_counts = df_exploded.filter(~(df_exploded.words.isNull()|(f.trim(df_exploded.words) == "")))
result = word_counts.groupBy(df_exploded["words"]).count()
result.orderBy("count", ascending=False).show(1)

+-----+-----+
|words|count|
+-----+-----+
|   to|  749|
+-----+-----+
only showing top 1 row

