In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("Read CSV") \
    .getOrCreate()

# Read CSV file into a DataFrame
df = spark.read.load('hdfs://localhost:9000/user1/en_coronavirus_03_04_2020_thru_03_27_2020.csv',format='csv', header=True)


24/03/25 22:56:22 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

In [2]:
sc

In [3]:
# Show the DataFrame
df.show()

                                                                                

+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+------------------+--------+----------+-------------+
|             user_id|           status_id|          status_url|          created_at|  screen_name|                text|            source|is_quote|is_retweet|retweet_count|
+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+------------------+--------+----------+-------------+
|  750502701457440768| 1235354334923276289|https://twitter.c...|2020-03-04T23:59:59Z|RocketMan6510|Here are the 6 co...|              null|    null|      null|         null|
|1. Health care wo...|                null|                null|                null|         null|                null|              null|    null|      null|         null|
|2. Westchester at...|                null|                null|                null|         null|                null|          

In [4]:
# Count the number of rows
row_count = df.count()

# Get the list of column names and count the number of columns
column_count = len(df.columns)

print("Number of rows:", row_count)
print("Number of columns:", column_count)




Number of rows: 671515
Number of columns: 10


                                                                                

In [5]:
# Display the schema
df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- status_id: string (nullable = true)
 |-- status_url: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- screen_name: string (nullable = true)
 |-- text: string (nullable = true)
 |-- source: string (nullable = true)
 |-- is_quote: string (nullable = true)
 |-- is_retweet: string (nullable = true)
 |-- retweet_count: string (nullable = true)



In [6]:
# Display the summary stastics
df.describe().show()

24/03/25 22:57:05 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

+-------+--------------------+--------------------+--------------------+------------------+-----------+--------------------+--------------------+--------------------+-----------------+--------------------+
|summary|             user_id|           status_id|          status_url|        created_at|screen_name|                text|              source|            is_quote|       is_retweet|       retweet_count|
+-------+--------------------+--------------------+--------------------+------------------+-----------+--------------------+--------------------+--------------------+-----------------+--------------------+
|  count|              671508|              494948|              463761|            457582|     456004|              321452|              164017|              158709|           157191|              156605|
|   mean|3.319423476913961...|1.218941021433981...|   368.7757255936676|1185.4778761061948|   Infinity|   8048.884650235593|  1542.7500769467529|  4367.4402597402595|1976.40517

                                                                                

In [7]:
from pyspark.sql.functions import length

# Cast 'text' column to StringType
df_text = df.withColumn("text", df["text"].cast("string"))
df_text.show()

+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+------------------+--------+----------+-------------+
|             user_id|           status_id|          status_url|          created_at|  screen_name|                text|            source|is_quote|is_retweet|retweet_count|
+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+------------------+--------+----------+-------------+
|  750502701457440768| 1235354334923276289|https://twitter.c...|2020-03-04T23:59:59Z|RocketMan6510|Here are the 6 co...|              null|    null|      null|         null|
|1. Health care wo...|                null|                null|                null|         null|                null|              null|    null|      null|         null|
|2. Westchester at...|                null|                null|                null|         null|                null|          

In [8]:
# Grouping the data and the count
from pyspark.sql.functions import collect_list

# Group by 'source' column 
grouped_df = df.groupBy('source').count().show()
grouped_df

[Stage 9:>                                                          (0 + 2) / 2]

+--------------------+-----+
|              source|count|
+--------------------+-----+
|"" says @SenTedCr...|    3|
|           GaggleAMP|    8|
| etc. https://t.c...|    1|
| lax labor laws a...|    2|
|        Tabtter Free|    4|
| the threat of a ...|   29|
| I never imagined...|    1|
| including aged g...|    1|
| later developed ...|    2|
| returned with #C...|    1|
| said a city gove...|    1|
|                2162|   25|
|"" promising to n...|    1|
|MonkeyViral Auto ...|    7|
| if there’s a mes...|    5|
|                 125|    1|
| we’re all in thi...|    4|
| I replied Philip...|    1|
| it's Taiwan. I'v...|    1|
|                ""No|    1|
+--------------------+-----+
only showing top 20 rows



                                                                                

In [9]:
#Grouping the data and aggregating
grouped_df2 = df.groupBy('source').agg({'is_retweet':'avg'})
grouped_df2.show()



+--------------------+---------------+
|              source|avg(is_retweet)|
+--------------------+---------------+
|"" says @SenTedCr...|           null|
|           GaggleAMP|           null|
| etc. https://t.c...|           null|
| lax labor laws a...|           null|
|        Tabtter Free|           null|
| the threat of a ...|           null|
| I never imagined...|           null|
| including aged g...|           null|
| later developed ...|           null|
| returned with #C...|           null|
| said a city gove...|           null|
|                2162|           null|
|"" promising to n...|           null|
|MonkeyViral Auto ...|           null|
| if there’s a mes...|           null|
|                 125|           null|
| we’re all in thi...|           null|
| I replied Philip...|           null|
| it's Taiwan. I'v...|           null|
|                ""No|           null|
+--------------------+---------------+
only showing top 20 rows



                                                                                

In [10]:
# Identify missing values in the DataFrame
missing_values_df = df.select([df['text'].isNull().alias('text') for column in df.columns])

# Show the DataFrame with missing values
missing_values_df.show()

+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
| text| text| text| text| text| text| text| text| text| text|
+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|false|false|false|false|false|false|false|false|false|false|
| true| true| true| true| true| true| true| true| true| true|
| true| true| true| true| true| true| true| true| true| true|
| true| true| true| true| true| true| true| true| true| true|
| true| true| true| true| true| true| true| true| true| true|
| true| true| true| true| true| true| true| true| true| true|
| true| true| true| true| true| true| true| true| true| true|
|false|false|false|false|false|false|false|false|false|false|
|false|false|false|false|false|false|false|false|false|false|
| true| true| true| true| true| true| true| true| true| true|
|false|false|false|false|false|false|false|false|false|false|
| true| true| true| true| true| true| true| true| true| true|
|false|false|false|false|false|false|false|false|false|false|
| true| 

In [11]:
# Extract features from timestamps
from pyspark.sql.functions import year, month, dayofmonth

# Select only the extracted features without modifying the original DataFrame
extracted_features_df = df.select(year(df['created_at']).alias('year'),
                                  month(df['created_at']).alias('month'),
                                  dayofmonth(df['created_at']).alias('day'))

# Display the extracted features
extracted_features_df.show()

+----+-----+----+
|year|month| day|
+----+-----+----+
|2020|    3|   4|
|null| null|null|
|null| null|null|
|null| null|null|
|null| null|null|
|null| null|null|
|null| null|null|
|2020|    3|   4|
|2020|    3|   4|
|null| null|null|
|2020|    3|   4|
|null| null|null|
|2020|    3|   4|
|null| null|null|
|null| null|null|
|null| null|null|
|null| null|null|
|null| null|null|
|2020|    3|   4|
|null| null|null|
+----+-----+----+
only showing top 20 rows



In [12]:
from pyspark.sql import functions as F

# List of columns in your DataFrame
columns_to_aggregate = ['user_id', 'status_id', 'status_url', 'created_at', 'screen_name', 'text', 'source', 'is_quote', 'is_retweet', 'retweet_count']

# Iterate over each column for aggregation
for col in columns_to_aggregate:
    print(f" *** Aggregation for {col} ***")
    df.groupBy(col).count().orderBy(F.col('count').desc()).show(truncate=False)

 *** Aggregation for user_id ***


                                                                                

+----------------------------------------------------------------------------------------------------------------------------+-----+
|user_id                                                                                                                     |count|
+----------------------------------------------------------------------------------------------------------------------------+-----+
|#coronavirus"                                                                                                               |2998 |
|#coronavirus                                                                                                                |2063 |
|#coronavirus #DontBeASpreader https://t.co/Hqhc4fFXbe"                                                                      |1544 |
|#coronavirus                                                                                                                |1241 |
|@thespybrief                                                        

                                                                                

+----------------------------------------------------------------------------------------------------------+------+
|status_id                                                                                                 |count |
+----------------------------------------------------------------------------------------------------------+------+
|null                                                                                                      |176567|
|Twitter for iPhone                                                                                        |49344 |
|Twitter for Android                                                                                       |41677 |
|Twitter Web App                                                                                           |28802 |
|Twitter for iPad                                                                                          |8031  |
|TweetDeck                                                              

                                                                                

+--------------------------------------------------------------------------------------------------------------------------------------------+------+
|status_url                                                                                                                                  |count |
+--------------------------------------------------------------------------------------------------------------------------------------------+------+
|null                                                                                                                                        |207754|
|FALSE                                                                                                                                       |131545|
|Twitter for iPhone                                                                                                                          |4367  |
|Twitter for Android                                                                                

                                                                                

+---------------------------------------------------+------+
|created_at                                         |count |
+---------------------------------------------------+------+
|null                                               |213933|
|TRUE                                               |115212|
|FALSE                                              |29632 |
|Twitter for iPhone                                 |1285  |
|Twitter for Android                                |1081  |
|Twitter Web App                                    |593   |
| why did they build *two 1                         |244   |
| like he's done to prevent the spread of #COVID19."|179   |
|Twitter for iPad                                   |170   |
| working parents                                   |144   |
| only of 46 were tested.                           |121   |
| have died""-@DrTedros #COVID19 #coronavirus"      |80    |
| and support for food stamps &amp; unemployment.   |58    |
| neighbours.. https://t

                                                                                

+-----------+------+
|screen_name|count |
+-----------+------+
|null       |215511|
|0          |12741 |
|TRUE       |10056 |
|1          |5332  |
|FALSE      |4177  |
|2          |3366  |
|3          |2442  |
|4          |1960  |
|5          |1702  |
|6          |1380  |
|190166     |1161  |
|8          |1150  |
|7          |1138  |
|10         |957   |
|9          |885   |
|12         |773   |
|11         |770   |
|13         |729   |
|14         |711   |
|16         |599   |
+-----------+------+
only showing top 20 rows

 *** Aggregation for text ***


                                                                                

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+
|text                                                                                                                                                                                                                                                                                                            |count |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+
|null                                                     

                                                                                

+----------------------------------------+------+
|source                                  |count |
+----------------------------------------+------+
|null                                    |507498|
|Twitter for iPhone                      |65180 |
|Twitter for Android                     |41683 |
|Twitter Web App                         |28857 |
|Twitter for iPad                        |8259  |
|Twitter Web Client                      |1992  |
|TweetDeck                               |1790  |
|TRUE                                    |693   |
|Hootsuite Inc.                          |540   |
|Instagram                               |518   |
|FALSE                                   |401   |
| I don't call"" https://t.co/PnKXNAQFzC"|384   |
|Buffer                                  |338   |
|Tweetbot for iΟS                        |332   |
|dlvr.it                                 |211   |
| politicians are reassuring             |195   |
|0                                       |179   |


                                                                                

+------------------------------------------------------------------------------------------------------------------+------+
|is_quote                                                                                                          |count |
+------------------------------------------------------------------------------------------------------------------+------+
|null                                                                                                              |512806|
|FALSE                                                                                                             |148423|
|TRUE                                                                                                              |4595  |
|Twitter for iPhone                                                                                                |746   |
|Twitter for Android                                                                                               |599   |
|Twitter

                                                                                

+-----------------------------------------------------------------------+------+
|is_retweet                                                             |count |
+-----------------------------------------------------------------------+------+
|null                                                                   |514324|
|TRUE                                                                   |119619|
|FALSE                                                                  |35079 |
|Twitter for iPhone                                                     |377   |
|Twitter for Android                                                    |315   |
|Twitter Web App                                                        |299   |
|4264                                                                   |143   |
| directly addressing children during his Sunday update on #coronavirus.|77    |
|Twitter for iPad                                                       |66    |
| or unjustified acceptance 

[Stage 44:>                                                         (0 + 2) / 2]

+-------------+------+
|retweet_count|count |
+-------------+------+
|null         |514910|
|0            |25274 |
|1            |7929  |
|2            |4363  |
|3            |3017  |
|4            |2357  |
|5            |1831  |
|TRUE         |1728  |
|6            |1717  |
|FALSE        |1400  |
|7            |1280  |
|8            |1257  |
|9            |1129  |
|11           |934   |
|10           |925   |
|12           |852   |
|14           |832   |
|13           |799   |
|60797        |764   |
|101721       |729   |
+-------------+------+
only showing top 20 rows



                                                                                

In [13]:
# Filter tweets with more than 100 retweets
filtered_df2 = df.filter(df['retweet_count'] > 100)
filtered_df2.show()

+-------------------+-------------------+--------------------+--------------------+--------------+--------------------+-------------------+--------+----------+-------------+
|            user_id|          status_id|          status_url|          created_at|   screen_name|                text|             source|is_quote|is_retweet|retweet_count|
+-------------------+-------------------+--------------------+--------------------+--------------+--------------------+-------------------+--------+----------+-------------+
|         1694554160|1235354333602107393|https://twitter.c...|2020-03-04T23:59:59Z| mariomoraes51|Find out how #ICT...| Twitter for iPhone|   FALSE|      TRUE|          422|
| 742773886270476288|1235352880485294080|https://twitter.c...|2020-03-04T23:54:13Z|NotAnotherPoll|Good news — I jus...|    Twitter Web App|   FALSE|      TRUE|         1676|
|            8132402|1235354330275827712|https://twitter.c...|2020-03-04T23:59:58Z| michaelturton|Just spoke with E...|    Twitter

In [14]:
# Filter tweets that are retweets
filtered_df3 = df.filter(df['is_retweet'] == True)
filtered_df3.show()

+-------------------+-------------------+--------------------+--------------------+--------------+--------------------+-------------------+--------+----------+-------------+
|            user_id|          status_id|          status_url|          created_at|   screen_name|                text|             source|is_quote|is_retweet|retweet_count|
+-------------------+-------------------+--------------------+--------------------+--------------+--------------------+-------------------+--------+----------+-------------+
|         1694554160|1235354333602107393|https://twitter.c...|2020-03-04T23:59:59Z| mariomoraes51|Find out how #ICT...| Twitter for iPhone|   FALSE|      TRUE|          422|
| 742773886270476288|1235352880485294080|https://twitter.c...|2020-03-04T23:54:13Z|NotAnotherPoll|Good news — I jus...|    Twitter Web App|   FALSE|      TRUE|         1676|
|           24959025|1235354331718664193|https://twitter.c...|2020-03-04T23:59:59Z|bakersfieldnow|Tune in tonight a...|    Twitter

In [15]:
# Sort tweets by retweet count in descending order
sorted_df = df.orderBy(df['retweet_count'].desc())
sorted_df.show(5)



+------------------+-------------------+--------------------+--------------------+-----------+--------------------+--------------------+--------+--------------------+--------------------+
|           user_id|          status_id|          status_url|          created_at|screen_name|                text|              source|is_quote|          is_retweet|       retweet_count|
+------------------+-------------------+--------------------+--------------------+-----------+--------------------+--------------------+--------+--------------------+--------------------+
|         388195837|1236799237989584896|https://twitter.c...|2020-03-08T23:41:31Z| samadbeygi|"Richard Brennan ...|               gowns|  gloves| &amp; bc there'r...|they also need mo...|
|         221795726|1238614814706016271|https://twitter.c...|2020-03-13T23:55:58Z|   paolorho|"@BorisJohnson Si...| has to get coron...| at best| with a mortality...|at least 400.000 ...|
|                WI|                 MN|                  KY

                                                                                

In [16]:
# Sort tweets by creation time in ascending order
sorted_df2 = df.orderBy('created_at')
sorted_df2.show(5)



+--------------------+---------+----------+----------+-----------+----+------+--------+----------+-------------+
|             user_id|status_id|status_url|created_at|screen_name|text|source|is_quote|is_retweet|retweet_count|
+--------------------+---------+----------+----------+-----------+----+------+--------+----------+-------------+
|Where are the tests?|     null|      null|      null|       null|null|  null|    null|      null|         null|
| 5. Son (20 yrs old)|     null|      null|      null|       null|null|  null|    null|      null|         null|
|-The payments wil...|     null|      null|      null|       null|null|  null|    null|      null|         null|
|-Containment stil...|     null|      null|      null|       null|null|  null|    null|      null|         null|
|If the D Party wa...|     null|      null|      null|       null|null|  null|    null|      null|         null|
+--------------------+---------+----------+----------+-----------+----+------+--------+---------

                                                                                

In [17]:
# Pivot based on source column and aggregate on retweet count
pivot_df = df.groupBy('source').pivot('is_retweet').agg(F.count('retweet_count'))
pivot_df

                                                                                

DataFrame[source: string, null: bigint,   #WuhanLockDown : bigint,  "" If not every city will get hit like #NYC then this seems even more of a reason to send more ventilators to New York.  #coronavirus https://t.co/eRq0WGr9Gf": bigint,  ""In the Heights"": bigint,  ""quarantini."": bigint,  #CounterAssistants...the backlash will be immense. Your choice @BorisJohnson @MattHancock.  #WeArePharmacy #CoronaCrisis #Covid_19 #Coronavirus https://t.co/DVG21d28bt": bigint,  #ElPaso: bigint,  #Heilongjiang Province #China. Looks like they are sealing off taxies. No new cases?: bigint,  #NewStart: bigint,  #Ozzy: bigint,  #PressBriefing": bigint,  #Roseville: bigint,  #Science https://t.co/L4lBft8FZN": bigint,  #StayHomeSaveLives : bigint,  #WeLoveNurses: bigint,  #coronavirus toilet paper shortage. We went outside and had to use trees and plant leaves!!.. Dad? Yes child? What are trees?... Sorry child: bigint,  #coronavirus": bigint,  &amp; #Taiwan stands ready to share its #Coronavirus know-ho

In [18]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from sklearn.model_selection import train_test_split
from mlxtend.plotting import plot_confusion_matrix
import matplotlib.cm as cm
from matplotlib import rcParams
from collections import Counter
from nltk.tokenize import RegexpTokenizer
import re
from wordcloud import WordCloud
import string
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
%matplotlib inline

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning, message="Downloading package") 

2024-03-25 23:00:54.296539: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-25 23:00:54.429859: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-25 23:00:54.975388: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [19]:
pandas_df = df.toPandas()
df = pandas_df

                                                                                

In [20]:
df.shape

(671515, 10)

In [21]:
null_counts = df.isnull().sum()
null_counts

user_id               7
status_id        176567
status_url       207754
created_at       213933
screen_name      215511
text             350063
source           507498
is_quote         512806
is_retweet       514324
retweet_count    514910
dtype: int64

In [22]:
# Fill null values with a specific value 
df_filled = df.fillna(0)
df_filled

Unnamed: 0,user_id,status_id,status_url,created_at,screen_name,text,source,is_quote,is_retweet,retweet_count
0,750502701457440768,1235354334923276289,https://twitter.com/RocketMan6510/status/12353...,2020-03-04T23:59:59Z,RocketMan6510,Here are the 6 confirmed #coronavirus cases in...,0,0,0,0
1,1. Health care worker who traveled to Iran (39...,0,0,0,0,0,0,0,0,0
2,2. Westchester attorney (50 yrs old). Hospital...,0,0,0,0,0,0,0,0,0
3,3. His wife,0,0,0,0,0,0,0,0,0
4,4. Daughter (14 yrs old),0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
671510,46988738,1243682163150618624,https://twitter.com/valadon2/status/1243682163...,2020-03-27T23:31:48Z,valadon2,'The essential thing was to save the greatest ...,0,0,0,0
671511,Albert Camus,"'The Plague' #coronavirus""",Twitter for iPad,FALSE,TRUE,16,0,0,0,0
671512,1235576604488650752,1243682163117023232,https://twitter.com/bakura_yamimode/status/124...,2020-03-27T23:31:48Z,bakura_yamimode,"Although we are not sitting together, be rest ...",0,0,0,0
671513,The novel #coronavirus does not recognise borders,thus we must rise to this challenge as nations,"united. https://t.co/vBuUJnDNnU""",Twitter for Android,FALSE,TRUE,72,0,0,0


In [23]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671515 entries, 0 to 671514
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   user_id        671508 non-null  object
 1   status_id      494948 non-null  object
 2   status_url     463761 non-null  object
 3   created_at     457582 non-null  object
 4   screen_name    456004 non-null  object
 5   text           321452 non-null  object
 6   source         164017 non-null  object
 7   is_quote       158709 non-null  object
 8   is_retweet     157191 non-null  object
 9   retweet_count  156605 non-null  object
dtypes: object(10)
memory usage: 51.2+ MB
None


In [24]:
print("Shape of the dataset:", df.shape)

Shape of the dataset: (671515, 10)


In [25]:
print("Data types of each column:")
print(df.dtypes)

Data types of each column:
user_id          object
status_id        object
status_url       object
created_at       object
screen_name      object
text             object
source           object
is_quote         object
is_retweet       object
retweet_count    object
dtype: object
