# Twitter Sentiment Analysis for the word rugby

Connect Spark to Mongo DB

In [155]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars "/usr/local/spark/jars/mongo-spark-connector_2.12-3.0.2.jar,/usr/local/spark/jars/mongo-java-driver-3.12.9.jar" pyspark-shell'

Note the inferschema is set to false, this makes spark read the entire database and not infer the values of fields from the first set of fields

In [156]:
from pyspark.sql import SparkSession
# fix read bug, basically turn off sampling
spark = SparkSession.builder.appName("TwitterMongo") \
.config("spark.mongodb.input.database", "mongodb://localhost:27017/twitter") \
.config("spark.mongodb.input.uri", "mongodb://localhost:27017/twitter.tweets") \
.config("spark.mongodb.read.sql.inferSchema.mapTypes.enabled", "FALSE") \
.config("spark.mongodb.output.uri","mongodb://localhost:27017/twitter.tweets").getOrCreate()

### Create the Session

And load all of the Twitter data in MongoDB

Print out the twitter tweet schema

In [157]:
# create a spark session
spark = SparkSession \
.builder \
.master("local") \
.appName("ABC") \
.config("spark.driver.memory", "15g") \
.config("spark.mongodb.read.connection.uri", "mongodb://localhost:27017/twitter") \
.config("spark.mongodb.write.connection.uri", "mongodb://localhost:27017/twitter") \
.config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector:2.12-3.0.2') \
.getOrCreate()
# read data from mongodb collection "questions" into a dataframe "df"
df = spark.read \
.format("com.mongodb.spark.sql.DefaultSource") \
.option("uri", "mongodb://localhost:27017/twitter") \
.option("database", "twitter") \
.option("collection", "tweets") \
.load()
df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- contributors: void (nullable = true)
 |-- coordinates: void (nullable = true)
 |-- created_at: string (nullable = true)
 |-- display_text_range: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- entities: struct (nullable = true)
 |    |-- hashtags: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- text: string (nullable = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: integer (containsNull = true)
 |    |-- media: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- additional_media_info: struct (nullable = true)
 |    |    |    |    |-- description: string (nullable = true)
 |    |    |    |    |-- embeddable: boolean (nullable = true)
 |    |    |    |    |-- monetizable: boolean (nullable = true)
 |    |    |    |    |-- title: string (

In [158]:
# set up hadoop folder structure

!hadoop fs -test -d /ca4 && hadoop fs -rm -f -R /ca4
!hadoop fs -mkdir /ca4


Deleted /ca4


Create a spark object of the tweets held in the mongo db 

It is easier to use SQL statements and Pyspark to clean the data rather than writing queries in MongoDB

In [159]:
df.createOrReplaceTempView("tweets")

How many tweets in the DB all together

In [160]:
df = spark.sql("SELECT DISTINCT id FROM tweets WHERE text LIKE '%rugby%'")
df.count()

                                                                                

72807

How many tweets by language 


In [161]:
#pip install plotly
%pip install kaleido
import kaleido
import pyspark.pandas as ps
import plotly
dfLang = spark.sql("SELECT DISTINCT lang, CAST(count(id) AS INT) as TweetCount FROM tweets \
            WHERE text LIKE '%rugby%' \
            GROUP BY lang \
            ORDER BY TweetCount DESC LIMIT 10")
dfLang.show()

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.




+----+----------+
|lang|TweetCount|
+----+----------+
|  en|     48892|
|  fr|      8450|
|  es|      6754|
|  ja|      4105|
| und|      1618|
|  pt|       614|
|  it|       480|
|  tl|       309|
|  ca|       274|
|  in|       251|
+----+----------+



                                                                                

In [162]:
tempdf = ps.DataFrame(dfLang)

fig = tempdf.plot(kind='bar', x='lang', y='TweetCount')
fig.show()

fig.write_image("Images/TweetCountByLanguage.svg")
fig.write_html("Images/TweetCountByLanguage.html")

2023-05-18 18:00:27,986 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-18 18:00:27,986 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-18 18:00:45,800 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-18 18:00:45,800 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

How many tweets by location

In [163]:
dfLoc = spark.sql("SELECT DISTINCT user.location AS Location, CAST(count(id) AS INT) as TweetCount FROM tweets \
                   WHERE text LIKE '%rugby%' \
                   GROUP BY user.location \
                   ORDER BY TweetCount DESC LIMIT 10")
dfLoc.show()



+--------------------+----------+
|            Location|TweetCount|
+--------------------+----------+
|                null|     24804|
|     London, England|       543|
|      United Kingdom|       529|
|              France|       503|
|     Kampala, Uganda|       500|
|            Alicante|       462|
|              London|       426|
|             Ireland|       374|
|        South Africa|       348|
|Wales, United Kin...|       338|
+--------------------+----------+



                                                                                

In [164]:
dfLoc = spark.sql("SELECT DISTINCT user.location AS Location, CAST(count(id) AS INT) as TweetCount FROM tweets \
                   WHERE text LIKE '%rugby%' AND user.location IS NOT Null \
                   GROUP BY user.location \
                   ORDER BY TweetCount DESC LIMIT 10")
tempdf = ps.DataFrame(dfLoc)

fig = tempdf.plot(kind='bar', x='Location', y='TweetCount')
fig.show()


fig.write_image("Images/TweetCountByuserLocation.svg")
fig.write_html("Images/TweetCountByuserLocation.html")

2023-05-18 18:01:03,720 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-18 18:01:03,721 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-18 18:01:21,303 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-18 18:01:21,304 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

Now limit the dataset to English texts and tweets with the rugby in the text

In [165]:
dfEnTwt = spark.sql("SELECT * FROM tweets WHERE lang = 'en' AND text LIKE '%rugby%'")
dfEnTwt.createOrReplaceTempView("en_tweets")
dfEnTwt.show()

2023-05-18 18:01:35,317 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 1311.2 KiB
[Stage 715:>                                                        (0 + 1) / 1]

+--------------------+------------+-----------+--------------------+------------------+--------------------+--------------------+--------------------+--------------+---------+------------+----+-------------------+-------------------+-----------------------+---------------------+-------------------------+-------------------+-----------------------+---------------+----+-----+------------------+-----------+--------------------+-------------------+--------------------+-----------------------+-----------+-------------+---------+--------------------+--------------------+--------------------+-------------+---------+--------------------+
|                 _id|contributors|coordinates|          created_at|display_text_range|            entities|   extended_entities|      extended_tweet|favorite_count|favorited|filter_level| geo|                 id|             id_str|in_reply_to_screen_name|in_reply_to_status_id|in_reply_to_status_id_str|in_reply_to_user_id|in_reply_to_user_id_str|is_quote_stat

                                                                                

Now how many tweets in the English language dataset

In [166]:
dfEnTwt.count()

                                                                                

48892

In [167]:
dfLoc = spark.sql("SELECT DISTINCT user.location AS Location, CAST(count(id) AS INT) as TweetCount \
                   FROM en_tweets GROUP BY user.location \
                   ORDER BY TweetCount DESC LIMIT 10")
dfLoc.show()



+--------------------+----------+
|            Location|TweetCount|
+--------------------+----------+
|                null|     15507|
|      United Kingdom|       518|
|     London, England|       514|
|     Kampala, Uganda|       467|
|              London|       410|
|             Ireland|       356|
|        South Africa|       327|
|Wales, United Kin...|       315|
|England, United K...|       301|
|Cape Town, South ...|       264|
+--------------------+----------+



                                                                                

In [168]:

dfDay= spark.sql("SELECT DISTINCT CAST(substring(created_at, 27, 4) AS INT) as Year, \
          CAST(from_unixtime(unix_timestamp(substring(created_at, 5, 3), 'MMM'), 'MM') As INT) as Month, \
          CAST(substring(created_at, 9, 2) AS INT) as Day, \
          CAST(count(id) AS INT) as TweetCount \
          FROM en_tweets \
          WHERE lang = 'en' AND text LIKE '%rugby%' GROUP BY substring(created_at, 27, 4), \
          substring(created_at, 5, 3), \
          substring(created_at, 9, 2)")

dfDay.createOrReplaceTempView("tweetsByDay")

dfDay = spark.sql("SELECT CONCAT(Year, '_', Month, '_', DAY) AS Date, TweetCount  \
                    FROM tweetsByDay ORDER BY Year, Month, Day")

dfDay.show()




+---------+----------+
|     Date|TweetCount|
+---------+----------+
|2021_1_29|         1|
| 2021_2_1|        69|
| 2021_2_2|        73|
| 2021_2_3|        51|
| 2021_2_4|        67|
| 2021_2_5|        88|
| 2021_2_6|       266|
| 2021_2_7|       254|
| 2021_2_8|       139|
| 2021_2_9|       112|
|2021_2_10|        86|
|2021_2_11|        75|
|2021_2_12|        96|
|2021_2_13|       184|
|2021_2_14|       129|
|2021_2_15|        59|
|2021_2_16|        65|
|2021_2_17|        67|
|2021_2_18|        72|
|2021_2_19|       112|
+---------+----------+
only showing top 20 rows



                                                                                

In [169]:
tempdf = ps.DataFrame(dfDay)

fig = tempdf.plot(kind='bar', x='Date', y='TweetCount')
fig.show()


fig.write_image("Images/TweetCountByDay.svg")
fig.write_html("Images/TweetCountByDay.html")

2023-05-18 18:02:35,589 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-18 18:02:35,590 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-18 18:02:53,032 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-18 18:02:53,090 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-18 18:02:53,163 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-18 18:02:53,163 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to 

In [170]:
dfWeek = spark.sql("SELECT Year, weekofyear(make_date(Year, Month, Day)) as wkofYr , SUM(TweetCount) as TweetCountbyWeek \
                   FROM tweetsByDay \
                   GROUP BY Year, weekofyear(make_date(Year, Month, Day))")

dfWeek.createOrReplaceTempView("tweetsByWeek")
dfWeek = spark.sql("SELECT Year, wkofYr, CONCAT(Year, '_', wkofYr) AS yr_wk, TweetCountbyWeek  \
                   FROM tweetsByWeek ORDER BY Year, wkofYr")
dfWeek.show()



+----+------+-------+----------------+
|Year|wkofYr|  yr_wk|TweetCountbyWeek|
+----+------+-------+----------------+
|2021|     4| 2021_4|               1|
|2021|     5| 2021_5|             868|
|2021|     6| 2021_6|             821|
|2021|     7| 2021_7|             632|
|2021|     8| 2021_8|             701|
|2021|     9| 2021_9|             645|
|2021|    10|2021_10|             721|
|2021|    11|2021_11|             869|
|2021|    12|2021_12|             800|
|2021|    13|2021_13|             616|
|2021|    14|2021_14|             381|
|2021|    17|2021_17|             378|
|2021|    18|2021_18|             595|
|2021|    19|2021_19|             488|
|2021|    20|2021_20|             496|
|2021|    21|2021_21|             461|
|2021|    22|2021_22|             420|
|2021|    23|2021_23|             404|
|2021|    24|2021_24|             509|
|2021|    25|2021_25|             472|
+----+------+-------+----------------+
only showing top 20 rows



                                                                                

In [171]:
tempdf = ps.DataFrame(dfWeek)

fig = tempdf.plot(kind='bar', x='yr_wk', y='TweetCountbyWeek')
fig.show()


fig.write_image("Images/TweetCountByWeek.svg")
fig.write_html("Images/TweetCountByWeek.html")

2023-05-18 18:03:10,347 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-18 18:03:10,348 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-18 18:03:27,654 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-18 18:03:27,692 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-18 18:03:27,720 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-18 18:03:27,772 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to 

In [172]:
# get the tweet count by month
dfMonth = spark.sql("SELECT Year, Month, SUM(TweetCount) as TweetCountbyMonth \
                     FROM tweetsByDay GROUP BY Year, Month")
dfMonth.createOrReplaceTempView("tweetsByMonth")
dfMonth = spark.sql("SELECT Year, Month, CONCAT(Year, '_', Month) AS MonthYr, TweetCountbyMonth  \
                     FROM tweetsByMonth ORDER BY Year, Month ")
dfMonth.show()



+----+-----+-------+-----------------+
|Year|Month|MonthYr|TweetCountbyMonth|
+----+-----+-------+-----------------+
|2021|    1| 2021_1|                1|
|2021|    2| 2021_2|             3022|
|2021|    3| 2021_3|             3298|
|2021|    4| 2021_4|              981|
|2021|    5| 2021_5|             2222|
|2021|    6| 2021_6|             1935|
|2021|    7| 2021_7|             2366|
|2021|    8| 2021_8|             2102|
|2021|    9| 2021_9|             2447|
|2021|   10|2021_10|             2553|
|2021|   11|2021_11|             2440|
|2021|   12|2021_12|             1880|
|2022|    1| 2022_1|              450|
|2022|    2| 2022_2|              824|
|2022|    3| 2022_3|             2605|
|2022|    4| 2022_4|             2614|
|2022|    5| 2022_5|             2487|
|2022|    6| 2022_6|             2235|
|2022|    7| 2022_7|             2844|
|2022|    8| 2022_8|             2271|
+----+-----+-------+-----------------+
only showing top 20 rows



                                                                                

In [173]:
tempdf = ps.DataFrame(dfMonth)

fig = tempdf.plot(kind='bar', x='MonthYr', y='TweetCountbyMonth')
fig.show()
fig.write_image("Images/TweetCountByMonth.svg")
fig.write_html("Images/TweetCountByMonth.html")

2023-05-18 18:03:46,630 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-18 18:03:46,631 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-18 18:04:03,880 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-18 18:04:03,943 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-18 18:04:03,974 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-18 18:04:04,015 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to 

### Next up is text clean up 

In [174]:
#%pip install wordcloud
#%pip install vadersentiment
## sentiment analysis ref: https://www.geeksforgeeks.org/python-sentiment-analysis-using-vader/?ref=lbp
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# function to print sentiments
# of the sentence.
def sentiment_scores(sentence):
 
    # Create a SentimentIntensityAnalyzer object.
    sid_obj = SentimentIntensityAnalyzer()
 
    # polarity_scores method of SentimentIntensityAnalyzer
    # object gives a sentiment dictionary.
    # which contains pos, neg, neu, and compound scores.
    sentiment_dict = sid_obj.polarity_scores(sentence)
     
    print("Overall sentiment dictionary is : ", sentiment_dict)
    #print("sentence was rated as ", sentiment_dict['neg']*100, "% Negative")
    #print("sentence was rated as ", sentiment_dict['neu']*100, "% Neutral")
    #print("sentence was rated as ", sentiment_dict['pos']*100, "% Positive")
 
    #print("Sentence Overall Rated As", end = " ")
 
    # decide sentiment as positive, negative and neutral
    if sentiment_dict['compound'] >= 0.05 :
        print("Positive")
 
    elif sentiment_dict['compound'] <= - 0.05 :
       print("Negative")
 
    else :
        print("Neutral")
    
    return sentiment_dict['compound']

In [175]:
dfText = spark.sql("SELECT DISTINCT id, text AS text FROM en_tweets")
dfText.createOrReplaceTempView("text")
dfText.show()
dfText.count()


                                                                                

+-------------------+--------------------+
|                 id|                text|
+-------------------+--------------------+
|1444274428221734923|What a game of ru...|
|1444187035678425092|RT @ForcesNews: W...|
|1477502056814444544|why are rugby lea...|
|1444181687936688129|RT @FindinLangued...|
|1576543830710493184|RT @T2Rugby: The ...|
|1477389703984865287|RT @labour_histor...|
|1477323861788086273|RT @rugby_sport_x...|
|1477243301812183043|RT @scarlets_rugb...|
|1430144262642061313|RT @premrugby: We...|
|1478080333912612865|@AdamWarwick86 @i...|
|1444634089756168199|@rugby_ap @ColmGr...|
|1577002075170357248|RT @raysrugby: I'...|
|1576920479192915969|The Women's @rugb...|
|1511701614725320710|RT @MattHardyJour...|
|1477194664642072577|RT @berwickrugby:...|
|1444399334578032646|RT @therugbynetwo...|
|1477681992468733955|RT @premrugby: Is...|
|1356290118911483915|English rugby can...|
|1576619382695788544|@Dermotom @malahi...|
|1444582998972657671|RT @FreemanrugbyJ...|
+----------

                                                                                

48726

Count the number of words in the tweets 

In [176]:
from pyspark.sql.functions import * 
from pyspark.sql.types import StringType, ArrayType
# heavy reliance on SQL functions in the following code

dfWord = dfText.withColumn("Word", explode(split(col("text"), ' '))).groupBy("Word").count().orderBy(desc("count"))

dfWord.show()



+-----+-----+
| Word|count|
+-----+-----+
|  the|25866|
|   RT|24751|
|rugby|24389|
|   to|16333|
|    a|14673|
|   in|11306|
|   of|11130|
|  and|10737|
|  for| 8756|
|   is| 7034|
|   on| 5671|
|    I| 5522|
|   at| 4442|
|  you| 4143|
| this| 4107|
| with| 4051|
|     | 3820|
| that| 3302|
|  are| 3197|
| from| 3192|
+-----+-----+
only showing top 20 rows



                                                                                

Count the number of characters including spaces

In [177]:
dfChar = spark.sql("SELECT text, LENGTH(text) AS char FROM text ORDER BY char DESC")
dfChar.show(5)



+--------------------+----+
|                text|char|
+--------------------+----+
|RT @Jabu_Macdonal...| 163|
|RT @alt_11013: Ka...| 152|
|RT @OfficialKRU: ...| 152|
|RT @SportsNewsSLi...| 152|
|RT @OfficialKRU: ...| 152|
+--------------------+----+
only showing top 5 rows



                                                                                

Check for special characters i.e. Hashtags

In [178]:
dfSpecChar = spark.sql("SELECT text, regexp_extract_all(text, '(#\\\\w+)', 1) AS Hashtags \
                        FROM text WHERE text like '%#%' ")
dfSpecChar.show()

[Stage 817:>                                                        (0 + 1) / 1]

+--------------------+--------------------+
|                text|            Hashtags|
+--------------------+--------------------+
|RT @ForcesNews: W...|            [#rugby]|
|RT @loverugbyleag...|[#RugbyLeague, #IMG]|
|RT @leinsterrugby...|    [#LEIvZEB, #URC]|
|RT @rugbyworldcup...|              [#OTD]|
|The latest The iS...|[#rugby, #tokyo2020]|
|Check out my rece...|[#england, #rugby...|
|RT @WomenBoks: Wh...|  [#TogetherMovingF]|
|@DoveMenUK @Scotl...|            [#AsOne]|
|RT @Pac12Network:...|      [#NeverForget]|
|#rugby history Bo...|            [#rugby]|
|RT @ultimaterugby...|          [#USAvNZL]|
|RT @wrugbymuseum:...|        [#OnThisDay]|
|RT @kcbrugby: Wak...|[#believe, #commi...|
|@_jodieounsley ha...|[#womensrugby, #w...|
|Great to see so m...|             [#BFEO]|
|RT @btsportrugby:...|    [#GallagherPrem]|
|RT @KobsrugbyUg: ...|[#MilekeShield, #...|
|RT @WalesRugbyL: ...|      [#rugbyleague]|
|I’m I’m of these ...|[#rugby, #footbal...|
|RT @piratesrugbyU...|[#PiratesS

                                                                                

Check for upper case 

In [179]:
dfUpper = spark.sql("SELECT text FROM text WHERE translate(text, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', '') = ''")
dfUpper.show()




+----+
|text|
+----+
+----+



                                                                                

Check for numbers 



In [180]:
dfNum = spark.sql("SELECT text FROM text WHERE translate(text, '0123456789', '') <> text")
dfNum.show()



+--------------------+
|                text|
+--------------------+
|why are rugby lea...|
|RT @FindinLangued...|
|RT @T2Rugby: The ...|
|RT @rugby_sport_x...|
|RT @scarlets_rugb...|
|@AdamWarwick86 @i...|
|@rugby_ap @ColmGr...|
|The Women's @rugb...|
|So football as en...|
|RT @BorkThunder: ...|
|@ShaneWilliams11 ...|
|@MoistenedTart @J...|
|RT @MattHardyJour...|
|RT @berwickrugby:...|
|RT @therugbynetwo...|
|RT @premrugby: Is...|
|English rugby can...|
|@Dermotom @malahi...|
|RT @HowdleSi: Fan...|
|Tag Coaches Corne...|
+--------------------+
only showing top 20 rows



                                                                                

This check does not really advance our understanding of the data, a lot of twitter names have numbers in them

In [181]:
dfNum.count()

                                                                                

29132

Leave only text in the strings, all non alpha numeric characters are removed with the application of the regular expression 

In [182]:

pattern = r'[^a-zA-Z0-9\s]'

dfText = spark.sql("SELECT id, text AS orignialText, text FROM text")
dfText = dfText.withColumn("text", regexp_replace('text', pattern, ''))
dfText.take(5)

                                                                                

[Row(id=1444274428221734923, orignialText='What a game of rugby. Top quality from both teams. So much better than last week 😌', text='What a game of rugby Top quality from both teams So much better than last week '),
 Row(id=1444187035678425092, orignialText="RT @ForcesNews: We've got more LIVE #rugby for you tomorrow. 🏉\n\n@ArmyRugbyLeague 🆚 @RAFRugbyLeague\n\n✅ Two matches\n\nWatch live on our Facebo…", text='RT ForcesNews Weve got more LIVE rugby for you tomorrow \n\nArmyRugbyLeague  RAFRugbyLeague\n\n Two matches\n\nWatch live on our Facebo'),
 Row(id=1477502056814444544, orignialText='why are rugby league players to consistently stupid? Blake Ferguson getting caught with cocaine in Japan is yet ano… https://t.co/2V396Qztle', text='why are rugby league players to consistently stupid Blake Ferguson getting caught with cocaine in Japan is yet ano httpstco2V396Qztle'),
 Row(id=1444181687936688129, orignialText="RT @FindinLanguedoc: Narbonne. Women's rugby in full swing (Finding Langue

In [183]:
dfText.createOrReplaceTempView("text") # this is cleaned txt
dfText = spark.sql("SELECT id, orignialText, LOWER(TRIM(text)) AS text FROM text")

dfText.take(5)

                                                                                

[Row(id=1444274428221734923, orignialText='What a game of rugby. Top quality from both teams. So much better than last week 😌', text='what a game of rugby top quality from both teams so much better than last week'),
 Row(id=1444187035678425092, orignialText="RT @ForcesNews: We've got more LIVE #rugby for you tomorrow. 🏉\n\n@ArmyRugbyLeague 🆚 @RAFRugbyLeague\n\n✅ Two matches\n\nWatch live on our Facebo…", text='rt forcesnews weve got more live rugby for you tomorrow \n\narmyrugbyleague  rafrugbyleague\n\n two matches\n\nwatch live on our facebo'),
 Row(id=1477502056814444544, orignialText='why are rugby league players to consistently stupid? Blake Ferguson getting caught with cocaine in Japan is yet ano… https://t.co/2V396Qztle', text='why are rugby league players to consistently stupid blake ferguson getting caught with cocaine in japan is yet ano httpstco2v396qztle'),
 Row(id=1444181687936688129, orignialText="RT @FindinLanguedoc: Narbonne. Women's rugby in full swing (Finding Langued

In [184]:
#now build a word cloud
from wordcloud import WordCloud, STOPWORDS
dfText.createOrReplaceTempView("text") # this is the trimmed and lowercase txt
dfText = spark.sql("SELECT id, orignialText, text FROM text")
dfText.show(5)



+-------------------+--------------------+--------------------+
|                 id|        orignialText|                text|
+-------------------+--------------------+--------------------+
|1444274428221734923|What a game of ru...|what a game of ru...|
|1444187035678425092|RT @ForcesNews: W...|rt forcesnews wev...|
|1477502056814444544|why are rugby lea...|why are rugby lea...|
|1444181687936688129|RT @FindinLangued...|rt findinlanguedo...|
|1576543830710493184|RT @T2Rugby: The ...|rt t2rugby the st...|
+-------------------+--------------------+--------------------+
only showing top 5 rows



                                                                                

Fix the spelling

In [185]:
#%pip install textblob
from textblob import TextBlob

dfText = spark.sql("SELECT id, orignialText, text FROM text")
#dfText = dfText.withColumn("newtext", col(TextBlob("text").correct()))

dfText.show(5)
dfText.createOrReplaceTempView("text") # this is the preprocessed SQL Style txt



+-------------------+--------------------+--------------------+
|                 id|        orignialText|                text|
+-------------------+--------------------+--------------------+
|1477389703984865287|RT @labour_histor...|rt labourhistory ...|
|1477323861788086273|RT @rugby_sport_x...|rt rugbysportxx  ...|
|1477243301812183043|RT @scarlets_rugb...|rt scarletsrugby ...|
|1511701614725320710|RT @MattHardyJour...|rt matthardyjourn...|
|1477194664642072577|RT @berwickrugby:...|rt berwickrugby w...|
+-------------------+--------------------+--------------------+
only showing top 5 rows



                                                                                

In [186]:



dfText = spark.sql("SELECT * FROM text")

dfText.show(5)



+-------------------+--------------------+--------------------+
|                 id|        orignialText|                text|
+-------------------+--------------------+--------------------+
|1477389703984865287|RT @labour_histor...|rt labourhistory ...|
|1477323861788086273|RT @rugby_sport_x...|rt rugbysportxx  ...|
|1477243301812183043|RT @scarlets_rugb...|rt scarletsrugby ...|
|1511701614725320710|RT @MattHardyJour...|rt matthardyjourn...|
|1477194664642072577|RT @berwickrugby:...|rt berwickrugby w...|
+-------------------+--------------------+--------------------+
only showing top 5 rows



                                                                                

### Tokenize and Stem the tweets

In [187]:
#%pip install nltk

In [188]:
# stackoverflow ref: https://stackoverflow.com/questions/53579444

from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer
from nltk.stem.snowball import SnowballStemmer

In [189]:
# Tokenize text
tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
dfTextTok = tokenizer.transform(dfText).select("text","tokens")
dfText = dfText.join(dfTextTok, on=['text'], how='left_outer')
dfText.show(5)



+--------------------+-------------------+--------------------+--------------------+
|                text|                 id|        orignialText|              tokens|
+--------------------+-------------------+--------------------+--------------------+
|diddlyone kazimkz...|1429767303747489792|@DIDDLYONE @kazim...|[diddlyone, kazim...|
|rt rugbysportxx  ...|1477323861788086273|RT @rugby_sport_x...|[rt, rugbysportxx...|
|rt rugbysportxx  ...|1477323861788086273|RT @rugby_sport_x...|[rt, rugbysportxx...|
|rt rugbysportxx  ...|1477323861788086273|RT @rugby_sport_x...|[rt, rugbysportxx...|
|rt rugbysportxx  ...|1477323861788086273|RT @rugby_sport_x...|[rt, rugbysportxx...|
+--------------------+-------------------+--------------------+--------------------+
only showing top 5 rows





In [190]:
# Remove stopwords
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered")
dfText = remover.transform(dfText).select("id", "text","tokens","filtered")
dfText.show(5)



+-------------------+--------------------+--------------------+--------------------+
|                 id|                text|              tokens|            filtered|
+-------------------+--------------------+--------------------+--------------------+
|1576543830710493184|rt t2rugby the st...|[rt, t2rugby, the...|[rt, t2rugby, sta...|
|1444274428221734923|what a game of ru...|[what, a, game, o...|[game, rugby, top...|
|1444187035678425092|rt forcesnews wev...|[rt, forcesnews, ...|[rt, forcesnews, ...|
|1444181687936688129|rt findinlanguedo...|[rt, findinlangue...|[rt, findinlangue...|
|1477389703984865287|rt labourhistory ...|[rt, labourhistor...|[rt, labourhistor...|
+-------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



                                                                                

Now recheck for the most common words and decide if they need to be removed 

In [191]:
#dfWord = dfText.withColumn("Word", explode(split(col("filtered"), ' '))).groupBy("Word").count().orderBy(desc("count").limit(10))

#dfWord.show()

Now look at rare words 

In [192]:
#dfWord = dfText.withColumn("Word", explode(split(col("text"), ' '))).groupBy("Word").count().orderBy(asc("count").limit(10))

#dfWord.show()

Decided to stem the words as per this page https://stackoverflow.com/questions/53579444

In [193]:
# stem the words
stemmer = SnowballStemmer(language='english')
stemmer_udf = udf(lambda tokens: [stemmer.stem(token) for token in tokens], ArrayType(StringType()))
dfText = dfText.withColumn("filtered_stemmed", stemmer_udf("filtered"))
dfText.show(5)

                                                                                

+-------------------+--------------------+--------------------+--------------------+--------------------+
|                 id|                text|              tokens|            filtered|    filtered_stemmed|
+-------------------+--------------------+--------------------+--------------------+--------------------+
|1576543830710493184|rt t2rugby the st...|[rt, t2rugby, the...|[rt, t2rugby, sta...|[rt, t2rugbi, sta...|
|1444274428221734923|what a game of ru...|[what, a, game, o...|[game, rugby, top...|[game, rugbi, top...|
|1356893087864250369|rugbygayy yeah bu...|[rugbygayy, yeah,...|[rugbygayy, yeah,...|[rugbygayi, yeah,...|
|1444187035678425092|rt forcesnews wev...|[rt, forcesnews, ...|[rt, forcesnews, ...|[rt, forcesnew, w...|
|1444181687936688129|rt findinlanguedo...|[rt, findinlangue...|[rt, findinlangue...|[rt, findinlangue...|
+-------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [194]:
# Filter out short words
filterShortWords = udf(lambda row: [x for x in row if len(x) >= 4], ArrayType(StringType()))
dfText = dfText.withColumn("filtered_stemmed", filterShortWords("filtered_stemmed"))

dfText.show(5)


                                                                                

+-------------------+--------------------+--------------------+--------------------+--------------------+
|                 id|                text|              tokens|            filtered|    filtered_stemmed|
+-------------------+--------------------+--------------------+--------------------+--------------------+
|1477323861788086273|rt rugbysportxx  ...|[rt, rugbysportxx...|[rt, rugbysportxx...|[rugbysportxx, ch...|
|1477323861788086273|rt rugbysportxx  ...|[rt, rugbysportxx...|[rt, rugbysportxx...|[rugbysportxx, ch...|
|1477323861788086273|rt rugbysportxx  ...|[rt, rugbysportxx...|[rt, rugbysportxx...|[rugbysportxx, ch...|
|1477323861788086273|rt rugbysportxx  ...|[rt, rugbysportxx...|[rt, rugbysportxx...|[rugbysportxx, ch...|
|1477323861788086273|rt rugbysportxx  ...|[rt, rugbysportxx...|[rt, rugbysportxx...|[rugbysportxx, ch...|
+-------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [195]:
dfText.count()

                                                                                

110992

In [196]:
# clean up hadoop outplut location
!hadoop fs -rm -R -f /ca4/rugby_twitter.parquet

# write it to hadoop
dfText.coalesce(1).write.parquet("/ca4/rugby_twitter.parquet")

# check if the file was written out
!hadoop fs -ls /ca4/rugby_twitter.parquet

# output to the project folder 
!hadoop fs -get '/ca4/rugby_twitter.parquet' './'

                                                                                

Found 2 items
-rw-r--r--   1 sba22230 supergroup          0 2023-05-18 18:11 /ca4/rugby_twitter.parquet/_SUCCESS
-rw-r--r--   1 sba22230 supergroup   12883520 2023-05-18 18:11 /ca4/rugby_twitter.parquet/part-00000-cd431531-9956-4dea-afee-2766393b332b-c000.snappy.parquet


In [197]:
# select tweet id, geo, lang, quoted_status,quoted_status.geo 
# Having a look at some the data
dfdashboard = spark.sql("SELECT DISTINCT id, CAST(substring(created_at, 27, 4) AS INT) as Year, \
                    CAST(from_unixtime(unix_timestamp(substring(created_at, 5, 3), 'MMM'), 'MM') As INT) as Month, \
                    CAST(substring(created_at, 9, 2) AS INT) as Day,\
                    weekofyear(make_date(CAST(substring(created_at, 27, 4) AS INT),\
                    CAST(from_unixtime(unix_timestamp(substring(created_at, 5, 3), 'MMM'), 'MM') As INT),\
                    CAST(substring(created_at, 9, 2) AS INT))) as wkofYr, \
                    quote_count, reply_count, retweet_count, favorite_count, \
                    lang  \
                    FROM en_tweets")

dfdashboard.show()




+-------------------+----+-----+---+------+-----------+-----------+-------------+--------------+----+
|                 id|Year|Month|Day|wkofYr|quote_count|reply_count|retweet_count|favorite_count|lang|
+-------------------+----+-----+---+------+-----------+-----------+-------------+--------------+----+
|1576199117604372480|2022|   10|  1|    39|          0|          0|            0|             0|  en|
|1444404896208465922|2021|   10|  2|    39|          0|          0|            0|             0|  en|
|1477742977636356098|2022|    1|  2|    52|          0|          0|            0|             0|  en|
|1478691234289967109|2022|    1|  5|     1|          0|          0|            0|             0|  en|
|1431734084213092353|2021|    8| 28|    34|          0|          0|            0|             0|  en|
|1577754675943772164|2022|   10|  5|    40|          0|          0|            0|             0|  en|
|1431658960013062145|2021|    8| 28|    34|          0|          0|            0| 

                                                                                

In [198]:
dfdashboard.printSchema()

root
 |-- id: long (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Day: integer (nullable = true)
 |-- wkofYr: integer (nullable = true)
 |-- quote_count: integer (nullable = true)
 |-- reply_count: integer (nullable = true)
 |-- retweet_count: integer (nullable = true)
 |-- favorite_count: integer (nullable = true)
 |-- lang: string (nullable = true)



In [199]:
display(dfdashboard.count())
dfdashboard.coalesce(1).write.parquet("/ca4/dashboard.parquet")

# check if the file was written out
!hadoop fs -ls /ca4/dashboard.parquet


# output to the project folder 
!hadoop fs -get '/ca4/dashboard.parquet' './'

                                                                                

48726

                                                                                

Found 2 items
-rw-r--r--   1 sba22230 supergroup          0 2023-05-18 18:12 /ca4/dashboard.parquet/_SUCCESS
-rw-r--r--   1 sba22230 supergroup     491576 2023-05-18 18:12 /ca4/dashboard.parquet/part-00000-c18540f7-593f-4fb0-b248-bf2787e12643-c000.snappy.parquet


### Advanced Text processing

N-grams

In [200]:


pdfText = dfText.toPandas()


                                                                                

Term Frequency

In [201]:
type(pdfText)

pandas.core.frame.DataFrame

Inverse Document Frequency

## Building the model 