# Twitter Sentiment Analysis for the word Euro

Connect Spark to Mongo DB

In [None]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars "/usr/local/spark/jars/mongo-spark-connector_2.12-3.0.2.jar,/usr/local/spark/jars/mongo-java-driver-3.12.9.jar" pyspark-shell'

Note the inferschema is set to false, this makes spark read the entire database and not infer the values of fields from the first set of fields

In [None]:
from pyspark.sql import SparkSession
# fix read bug, basically turn off sampling
spark = SparkSession.builder.appName("TwitterMongo") \
.config("spark.mongodb.input.database", "mongodb://localhost:27017/twitter") \
.config("spark.mongodb.input.uri", "mongodb://localhost:27017/twitter.tweets") \
.config("spark.mongodb.read.sql.inferSchema.mapTypes.enabled", "FALSE") \
.config("spark.mongodb.output.uri","mongodb://localhost:27017/twitter.tweets").getOrCreate()

## Create the Session

And load all of the Twitter data in MongoDB

Print out the twitter tweet schema

In [None]:
# create a spark session
spark = SparkSession \
.builder \
.master("local") \
.appName("ABC") \
.config("spark.driver.memory", "15g") \
.config("spark.mongodb.read.connection.uri", "mongodb://localhost:27017/twitter") \
.config("spark.mongodb.write.connection.uri", "mongodb://localhost:27017/twitter") \
.config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector:2.12-3.0.2') \
.getOrCreate()
# read data from mongodb collection "questions" into a dataframe "df"
df = spark.read \
.format("com.mongodb.spark.sql.DefaultSource") \
.option("uri", "mongodb://localhost:27017/twitter") \
.option("database", "twitter") \
.option("collection", "tweets") \
.load()
df.printSchema()

In [None]:
df.show()

Create a spark object of the tweets held in the mongo db 

It is easier to use SQL statements and Pyspark to clean the data rather than writing queries in MongoDB

In [None]:
df.createOrReplaceTempView("tweets")

How many tweets in the DB all together

In [None]:
df = spark.sql("SELECT DISTINCT id FROM tweets")
df.count()

How many tweets by language 


In [None]:
#pip install plotly
import pyspark.pandas as ps
import plotly
dfLang = spark.sql("SELECT DISTINCT lang, CAST(count(id) AS INT) as TweetCount FROM tweets GROUP BY lang \
                   ORDER BY TweetCount DESC LIMIT 10")
dfLang.show()

In [None]:
tempdf = ps.DataFrame(dfLang)

tempdf.plot(kind='bar', x='lang', y='TweetCount')

How many tweets by location

In [None]:
dfLoc = spark.sql("SELECT DISTINCT user.location AS Location, CAST(count(id) AS INT) as TweetCount FROM tweets GROUP BY user.location \
                   ORDER BY TweetCount DESC LIMIT 10")
dfLoc.show()

In [None]:
tempdf = ps.DataFrame(dfLoc)

tempdf.plot(kind='bar', x='Location', y='TweetCount')

Now limit the dataset to English texts and tweets with the Euro in the text

In [None]:
dfEnTwt = spark.sql("SELECT * FROM tweets WHERE lang = 'en' AND text LIKE '%euro%'")
dfEnTwt.createOrReplaceTempView("en_tweets")
dfEnTwt.show()

Now how many tweets in the English language dataset

In [342]:
dfEnTwt.count()

35

In [343]:
dfLoc = spark.sql("SELECT DISTINCT user.location AS Location, CAST(count(id) AS INT) as TweetCount FROM en_tweets GROUP BY user.location \
                   ORDER BY TweetCount DESC LIMIT 10")
dfLoc.show()

+--------------------+----------+
|            Location|TweetCount|
+--------------------+----------+
|                null|        11|
|         Calgary, AB|         1|
|           Edinburgh|         1|
|16 ! (l)gbt ! sw ...|         1|
|       Bath, England|         1|
|mdni. 23ooc! nrs ...|         1|
|                1314|         1|
|             Philly |         1|
|      i'm inside you|         1|
|          Boston, MA|         1|
+--------------------+----------+



In [344]:
# select tweet id, geo, lang, quoted_status,quoted_status.geo 
# Having a look at some the data
dfOne = spark.sql("SELECT DISTINCT id,  text, quote_count, reply_count, retweet_count, favorite_count, geo, place, lang, \
                  quoted_status,quoted_status.geo, quoted_status.text, user.name, user.location   \
                  FROM en_tweets")

In [345]:
dfOne.show()

[Stage 1178:>                                                       (0 + 1) / 1]

+-------------------+--------------------+-----------+-----------+-------------+--------------+----+-----+----+--------------------+----+--------------------+--------------------+-------------------+
|                 id|                text|quote_count|reply_count|retweet_count|favorite_count| geo|place|lang|       quoted_status| geo|                text|                name|           location|
+-------------------+--------------------+-----------+-----------+-------------+--------------+----+-----+----+--------------------+----+--------------------+--------------------+-------------------+
|1444780630374588417|RT @StockholmCF: ...|          0|          0|            0|             0|null| null|  en|                null|null|                null|Sir Boddington MB...|        Switzerland|
|1424665478170136578|@BillyHynes @Scot...|          0|          0|            0|             0|null| null|  en|                null|null|                null|Paul Harrison-Davies|               null|


                                                                                

In [346]:

dfDay= spark.sql("SELECT DISTINCT CAST(substring(created_at, 27, 4) AS INT) as Year, \
          CAST(from_unixtime(unix_timestamp(substring(created_at, 5, 3), 'MMM'), 'MM') As INT) as Month, \
          CAST(substring(created_at, 9, 2) AS INT) as Day, \
          CAST(count(id) AS INT) as TweetCount \
          FROM en_tweets \
          WHERE lang = 'en' AND text LIKE '%euro%' GROUP BY substring(created_at, 27, 4), \
          substring(created_at, 5, 3), \
          substring(created_at, 9, 2)")

dfDay.createOrReplaceTempView("tweetsByDay")

dfDay = spark.sql("SELECT CONCAT(Year, '_', Month, '_', DAY) AS Date, TweetCount  FROM tweetsByDay ORDER BY Year, Month, Day")

dfDay.show()


+---------+----------+
|     Date|TweetCount|
+---------+----------+
| 2021_8_9|        20|
|2021_8_10|         5|
|2021_10_3|        10|
+---------+----------+



In [347]:
tempdf = ps.DataFrame(dfDay)

tempdf.plot(kind='bar', x='Date', y='TweetCount')

2023-05-14 15:23:58,078 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 15:23:58,081 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 15:23:58,325 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 15:23:58,432 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 15:23:58,857 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 15:23:58,871 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to 

In [348]:
dfWeek = spark.sql("SELECT Year, weekofyear(make_date(Year, Month, Day)) as wkofYr , SUM(TweetCount) as TweetCountbyWeek \
                   FROM tweetsByDay \
                   GROUP BY Year, weekofyear(make_date(Year, Month, Day))")

dfWeek.createOrReplaceTempView("tweetsByWeek")
dfWeek = spark.sql("SELECT Year, wkofYr, CONCAT(Year, '_', wkofYr) AS yr_wk, TweetCountbyWeek  FROM tweetsByWeek ORDER BY Year, wkofYr")
dfWeek.show()

+----+------+-------+----------------+
|Year|wkofYr|  yr_wk|TweetCountbyWeek|
+----+------+-------+----------------+
|2021|    32|2021_32|              25|
|2021|    39|2021_39|              10|
+----+------+-------+----------------+



In [349]:
tempdf = ps.DataFrame(dfWeek)

tempdf.plot(kind='bar', x='yr_wk', y='TweetCountbyWeek')

2023-05-14 15:24:02,999 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 15:24:03,003 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 15:24:03,282 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 15:24:03,384 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 15:24:03,543 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 15:24:04,418 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to 

In [350]:
# get the tweet count by month
dfMonth = spark.sql("SELECT Year, Month, SUM(TweetCount) as TweetCountbyMonth \
                     FROM tweetsByDay GROUP BY Year, Month")
dfMonth.createOrReplaceTempView("tweetsByMonth")
dfMonth = spark.sql("SELECT Year, Month, CONCAT(Year, '_', Month) AS MonthYr, TweetCountbyMonth  FROM tweetsByMonth ORDER BY Year, Month ")
dfMonth.show()

+----+-----+-------+-----------------+
|Year|Month|MonthYr|TweetCountbyMonth|
+----+-----+-------+-----------------+
|2021|    8| 2021_8|               25|
|2021|   10|2021_10|               10|
+----+-----+-------+-----------------+



In [351]:
tempdf = ps.DataFrame(dfMonth)

tempdf.plot(kind='bar', x='MonthYr', y='TweetCountbyMonth')

2023-05-14 15:24:07,429 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 15:24:07,433 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 15:24:07,797 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 15:24:07,953 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 15:24:08,133 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 15:24:08,469 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to 

### Next up is text clean up 

In [368]:
#%pip install wordcloud
#%pip install vadersentiment
## sentiment analysis ref: https://www.geeksforgeeks.org/python-sentiment-analysis-using-vader/?ref=lbp
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# function to print sentiments
# of the sentence.
def sentiment_scores(sentence):
 
    # Create a SentimentIntensityAnalyzer object.
    sid_obj = SentimentIntensityAnalyzer()
 
    # polarity_scores method of SentimentIntensityAnalyzer
    # object gives a sentiment dictionary.
    # which contains pos, neg, neu, and compound scores.
    sentiment_dict = sid_obj.polarity_scores(sentence)
     
    print("Overall sentiment dictionary is : ", sentiment_dict)
    #print("sentence was rated as ", sentiment_dict['neg']*100, "% Negative")
    #print("sentence was rated as ", sentiment_dict['neu']*100, "% Neutral")
    #print("sentence was rated as ", sentiment_dict['pos']*100, "% Positive")
 
    #print("Sentence Overall Rated As", end = " ")
 
    # decide sentiment as positive, negative and neutral
    if sentiment_dict['compound'] >= 0.05 :
        print("Positive")
 
    elif sentiment_dict['compound'] <= - 0.05 :
       print("Negative")
 
    else :
        print("Neutral")
    
    return sentiment_dict['compound']

In [369]:
dfText = spark.sql("SELECT DISTINCT id, text AS text FROM en_tweets")
dfText.createOrReplaceTempView("text")
dfText.show()


+-------------------+--------------------+
|                 id|                text|
+-------------------+--------------------+
|1444701521585840131|RT @sevian_frang:...|
|1444534676375416833|roman reloaded is...|
|1424818670937812992|its okay Obi, las...|
|1444665672865361922|RT @business: The...|
|1444705724295049220|@OfficialTravlad ...|
|1424950707652808710|RT @HumairIqbal1:...|
|1444780630374588417|RT @StockholmCF: ...|
|1424814879303733249|@swiftftbridgers ...|
|1424816380877197312|@Lclimateguy @San...|
|1424770658765049858|@talkSPORT If Mes...|
|1424783707219628034|@Tesla @elonmusk ...|
|1424804859124068355|imagining winning...|
|1424805534394458115|ended up paying h...|
|1424881585506107394|RT @Chowman301031...|
|1424922689685307392|#darceyandstacey ...|
|1424839587948531717|@EURO2020 Toats e...|
|1424850673473134601|My euro step to t...|
|1424711758149668864|RT @CovertTorture...|
|1424797804300423178|@BlakeMurphyODC W...|
|1444726385445060618|@lauriedunsire I ...|
+----------

Count the number of words in the tweets 

In [370]:
from pyspark.sql.functions import * 
from pyspark.sql.types import StringType, ArrayType
# heavy reliance on SQL functions in the following code

dfWord = dfText.withColumn("Word", explode(split(col("text"), ' '))).groupBy("Word").count().orderBy(desc("count"))

dfWord.show()

+----+-----+
|Word|count|
+----+-----+
|euro|   22|
|  to|   12|
| the|   11|
|  in|   10|
|  RT|   10|
|   a|    9|
| and|    9|
| for|    7|
|  is|    6|
|   u|    5|
| you|    4|
| are|    4|
| was|    4|
|have|    4|
|  of|    4|
| not|    4|
|   i|    4|
|   I|    4|
|your|    3|
|  go|    3|
+----+-----+
only showing top 20 rows



Count the number of characters including spaces

In [371]:
dfChar = spark.sql("SELECT text, LENGTH(text) AS char FROM text ORDER BY char DESC")
dfChar.show(5)

+--------------------+----+
|                text|char|
+--------------------+----+
|RT @Chowman301031...| 140|
|RT @burntheworldy...| 140|
|#darceyandstacey ...| 140|
|RT @business: The...| 140|
|RT @HumairIqbal1:...| 140|
+--------------------+----+
only showing top 5 rows



Check for special characters i.e. Hashtags

In [372]:
dfSpecChar = spark.sql("SELECT text, regexp_extract_all(text, '(#\\\\w+)', 1) AS Hashtags FROM text WHERE text like '%#%' ")
dfSpecChar.show()

+--------------------+--------------------+
|                text|            Hashtags|
+--------------------+--------------------+
|@OfficialTravlad ...|        [#Travlegit]|
|RT @StockholmCF: ...|[#EU, #SyrianRefu...|
|#darceyandstacey ...|  [#darceyandstacey]|
|RT @CovertTorture...|      [#vaccinePass]|
|@BlakeMurphyODC W...|                [#1]|
|RT @roro_euro: rt...|          [#imgxnct]|
+--------------------+--------------------+



Check for upper case 

In [373]:
dfUpper = spark.sql("SELECT id, text FROM text ")

dfUpper.show()

+-------------------+--------------------+
|                 id|                text|
+-------------------+--------------------+
|1444701521585840131|RT @sevian_frang:...|
|1444534676375416833|roman reloaded is...|
|1424818670937812992|its okay Obi, las...|
|1444665672865361922|RT @business: The...|
|1444705724295049220|@OfficialTravlad ...|
|1424950707652808710|RT @HumairIqbal1:...|
|1444780630374588417|RT @StockholmCF: ...|
|1424814879303733249|@swiftftbridgers ...|
|1424816380877197312|@Lclimateguy @San...|
|1424770658765049858|@talkSPORT If Mes...|
|1424783707219628034|@Tesla @elonmusk ...|
|1424804859124068355|imagining winning...|
|1424805534394458115|ended up paying h...|
|1424881585506107394|RT @Chowman301031...|
|1424922689685307392|#darceyandstacey ...|
|1424839587948531717|@EURO2020 Toats e...|
|1424850673473134601|My euro step to t...|
|1424711758149668864|RT @CovertTorture...|
|1424797804300423178|@BlakeMurphyODC W...|
|1444726385445060618|@lauriedunsire I ...|
+----------

In [374]:

pattern = r'[^a-zA-Z0-9\s]'

dfText = spark.sql("SELECT * FROM text")
dfText = dfText.withColumn("text", regexp_replace('text', pattern, ''))
dfText.take(5)

[Row(id=1444701521585840131, text='RT sevianfrang gonna plan a four week euro trip and leave my phone at home '),
 Row(id=1444534676375416833, text='roman reloaded is euro dance and hyper pop which was very popular in the 2010s a completely different sound from p httpstcort8PxqFjQh'),
 Row(id=1424818670937812992, text='its okay Obi last year you had no idea what a eurostep was and now were trying'),
 Row(id=1444665672865361922, text='RT business The governments 18 billioneuro package sets the economic agenda for the remainder of Chancellor Sebastian Kurzs term https'),
 Row(id=1444705724295049220, text='OfficialTravlad projecteuro Thans brother Travlegit ')]

In [375]:
dfText.createOrReplaceTempView("text") # this is cleaned txt
dfText = spark.sql("SELECT LOWER(TRIM(text)) AS text FROM text")

dfText.take(5)

[Row(text='rt sevianfrang gonna plan a four week euro trip and leave my phone at home'),
 Row(text='roman reloaded is euro dance and hyper pop which was very popular in the 2010s a completely different sound from p httpstcort8pxqfjqh'),
 Row(text='its okay obi last year you had no idea what a eurostep was and now were trying'),
 Row(text='rt business the governments 18 billioneuro package sets the economic agenda for the remainder of chancellor sebastian kurzs term https'),
 Row(text='officialtravlad projecteuro thans brother travlegit')]

In [376]:
#now build a word cloud
from wordcloud import WordCloud, STOPWORDS
dfText.createOrReplaceTempView("text") # this is the trimmed txt
dfText = spark.sql("SELECT text FROM text")
dfText.show(5)

+--------------------+
|                text|
+--------------------+
|rt sevianfrang go...|
|roman reloaded is...|
|its okay obi last...|
|rt business the g...|
|officialtravlad p...|
+--------------------+
only showing top 5 rows



### Tokenize and Stem the tweets

In [377]:
#%pip install nltk

In [378]:
# stackoverflow ref: https://stackoverflow.com/questions/53579444

from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer
from nltk.stem.snowball import SnowballStemmer

In [379]:
# Tokenize text
tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
dfText = tokenizer.transform(dfText).select("text","tokens")

dfText.show(5)

+--------------------+--------------------+
|                text|              tokens|
+--------------------+--------------------+
|rt sevianfrang go...|[rt, sevianfrang,...|
|roman reloaded is...|[roman, reloaded,...|
|its okay obi last...|[its, okay, obi, ...|
|rt business the g...|[rt, business, th...|
|officialtravlad p...|[officialtravlad,...|
+--------------------+--------------------+
only showing top 5 rows



In [380]:
# Remove stopwords
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered")
dfText = remover.transform(dfText).select("text","tokens","filtered")

dfText.show(5)

+--------------------+--------------------+--------------------+
|                text|              tokens|            filtered|
+--------------------+--------------------+--------------------+
|rt sevianfrang go...|[rt, sevianfrang,...|[rt, sevianfrang,...|
|roman reloaded is...|[roman, reloaded,...|[roman, reloaded,...|
|its okay obi last...|[its, okay, obi, ...|[okay, obi, last,...|
|rt business the g...|[rt, business, th...|[rt, business, go...|
|officialtravlad p...|[officialtravlad,...|[officialtravlad,...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [381]:
# stem the words
stemmer = SnowballStemmer(language='english')
stemmer_udf = udf(lambda tokens: [stemmer.stem(token) for token in tokens], ArrayType(StringType()))
dfText = dfText.withColumn("filtered_stemmed", stemmer_udf("filtered"))
dfText.show(5)

[Stage 1332:>                                                       (0 + 1) / 1]

+--------------------+--------------------+--------------------+--------------------+
|                text|              tokens|            filtered|    filtered_stemmed|
+--------------------+--------------------+--------------------+--------------------+
|rt sevianfrang go...|[rt, sevianfrang,...|[rt, sevianfrang,...|[rt, sevianfrang,...|
|roman reloaded is...|[roman, reloaded,...|[roman, reloaded,...|[roman, reload, e...|
|its okay obi last...|[its, okay, obi, ...|[okay, obi, last,...|[okay, obi, last,...|
|rt business the g...|[rt, business, th...|[rt, business, go...|[rt, busi, govern...|
|officialtravlad p...|[officialtravlad,...|[officialtravlad,...|[officialtravlad,...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



                                                                                

In [382]:
# Filter out short words
filterShortWords = udf(lambda row: [x for x in row if len(x) >= 4], ArrayType(StringType()))
dfText = dfText.withColumn("filtered_stemmed", filterShortWords("filtered_stemmed"))

dfText.show(5)


[Stage 1335:>                                                       (0 + 1) / 1]

+--------------------+--------------------+--------------------+--------------------+
|                text|              tokens|            filtered|    filtered_stemmed|
+--------------------+--------------------+--------------------+--------------------+
|rt sevianfrang go...|[rt, sevianfrang,...|[rt, sevianfrang,...|[sevianfrang, gon...|
|roman reloaded is...|[roman, reloaded,...|[roman, reloaded,...|[roman, reload, e...|
|its okay obi last...|[its, okay, obi, ...|[okay, obi, last,...|[okay, last, year...|
|rt business the g...|[rt, business, th...|[rt, business, go...|[busi, govern, bi...|
|officialtravlad p...|[officialtravlad,...|[officialtravlad,...|[officialtravlad,...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



                                                                                