In [None]:
# Find all the tweets by user
# Find how many tweets each user has
# Find all the people mentioned in tweets
# Count how many times each person is mentioned
# Find the 10 most mentioned people
# Find all the hashtags used in a tweet
# Count how many times each hashtag is mentioned
# Find the 10 most popular Hashtags
# Find the top 5 countries which tweet the most

## Load Data

In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the notebook's Spark session, named 'twitter'.
# The postgresql-42.5.0.jar is added to the driver classpath.
spark = (
    SparkSession.builder
    .appName('twitter')
    .config('spark.driver.extraClassPath', '/usr/lib/jvm/java-19-openjdk/lib/postgresql-42.5.0.jar')
    .getOrCreate()
)

22/10/21 14:14:57 WARN Utils: Your hostname, tars resolves to a loopback address: 127.0.1.1; using 192.168.1.66 instead (on interface wlan0)
22/10/21 14:14:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/21 14:14:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
from pyspark.sql.types import StructField, StructType, StringType


# Explicit schema: every field is read as a nullable string. Supplying it up
# front means Spark does not need an extra pass over the JSON file to infer
# column types.
manual_schema = StructType([
    StructField('country', StringType(), True),
    StructField('id', StringType(), True),
    StructField('place', StringType(), True),
    StructField('text', StringType(), True),
    StructField('user', StringType(), True)
])

# Load the tweets with the schema above.
twitter_df = spark.read.format('json').schema(manual_schema).load('data/tweets.json')

# Quick sanity check of the column types and the first few rows.
twitter_df.printSchema()
twitter_df.show(3)

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

root
 |-- country: string (nullable = true)
 |-- id: string (nullable = true)
 |-- place: string (nullable = true)
 |-- text: string (nullable = true)
 |-- user: string (nullable = true)

+-------------+------------------+---------+--------------------+------------------+
|      country|                id|    place|                text|              user|
+-------------+------------------+---------+--------------------+------------------+
|        India|572692378957430785|   Orissa|@always_nidhi @Yo...|   Srkian_nishu :)|
|United States|572575240615796737|Manhattan|@OnlyDancers Bell...|TagineDiningGlobal|
|United States|572575243883036672|Claremont|1/ "Without the a...|       Daniel Beer|
+-------------+------------------+---------+--------------------+------------------+
only showing top 3 rows



In [27]:
# Export the full tweets DataFrame to CSV through pandas.
import os

# pandas' to_csv does not create missing parent directories, so make sure the
# output folder exists before the first export is written.
os.makedirs('output/twitter', exist_ok=True)
twitter_df.toPandas().to_csv('output/twitter/tweets.csv', index=False)

## 1. Find all the tweets by user

In [7]:
from pyspark.sql.functions import col


# Prompt for a user name, then display the tweets whose `user` column
# matches it exactly.
user = input('Enter the user whose tweets to see: ')
twitter_df.where(col('user') == user).show()

Enter the user whose tweets to see: Daniel Beer
+-------------+------------------+---------+--------------------+-----------+
|      country|                id|    place|                text|       user|
+-------------+------------------+---------+--------------------+-----------+
|United States|572575243883036672|Claremont|1/ "Without the a...|Daniel Beer|
+-------------+------------------+---------+--------------------+-----------+



In [28]:
# Write the selected user's tweets to a CSV file (no index column).
(
    twitter_df
    .filter(col('user') == user)
    .toPandas()
    .to_csv('output/twitter/tweets_of_user.csv', index=False)
)

## 2. Find how many tweets each user has

In [9]:
# Number of tweets per user, most active users first.
# The sort uses the renamed column ('num_tweets'): after withColumnRenamed the
# result no longer has a column called 'count', so sorting by the new name
# keeps the query independent of how Spark resolves dropped column names.
num_tweets_df = twitter_df\
    .groupBy(col('user'))\
    .count()\
    .withColumnRenamed('count', 'num_tweets')\
    .orderBy(col('num_tweets').desc())
num_tweets_df.show()

+--------------------+----------+
|                user|num_tweets|
+--------------------+----------+
|       #QuissyUpSoon|       258|
|Inès Mendes Askiip ♥|       185|
|           #4Rentinc|       100|
|                  MV|        58|
|    williampriceking|        46|
|✌ Follow Me MAEJOR ✌|        44|
|    Phillthy McNasty|        43|
|       K.O.H.O.R.T.S|        41|
|  #AMNT KINGTAECRAZY|        41|
|        Ghafla.co.ke|        36|
|        Ully U Music|        35|
|            Codeclic|        33|
|  TagineDiningGlobal|        30|
|           Lord Dash|        30|
|      Herri Setiawan|        29|
|          Dell Feddi|        29|
|   Kidrauhl Forever❤|        25|
|     Trendsmap Paris|        23|
|      #TurnYaSneakUp|        22|
|                Bel |        19|
+--------------------+----------+
only showing top 20 rows



In [29]:
# Write the per-user tweet counts to a CSV file (no index column).
(
    num_tweets_df
    .toPandas()
    .to_csv('output/twitter/num_tweets.csv', index=False)
)

## 3. Find all the people mentioned in tweets

In [30]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType

def generate_mentioned_user_list(text):
    """Return the user handles mentioned in a tweet, without the leading '@'.

    Args:
        text: The tweet text, or None when the tweet has no text.

    Returns:
        A list of handles in order of appearance (duplicates kept). The list
        is empty when `text` is None or empty, or contains no mention.
    """
    # The `text` column is nullable, so a missing value must not raise.
    if not text:
        return []
    # split() with no argument breaks on any whitespace run, so mentions that
    # follow a newline or tab are found as well as those after a space.
    handles = (token.lstrip('@') for token in text.split() if token.startswith('@'))
    # A token made only of '@' characters leaves an empty handle; skip those.
    return [handle for handle in handles if handle]


# Wrap the mention extractor as a Spark UDF returning an array of strings,
# then add its result for each tweet's text as the 'users_mentioned' column.
twitter_df_with_mentioned = twitter_df.withColumn(
    'users_mentioned',
    udf(generate_mentioned_user_list, ArrayType(StringType()))(col('text')),
)
twitter_df_with_mentioned.show(5)

+-------------+------------------+---------+--------------------+------------------+--------------------+
|      country|                id|    place|                text|              user|     users_mentioned|
+-------------+------------------+---------+--------------------+------------------+--------------------+
|        India|572692378957430785|   Orissa|@always_nidhi @Yo...|   Srkian_nishu :)|[always_nidhi, Yo...|
|United States|572575240615796737|Manhattan|@OnlyDancers Bell...|TagineDiningGlobal|       [OnlyDancers]|
|United States|572575243883036672|Claremont|1/ "Without the a...|       Daniel Beer|                  []|
|United States|572575252020109313|   Vienna|idk why people ha...|  someone actually|                  []|
|United States|572575274539356160|   Boston|Taste of Iceland!...|    BostonAttitude|    [IcelandNatural]|
+-------------+------------------+---------+--------------------+------------------+--------------------+
only showing top 5 rows



In [44]:
# Write the tweets together with their mentioned-user lists to a CSV file.
(
    twitter_df_with_mentioned
    .toPandas()
    .to_csv('output/twitter/mentioned_users.csv', index=False)
)

## 4. Count how many times each person is mentioned

In [32]:
from pyspark.sql.functions import explode

# explode() turns each element of the 'users_mentioned' array into its own
# row; any empty-string handles are then dropped.
mentioned_only_df = (
    twitter_df_with_mentioned
    .select(explode(col('users_mentioned')).alias('users_mentioned'))
    .filter(col('users_mentioned') != '')
)

# Show how many times each user is mentioned (untruncated handles).
mentioned_only_df.groupBy('users_mentioned').count().show(truncate=False)

+---------------+-----+
|users_mentioned|count|
+---------------+-----+
|DjRockyUg      |1    |
|TrillHD        |1    |
|TimmysWell     |1    |
|brookie_baldwin|1    |
|TTTorrez       |2    |
|boytoyjesse    |1    |
|misstoriblack  |1    |
|globalstatmusic|1    |
|_fuckgio       |1    |
|PedroIvoChianca|1    |
|Cpiepz         |1    |
|avachristy3    |1    |
|lostbayouramble|1    |
|bellahadid     |1    |
|sawano_nZk's   |1    |
|marIboros      |1    |
|kochamjacksona |1    |
|WIOD           |2    |
|ShaelynCherie  |2    |
|KevinAnex      |1    |
+---------------+-----+
only showing top 20 rows



In [45]:
# Write the per-user mention counts to a CSV file (no index column).
(
    mentioned_only_df
    .groupBy('users_mentioned')
    .count()
    .toPandas()
    .to_csv('output/twitter/mentioned_users_count.csv', index=False)
)

## 5. Find the 10 most mentioned persons

In [34]:
# The ten most frequently mentioned users: count mentions per handle,
# sort by that count in descending order and keep the first ten rows.
top_mentioned_df = (
    mentioned_only_df
    .groupBy('users_mentioned')
    .count()
    .orderBy(col('count').desc())
    .limit(10)
)

top_mentioned_df.show()

+---------------+-----+
|users_mentioned|count|
+---------------+-----+
|    ShawnMendes|  189|
|  HIITMANonDECK|  100|
|officialdjjuice|   59|
|         MAEJOR|   45|
|    MR_JAYJONES|   41|
|       MeekMill|   35|
|MadisonElleBeer|   30|
|              …|   28|
|     DjLordDash|   27|
|     NICKIMINAJ|   25|
+---------------+-----+



In [35]:
# Write the ten most mentioned users to a CSV file (no index column).
(
    top_mentioned_df
    .toPandas()
    .to_csv('output/twitter/top_mentioned.csv', index=False)
)

## 6. Find all the hashtags used in a tweet

In [36]:
def generate_hashtags_list(text):
    """Return the hashtags found in a tweet, each keeping its leading '#'.

    Args:
        text: The tweet text, or None when the tweet has no text.

    Returns:
        A list of hashtags in order of appearance (duplicates kept). The list
        is empty when `text` is None or empty, or contains no hashtag.
    """
    # The `text` column is nullable, so a missing value must not raise.
    if not text:
        return []
    # split() with no argument breaks on any whitespace run, so hashtags that
    # follow a newline or tab are found as well as those after a space.
    # A lone '#' carries no tag name and is skipped.
    return [token for token in text.split() if token.startswith('#') and len(token) > 1]

# Wrap the hashtag extractor as a Spark UDF returning an array of strings,
# then add its result for each tweet's text as the 'hashtags' column.
twitter_df_with_hashtags = twitter_df.withColumn(
    'hashtags',
    udf(generate_hashtags_list, ArrayType(StringType()))(col('text')),
)
twitter_df_with_hashtags.show(5)

+-------------+------------------+---------+--------------------+------------------+--------+
|      country|                id|    place|                text|              user|hashtags|
+-------------+------------------+---------+--------------------+------------------+--------+
|        India|572692378957430785|   Orissa|@always_nidhi @Yo...|   Srkian_nishu :)|      []|
|United States|572575240615796737|Manhattan|@OnlyDancers Bell...|TagineDiningGlobal|      []|
|United States|572575243883036672|Claremont|1/ "Without the a...|       Daniel Beer|      []|
|United States|572575252020109313|   Vienna|idk why people ha...|  someone actually|      []|
|United States|572575274539356160|   Boston|Taste of Iceland!...|    BostonAttitude|      []|
+-------------+------------------+---------+--------------------+------------------+--------+
only showing top 5 rows



In [37]:
# Write the tweets together with their hashtag lists to a CSV file.
(
    twitter_df_with_hashtags
    .toPandas()
    .to_csv('output/twitter/hashtags.csv', index=False)
)

## 7. Count how many times each hashtag is mentioned

In [38]:
# One row per hashtag occurrence: explode the per-tweet arrays and drop any
# empty strings.
hashtags_only_df = (
    twitter_df_with_hashtags
    .select(explode(col('hashtags')).alias('hashtags'))
    .filter(col('hashtags') != '')
)

# Show how many times each hashtag occurs.
hashtags_only_df.groupBy('hashtags').count().show()

+--------------------+-----+
|            hashtags|count|
+--------------------+-----+
|               #2NE1|    3|
|         #musicLover|    1|
|           #IBMCloud|    2|
|#flexrecordingstudio|    1|
|            #Hottest|    1|
|        #VanessaBorn|    1|
|        #happychappy|    1|
|          #yyjevents|    1|
|      #LittleLionMan|    1|
|           #MBAMBADU|    7|
|     #misheardlyrics|    1|
|              #Indie|    2|
|             #family|    1|
|          #beautiful|    2|
|             #Waiter|    1|
|             #friend|    1|
|    #recuseimitaçoes|    1|
|             #airbnb|    1|
|              #BØRNS|    1|
|         #ChickCorea|    1|
+--------------------+-----+
only showing top 20 rows



In [39]:
# Write the per-hashtag counts to a CSV file (no index column).
(
    hashtags_only_df
    .groupBy('hashtags')
    .count()
    .toPandas()
    .to_csv('output/twitter/hashtag_count.csv', index=False)
)

## 8. Find the 10 most popular Hashtags

In [40]:
# The ten most used hashtags: count occurrences per hashtag, sort by that
# count in descending order and keep the first ten rows.
top_hashtags_df = (
    hashtags_only_df
    .groupBy('hashtags')
    .count()
    .orderBy(col('count').desc())
    .limit(10)
)

top_hashtags_df.show()

+-------------------+-----+
|           hashtags|count|
+-------------------+-----+
|               #DME|  253|
|          #ROADBOYZ|  251|
|             #music|  236|
|             #Paris|  144|
|#QuissyUpSoon🔥🔥💯|  129|
|      #QuissyUpSoon|  120|
| #Trippythursdaymia|  100|
|             #Music|   84|
|    #MaejorMeAndYou|   44|
|              #IGGL|   41|
+-------------------+-----+



In [41]:
# Write the ten most used hashtags to a CSV file (no index column).
(
    top_hashtags_df
    .toPandas()
    .to_csv('output/twitter/top_hashtags.csv', index=False)
)

## 9. Find the top 5 countries which tweet the most

In [42]:
# The five countries with the most tweets: count tweets per country, expose
# the count as 'num_tweets', sort descending and keep the first five rows.
top_countries_df = (
    twitter_df
    .groupBy('country')
    .count()
    .withColumnRenamed('count', 'num_tweets')
    .orderBy(col('num_tweets').desc())
    .limit(5)
)

top_countries_df.show()

+--------------+----------+
|       country|num_tweets|
+--------------+----------+
| United States|      4841|
|        France|       737|
|     Indonesia|       370|
|United Kingdom|       365|
|        Brasil|       256|
+--------------+----------+



In [43]:
# Write the five most active countries to a CSV file (no index column).
(
    top_countries_df
    .toPandas()
    .to_csv('output/twitter/top_countries.csv', index=False)
)