In [None]:
# Find all the tweets by a given user
# Find how many tweets each user has
# Find all the people mentioned in tweets
# Count how many times each person is mentioned
# Find the 10 most mentioned people
# Find all the hashtags mentioned in a tweet
# Count how many times each hashtag is mentioned
# Find the 10 most popular hashtags
# Find the top 5 countries that tweet the most


## Load Data

In [8]:
from pyspark.sql import SparkSession

# Obtain a Spark session for this notebook, registered under the app name 'twitter'.
spark = (
    SparkSession.builder
    .appName('twitter')
    .getOrCreate()
)

In [88]:
from pyspark.sql.types import StructField, StructType, StringType


# Declare the schema explicitly: all five fields are read as nullable strings.
# Supplying it up front means Spark does not have to infer the schema from the
# JSON file before loading it.
manual_schema = StructType([
    StructField('country', StringType(), True),
    StructField('id', StringType(), True),
    StructField('place', StringType(), True),
    StructField('text', StringType(), True),
    StructField('user', StringType(), True)
])

twitter_df = spark.read.format('json').schema(manual_schema).load('data/tweets.json')

# Quick sanity check of the loaded frame: column types plus the first three rows.
twitter_df.printSchema()
twitter_df.show(3)

root
 |-- country: string (nullable = true)
 |-- id: string (nullable = true)
 |-- place: string (nullable = true)
 |-- text: string (nullable = true)
 |-- user: string (nullable = true)

+-------------+------------------+---------+--------------------+------------------+
|      country|                id|    place|                text|              user|
+-------------+------------------+---------+--------------------+------------------+
|        India|572692378957430785|   Orissa|@always_nidhi @Yo...|   Srkian_nishu :)|
|United States|572575240615796737|Manhattan|@OnlyDancers Bell...|TagineDiningGlobal|
|United States|572575243883036672|Claremont|1/ "Without the a...|       Daniel Beer|
+-------------+------------------+---------+--------------------+------------------+
only showing top 3 rows



## 1. Find all the tweets by user

In [89]:
from pyspark.sql.functions import col


# Ask for a user name, then display the tweets whose `user` column equals it exactly.
user = input('Enter the user whose tweets to see: ')
(
    twitter_df
    .filter(col('user') == user)
    .show()
)

Enter the user whose tweets to see: Daniel Beer
+-------------+------------------+---------+--------------------+-----------+
|      country|                id|    place|                text|       user|
+-------------+------------------+---------+--------------------+-----------+
|United States|572575243883036672|Claremont|1/ "Without the a...|Daniel Beer|
+-------------+------------------+---------+--------------------+-----------+



## 2. Find how many tweets each user has

In [90]:
# Tweets per user, most active users first (top 5 shown).
# The sort key is 'num_tweets', the name the count column carries after the
# rename, so the ordering refers to a column that is present in the result.
twitter_df\
    .groupBy(col('user'))\
    .count()\
    .withColumnRenamed('count', 'num_tweets')\
    .orderBy(col('num_tweets').desc())\
    .show(5)

+--------------------+----------+
|                user|num_tweets|
+--------------------+----------+
|       #QuissyUpSoon|       258|
|Inès Mendes Askiip ♥|       185|
|           #4Rentinc|       100|
|                  MV|        58|
|    williampriceking|        46|
+--------------------+----------+
only showing top 5 rows



## 3. Find all the people mentioned in tweets

In [92]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType

def generate_mentioned_user_list(text):
    """Return the user handles mentioned in a tweet, without the leading '@'.

    Args:
        text: The tweet text, or None when the tweet has no text.

    Returns:
        A list of handles in order of appearance (duplicates are kept).
        The list is empty when `text` is None or contains no mentions.
    """
    if text is None:
        # Calling .split on None would raise AttributeError.
        return []
    # split() with no argument breaks on any run of whitespace, so a mention
    # that follows a newline or tab is recognised as well.
    handles = (token.lstrip('@') for token in text.split() if token.startswith('@'))
    # A token made only of '@' characters leaves an empty handle; drop it.
    return [handle for handle in handles if handle]


# Wrap the extractor as a Spark UDF returning array<string>, then attach its
# result to every tweet as the new column 'users_mentioned'.
extract_mentions_udf = udf(generate_mentioned_user_list, ArrayType(StringType()))
twitter_df_with_mentioned = twitter_df.withColumn(
    'users_mentioned',
    extract_mentions_udf(col('text')),
)
twitter_df_with_mentioned.show()

+--------------------+------------------+-----------------+--------------------+-------------------+--------------------+
|             country|                id|            place|                text|               user|     users_mentioned|
+--------------------+------------------+-----------------+--------------------+-------------------+--------------------+
|               India|572692378957430785|           Orissa|@always_nidhi @Yo...|    Srkian_nishu :)|[always_nidhi, Yo...|
|       United States|572575240615796737|        Manhattan|@OnlyDancers Bell...| TagineDiningGlobal|       [OnlyDancers]|
|       United States|572575243883036672|        Claremont|1/ "Without the a...|        Daniel Beer|                  []|
|       United States|572575252020109313|           Vienna|idk why people ha...|   someone actually|                  []|
|       United States|572575274539356160|           Boston|Taste of Iceland!...|     BostonAttitude|    [IcelandNatural]|
|       United States|57

## 4. Count how many times each person is mentioned

In [93]:
from pyspark.sql.functions import explode


# explode flattens the previous step array into a column
# One row per mention: explode() turns each element of the users_mentioned
# array into its own row. Empty handles (e.g. from a bare '@' token) are
# filtered out.
mentioned_only_df = (
    twitter_df_with_mentioned
    .select(explode(col('users_mentioned')).alias('users_mentioned'))
    .filter(col('users_mentioned') != '')
)

# Number of times each user is mentioned.
(
    mentioned_only_df
    .groupBy('users_mentioned')
    .count()
    .show(truncate=False)
)

+---------------+-----+
|users_mentioned|count|
+---------------+-----+
|DjRockyUg      |1    |
|TrillHD        |1    |
|TimmysWell     |1    |
|brookie_baldwin|1    |
|TTTorrez       |2    |
|boytoyjesse    |1    |
|misstoriblack  |1    |
|globalstatmusic|1    |
|_fuckgio       |1    |
|PedroIvoChianca|1    |
|Cpiepz         |1    |
|avachristy3    |1    |
|lostbayouramble|1    |
|bellahadid     |1    |
|sawano_nZk's   |1    |
|marIboros      |1    |
|kochamjacksona |1    |
|WIOD           |2    |
|ShaelynCherie  |2    |
|KevinAnex      |1    |
+---------------+-----+
only showing top 20 rows



## 5. Find the 10 most mentioned persons

In [94]:
# Ten most-mentioned users: count mentions per handle, sort by that count in
# descending order, and keep the first ten rows.
mentioned_only_df\
    .groupBy('users_mentioned')\
    .count()\
    .orderBy(col('count').desc())\
    .limit(10)\
    .show()

+---------------+-----+
|users_mentioned|count|
+---------------+-----+
|    ShawnMendes|  189|
|  HIITMANonDECK|  100|
|officialdjjuice|   59|
|         MAEJOR|   45|
|    MR_JAYJONES|   41|
|       MeekMill|   35|
|MadisonElleBeer|   30|
|              …|   28|
|     DjLordDash|   27|
|     NICKIMINAJ|   25|
+---------------+-----+



## 6. Find all the hashtags mentioned in a tweet

In [95]:
def generate_hashtags_list(text):
    """Return the hashtags found in a tweet, each keeping its leading '#'.

    Args:
        text: The tweet text, or None when the tweet has no text.

    Returns:
        A list of hashtag tokens in order of appearance (duplicates are kept).
        The list is empty when `text` is None or contains no hashtags.
    """
    if text is None:
        # Calling .split on None would raise AttributeError.
        return []
    # split() with no argument breaks on any run of whitespace, so a hashtag
    # that follows a newline or tab is recognised as well. A lone '#' carries
    # no tag name and is skipped.
    return [token for token in text.split() if token.startswith('#') and len(token) > 1]

# Wrap the extractor as a Spark UDF returning array<string>, then attach its
# result to every tweet as the new column 'hashtags'.
extract_hashtags_udf = udf(generate_hashtags_list, ArrayType(StringType()))
twitter_df_with_hashtags = twitter_df.withColumn(
    'hashtags',
    extract_hashtags_udf(col('text')),
)
twitter_df_with_hashtags.show()

+--------------------+------------------+-----------------+--------------------+-------------------+--------------------+
|             country|                id|            place|                text|               user|            hashtags|
+--------------------+------------------+-----------------+--------------------+-------------------+--------------------+
|               India|572692378957430785|           Orissa|@always_nidhi @Yo...|    Srkian_nishu :)|                  []|
|       United States|572575240615796737|        Manhattan|@OnlyDancers Bell...| TagineDiningGlobal|                  []|
|       United States|572575243883036672|        Claremont|1/ "Without the a...|        Daniel Beer|                  []|
|       United States|572575252020109313|           Vienna|idk why people ha...|   someone actually|                  []|
|       United States|572575274539356160|           Boston|Taste of Iceland!...|     BostonAttitude|                  []|
|       United States|57

## 7. Count how many times each hashtag is mentioned

In [96]:
# One row per hashtag occurrence: explode() turns each element of the hashtags
# array into its own row; empty strings are filtered out.
hashtags_only_df = (
    twitter_df_with_hashtags
    .select(explode(col('hashtags')).alias('hashtags'))
    .filter(col('hashtags') != '')
)

# Number of times each hashtag appears.
(
    hashtags_only_df
    .groupBy('hashtags')
    .count()
    .show()
)

+--------------------+-----+
|            hashtags|count|
+--------------------+-----+
|               #2NE1|    3|
|         #musicLover|    1|
|           #IBMCloud|    2|
|#flexrecordingstudio|    1|
|            #Hottest|    1|
|        #VanessaBorn|    1|
|        #happychappy|    1|
|          #yyjevents|    1|
|      #LittleLionMan|    1|
|           #MBAMBADU|    7|
|     #misheardlyrics|    1|
|              #Indie|    2|
|             #family|    1|
|          #beautiful|    2|
|             #Waiter|    1|
|             #friend|    1|
|    #recuseimitaçoes|    1|
|             #airbnb|    1|
|              #BØRNS|    1|
|         #ChickCorea|    1|
+--------------------+-----+
only showing top 20 rows



## 8. Find the 10 most popular Hashtags

In [97]:
# Ten most frequent hashtags: count per tag, sort by count descending, keep ten.
# Tags are compared as exact strings, so variants differing in case or trailing
# characters are counted separately.
(
    hashtags_only_df
    .groupBy('hashtags')
    .count()
    .orderBy(col('count').desc())
    .limit(10)
    .show()
)

+-------------------+-----+
|           hashtags|count|
+-------------------+-----+
|               #DME|  253|
|          #ROADBOYZ|  251|
|             #music|  236|
|             #Paris|  144|
|#QuissyUpSoon🔥🔥💯|  129|
|      #QuissyUpSoon|  120|
| #Trippythursdaymia|  100|
|             #Music|   84|
|    #MaejorMeAndYou|   44|
|              #IGGL|   41|
+-------------------+-----+



## 9. Find the top 5 countries which tweet the most

In [98]:
# Five countries with the most tweets: count rows per country, expose the count
# as 'num_tweets', sort by it in descending order, and keep the first five.
(
    twitter_df
    .groupBy('country')
    .count()
    .withColumnRenamed('count', 'num_tweets')
    .orderBy(col('num_tweets').desc())
    .limit(5)
    .show()
)

+--------------+----------+
|       country|num_tweets|
+--------------+----------+
| United States|      4841|
|        France|       737|
|     Indonesia|       370|
|United Kingdom|       365|
|        Brasil|       256|
+--------------+----------+

