# Challenge Resolution

In [None]:
import sys

# Colab-only setup: outside Colab this cell does nothing.
if "google.colab" in sys.modules:
    from google.colab import auth, drive

    auth.authenticate_user()
    # The dataset is read from /content/drive/MyDrive further down, so Drive
    # has to be mounted here for a fresh "Run all" to find the file.
    drive.mount("/content/drive")

## Installing dependencies

In [92]:
!pip install pyspark
!pip uninstall emoji
!pip install advertools

Found existing installation: emoji 2.8.0
Uninstalling emoji-2.8.0:
  Would remove:
    /usr/local/lib/python3.10/dist-packages/emoji-2.8.0.dist-info/*
    /usr/local/lib/python3.10/dist-packages/emoji/*
Proceed (Y/n)? y
  Successfully uninstalled emoji-2.8.0


# Solución problema 1:


1.   Top 10 fechas con más tweets
2.   Mostrar el nombre de los usuarios que más tweets realizaron en esas fechas



In [21]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one; otherwise start a new one
# registered under the application name "Latamchallenge".
spark = (
    SparkSession.builder
    .appName("Latamchallenge")
    .getOrCreate()
)

In [22]:
import advertools as adv
import pyspark.sql.functions as F
from pyspark.sql.window import Window

In [23]:
# Load the raw tweets; spark.read.json infers the schema from the file.
rdata = spark.read.json('/content/drive/MyDrive/data/farmers-protest-tweets-2021-2-4.json')

# printSchema() prints the tree itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
rdata.printSchema()
rdata.show(5)
# (number of rows, number of columns)
print((rdata.count(), len(rdata.columns)))

root
 |-- content: string (nullable = true)
 |-- conversationId: long (nullable = true)
 |-- date: string (nullable = true)
 |-- id: long (nullable = true)
 |-- lang: string (nullable = true)
 |-- likeCount: long (nullable = true)
 |-- media: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- duration: double (nullable = true)
 |    |    |-- fullUrl: string (nullable = true)
 |    |    |-- previewUrl: string (nullable = true)
 |    |    |-- thumbnailUrl: string (nullable = true)
 |    |    |-- type: string (nullable = true)
 |    |    |-- variants: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- bitrate: long (nullable = true)
 |    |    |    |    |-- contentType: string (nullable = true)
 |    |    |    |    |-- url: string (nullable = true)
 |-- mentionedUsers: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- created: string (nullable = true)
 |    |   

In [24]:
# Keep only what question 1 needs: the tweet's calendar date (the raw `date`
# string converted with to_date) and the author's username.
q1_data = rdata.select(F.to_date('date').alias('date'), 'user.username')
q1_data.show(5)

+----------+---------------+
|      date|       username|
+----------+---------------+
|2021-02-24|ArjunSinghPanam|
|2021-02-24|     PrdeepNain|
|2021-02-24| parmarmaninder|
|2021-02-24|  anmoldhaliwal|
|2021-02-24|     KotiaPreet|
+----------+---------------+
only showing top 5 rows



In [25]:
# Number of tweets per (date, username) pair.
# This is the base table for both the per-date and the per-user rankings below.
date_activity = (
    q1_data
    .groupBy('date', 'username')
    .agg(F.count('*').alias('count'))
)
date_activity.show()

+----------+---------------+-----+
|      date|       username|count|
+----------+---------------+-----+
|2021-02-24| BumblebeeUmeed|    1|
|2021-02-24|       htTweets|    2|
|2021-02-24|       v_sanjai|    1|
|2021-02-24|     dr_sonia27|    1|
|2021-02-24|   RamneetMann4|    1|
|2021-02-24|   JPSinghRuhil|    2|
|2021-02-24|   BjpReporting|    1|
|2021-02-24| manesh67726670|    1|
|2021-02-24|        AKulvir|    1|
|2021-02-24|       HAchahal|    1|
|2021-02-24| AnuragVerma_SP|    1|
|2021-02-24|Gurwind33930102|    3|
|2021-02-23|   TechieKisaan|    2|
|2021-02-23| MindsetMatter1|    1|
|2021-02-23|  Nimratkhaira_|    1|
|2021-02-23|    Lats_tweets|    1|
|2021-02-23|Deshbha99450233|    1|
|2021-02-23|     SecondEye5|    1|
|2021-02-23|     alieshan09|    1|
|2021-02-23|      SikhWhite|    1|
+----------+---------------+-----+
only showing top 20 rows



In [136]:
# Collapse the per-user counts into a single tweet total per date.
date_total_activity = (
    date_activity
    .groupBy('date')
    .sum('count')
    .withColumnRenamed('sum(count)', 'total_activity')
)
date_total_activity.show()

+----------+--------------+
|      date|total_activity|
+----------+--------------+
|2021-02-15|          9197|
|2021-02-21|          7532|
|2021-02-19|          8204|
|2021-02-20|          8502|
|2021-02-17|         11087|
|2021-02-24|          3437|
|2021-02-18|          9625|
|2021-02-14|         10249|
|2021-02-12|         12347|
|2021-02-22|          7071|
|2021-02-13|         11296|
|2021-02-23|          8417|
|2021-02-16|         10443|
+----------+--------------+



In [137]:
# Rank the days by total activity with an analytic function (1 = busiest day).
# The date is a secondary sort key: row_number() breaks ties arbitrarily, so
# without it two days with the same total could swap ranks between runs.
window_spec = Window.orderBy(F.desc("total_activity"), F.asc("date"))
date_total_activity = date_total_activity.withColumn("rank", F.row_number().over(window_spec))

date_total_activity.show()

+----------+--------------+----+
|      date|total_activity|rank|
+----------+--------------+----+
|2021-02-12|         12347|   1|
|2021-02-13|         11296|   2|
|2021-02-17|         11087|   3|
|2021-02-16|         10443|   4|
|2021-02-14|         10249|   5|
|2021-02-18|          9625|   6|
|2021-02-15|          9197|   7|
|2021-02-20|          8502|   8|
|2021-02-23|          8417|   9|
|2021-02-19|          8204|  10|
|2021-02-21|          7532|  11|
|2021-02-22|          7071|  12|
|2021-02-24|          3437|  13|
+----------+--------------+----+



In [138]:
# Keep only the ten best-ranked dates (rank 1 through 10).
top_10_dates = date_total_activity.where(F.col("rank") <= 10)
top_10_dates.show()

+----------+--------------+----+
|      date|total_activity|rank|
+----------+--------------+----+
|2021-02-12|         12347|   1|
|2021-02-13|         11296|   2|
|2021-02-17|         11087|   3|
|2021-02-16|         10443|   4|
|2021-02-14|         10249|   5|
|2021-02-18|          9625|   6|
|2021-02-15|          9197|   7|
|2021-02-20|          8502|   8|
|2021-02-23|          8417|   9|
|2021-02-19|          8204|  10|
+----------+--------------+----+



In [140]:
# For every date, pick the single user who tweeted the most.
# date_activity already holds one row per (date, username), so its count is the
# user's activity for that day and no further aggregation is needed.
# Ties on user_activity are broken by username so the winner is deterministic.
user_rank_window = Window.partitionBy("date").orderBy(F.desc("user_activity"), F.asc("username"))
most_active_users_per_date = (
    date_activity
    .select("date", "username", F.col("count").alias("user_activity"))
    .withColumn("rank", F.row_number().over(user_rank_window))
    .filter("rank = 1")
)

# Attach the top user to each of the top-10 dates.
# A join gives no guarantee about row order, so the result is sorted explicitly
# by the date ranking. The user-side "rank" column is dropped first so that
# "rank" unambiguously refers to the date ranking.
list_top10 = (
    top_10_dates
    .join(most_active_users_per_date.drop("rank"), on=["date"], how="left")
    .orderBy("rank")
    .select("date", "username")
)
export = [(row["date"], row["username"]) for row in list_top10.collect()]
export

[(datetime.date(2021, 2, 12), 'RanbirS00614606'),
 (datetime.date(2021, 2, 13), 'MaanDee08215437'),
 (datetime.date(2021, 2, 17), 'RaaJVinderkaur'),
 (datetime.date(2021, 2, 16), 'jot__b'),
 (datetime.date(2021, 2, 14), 'rebelpacifist'),
 (datetime.date(2021, 2, 18), 'neetuanjle_nitu'),
 (datetime.date(2021, 2, 15), 'jot__b'),
 (datetime.date(2021, 2, 20), 'MangalJ23056160'),
 (datetime.date(2021, 2, 23), 'Surrypuria'),
 (datetime.date(2021, 2, 19), 'Preetm91')]

# Solución problema 2:


1.   10 Emojis más usados
2.   Conteo de cada emoji



In [65]:
# Only the tweet text is needed for the emoji question.
q2_data = rdata.select(F.col('content'))

# Show the first 20 rows without truncating long tweets.
q2_data.show(20, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|content                                                                                                                                                                                                                                                                                                             |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|The world progresses while the Indian police and Govt are still tr

In [84]:
# Collect every tweet text to the driver as a plain Python list.
# `content` is nullable in the schema, so nulls are dropped to guarantee the
# list contains only strings.
tweets = [row["content"] for row in q2_data.where(F.col("content").isNotNull()).collect()]

# Preview only: echoing the whole list would dump every tweet into the cell output.
tweets[:5]

['The world progresses while the Indian police and Govt are still trying to take India back to the horrific past through its tyranny. \n\n@narendramodi @DelhiPolice Shame on you. \n\n#ModiDontSellFarmers \n#FarmersProtest \n#FreeNodeepKaur https://t.co/es3kn0IQAF',
 "#FarmersProtest \n#ModiIgnoringFarmersDeaths \n#ModiDontSellFarmers \n@Kisanektamorcha \nFarmers constantly distroying crops throughout India. \nReally, it's hearts breaking...we care about our crops like our children. And govt. agriculture minister is laughing on us🚜🌾WE WILL WIN💪 https://t.co/kLspngG9xE",
 "ਪੈਟਰੋਲ ਦੀਆਂ ਕੀਮਤਾਂ ਨੂੰ ਮੱਦੇਨਜ਼ਰ ਰੱਖਦੇ ਹੋਏ \nਮੇਰੇ ਹਿਸਾਬ ਨਾਲ ਬਾਹਰ(ਪ੍ਰਦੇਸ਼) ਜਾਣ ਨਾਲੋਂ ਬਿਹਤਰ ਆ ਭਾਰਤ 'ਚ ਪੈਟਰੋਲ ਪੰਪ ਪਾ ਲਈਏ। 🤫🤫🤔🤔\n#FarmersProtest",
 '@ReallySwara @rohini_sgh watch full video here https://t.co/wBPNdJdB0n\n#farmersprotest #NoFarmersNoFood https://t.co/fUsTOKOcXK',
 '#KisanEktaMorcha #FarmersProtest #NoFarmersNoFood https://t.co/g9TYYBHQRH',
 'Jai jwaan jai kissan #FarmersProtest #ModiIgnoringFarmersDeaths htt

In [88]:
# Run advertools' extract_emoji over the collected tweet texts.
content_list = adv.extract_emoji(tweets)

# 'top_emoji' holds (emoji, count) pairs; keep the first ten.
# NOTE(review): presumably sorted by descending count, as the output suggests - confirm.
top_emoji = content_list['top_emoji']
top_emoji[:10]

[('🙏', 5049),
 ('😂', 3072),
 ('🚜', 2972),
 ('🌾', 2182),
 ('🇮🇳', 2086),
 ('🤣', 1668),
 ('✊', 1651),
 ('❤️', 1382),
 ('🙏🏻', 1317),
 ('💚', 1040)]

# Solución problema 3:


1.   Top 10 usuarios más mencionados (@) en los tweets
2.   Conteo de menciones de cada usuario

In [119]:
# Question 3 only needs the usernames of the accounts mentioned in each tweet.
q3_data = rdata.select(F.col('mentionedUsers.username'))
q3_data.show(20, truncate=False)

+---------------------------+
|username                   |
+---------------------------+
|[narendramodi, DelhiPolice]|
|[Kisanektamorcha]          |
|NULL                       |
|[ReallySwara, rohini_sgh]  |
|NULL                       |
|NULL                       |
|NULL                       |
|NULL                       |
|[mandeeppunia1]            |
|NULL                       |
|NULL                       |
|[mandeeppunia1]            |
|NULL                       |
|NULL                       |
|NULL                       |
|NULL                       |
|[akshaykumar]              |
|NULL                       |
|[taapsee]                  |
|NULL                       |
+---------------------------+
only showing top 20 rows



In [120]:
# `username` is an array<string> column that is null for some tweets.
# Drop the rows where it is null.
# F.col is used because the bare name `col` is never imported in this notebook.
df_filtered = q3_data.filter(F.col("username").isNotNull())
df_filtered.show(truncate=False)
df_filtered

+------------------------------------------+
|username                                  |
+------------------------------------------+
|[narendramodi, DelhiPolice]               |
|[Kisanektamorcha]                         |
|[ReallySwara, rohini_sgh]                 |
|[mandeeppunia1]                           |
|[mandeeppunia1]                           |
|[akshaykumar]                             |
|[taapsee]                                 |
|[PetroleumMin, PMOIndia]                  |
|[ArmaanMalik22]                           |
|[akshaykumar]                             |
|[taapsee]                                 |
|[ShekharGupta, khanthefatima, MainaBismee]|
|[nsitharaman]                             |
|[AmanJha0508, diljitdosanjh]              |
|[TheeraSingh]                             |
|[sarahwoodwriter, vivianavigil]           |
|[punjabisath1]                            |
|[ZeeNews, aajtak, republic, TimesNow]     |
|[Tractor2twitr, CoryBooker, SenBooker]    |
|[mandeep_

DataFrame[username: array<string>]

In [133]:
# Flatten the username arrays to one row per mention, count the occurrences of
# each username and keep the ten most mentioned.
# F.explode / F.count are used because the bare names `explode` and `count`
# are never imported in this notebook.
exploded_df = (
    df_filtered
    .select(F.explode("username").alias("username"))
    .groupBy("username")
    .agg(F.count("*").alias("count"))
    .orderBy(F.desc("count"))
    .limit(10)
)

# Materialise the result on the driver as (username, mention_count) tuples.
data_tuples = [(row["username"], row["count"]) for row in exploded_df.collect()]

In [134]:
# Display the (username, count) tuples collected in the previous cell.
data_tuples

[('narendramodi', 2265),
 ('Kisanektamorcha', 1840),
 ('RakeshTikaitBKU', 1644),
 ('PMOIndia', 1427),
 ('RahulGandhi', 1146),
 ('GretaThunberg', 1048),
 ('RaviSinghKA', 1019),
 ('rihanna', 986),
 ('UNHumanRights', 962),
 ('meenaharris', 926)]