In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
!tar xf spark-3.0.1-bin-hadoop2.7.tgz
!pip install -q findspark

In [None]:
import os

import findspark

# Tell Spark where the JDK and the unpacked Spark distribution live.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.0.1-bin-hadoop2.7",
})

# findspark uses SPARK_HOME to make the bundled pyspark importable, which is
# why the pyspark imports below have to come after init().
findspark.init()
findspark.find()

from pyspark.sql import SparkSession
from pyspark.sql.functions import array_contains

### Connect to MongoDB

In [None]:
# MongoDB credentials are read from the environment (MONGO_USERNAME /
# MONGO_PASSWORD) and only prompted for when unset, so no secret is stored in
# the notebook source or its saved outputs.
# NOTE(review): the password that used to be hardcoded here has been exposed
# and should be rotated on the cluster.
from getpass import getpass
from urllib.parse import quote_plus

USERNAME = os.environ.get('MONGO_USERNAME') or input('MongoDB username: ')
PASSWORD = os.environ.get('MONGO_PASSWORD') or getpass('MongoDB password: ')

# Percent-encode both parts: characters such as '@', ':' or '/' inside a
# credential would otherwise break the connection URI.
CNCT_STR = (
    f'mongodb+srv://{quote_plus(USERNAME)}:{quote_plus(PASSWORD)}'
    '@svp-cluster.1uzpyjf.mongodb.net/svp_database.video_tags_json?retryWrites=true'
)
# Data source class name handed to spark.read.format() for the MongoDB connector.
FORMAT = 'com.mongodb.spark.sql.DefaultSource'

In [None]:
# Build (or reuse) a local Spark session that reads from and writes to the
# MongoDB URI in CNCT_STR; the connector jar is requested via spark.jars.packages.
spark = (
    SparkSession.builder
    .appName('spark_video_processing')
    .master('local')
    .config('spark.mongodb.input.uri', CNCT_STR)
    .config('spark.mongodb.output.uri', CNCT_STR)
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:3.0.1')
    .getOrCreate()
)

### Read from MongoDB

In [None]:
# Load the collection addressed by CNCT_STR into a Spark DataFrame through the
# MongoDB data source named in FORMAT.
video_tags = (
    spark.read
    .format(FORMAT)
    .option('uri', CNCT_STR)
    .load()
)

In [None]:
# Print the schema of the loaded DataFrame to check column names and types before querying.
video_tags.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- video_id: string (nullable = true)
 |-- video_path: string (nullable = true)



In [None]:
# Preview rows of the loaded collection; long values are truncated in the printed table.
video_tags.show()

+--------------------+--------------------+-----------+--------------------+
|                 _id|                tags|   video_id|          video_path|
+--------------------+--------------------+-----------+--------------------+
|[638824df291310e6...|[filing cabinet, ...|9dWxCvcK6H4|data/v/9dWxCvcK6H...|
|[638824df291310e6...|[baby bib, oxygen...|a1zf0TQcdFQ|data/v/a1zf0TQcdF...|
|[638824df291310e6...|[combine harveste...|dnzvSdAF9Ns|data/v/dnzvSdAF9N...|
|[638824df291310e6...|[barometer, spira...|gCQ42PwhZCc|data/v/gCQ42PwhZC...|
|[638824df291310e6...|[European green l...|nDGcDKA30W0|data/v/nDGcDKA30W...|
|[638824df291310e6...|[killer whale, ca...|sHn8LBYNm_o|data/v/sHn8LBYNm_...|
|[638824df291310e6...|[television, watc...|sxBGkWEywxU|data/v/sxBGkWEywx...|
|[638824df291310e6...|[punching bag, ca...|-aQWllQVgac|data/v/-aQWllQVga...|
|[638824df291310e6...|[cardboard box / ...|8RWwmPCSoqc|data/v/8RWwmPCSoq...|
|[638824df291310e6...|[mobile home, pon...|96DHjKlLJUQ|data/v/96DHjKlLJU...|

In [None]:
# Select every video whose `tags` array contains the requested label and
# collect the matches into a pandas DataFrame for display.
QUERY_TAG = 'soccer ball'
df = (
    video_tags
    .where(array_contains(video_tags['tags'], QUERY_TAG))
    .toPandas()
)
df

Unnamed: 0,_id,tags,video_id,video_path
0,"(638824df291310e6c7f06b2b,)","[soccer ball, swing, juggling soccer ball, pas...",IF1D28mjfqQ,data/v/IF1D28mjfqQ.mp4
1,"(638824df291310e6c7f06b87,)","[soccer ball, passing soccer ball]",ID7rAWGNR6Q,data/v/ID7rAWGNR6Q.mp4
2,"(638824df291310e6c7f06c41,)","[soccer ball, suit, velvet fabric, chopping me...",xeaQX9lvAMo,data/v/xeaQX9lvAMo.mp4
3,"(638824df291310e6c7f06cf9,)","[racket, soccer ball, swing, volleyball, shoot...",upaGn7gC8bo,data/v/upaGn7gC8bo.mp4


In [None]:
# Same single-tag lookup as before, this time for an action label: keep the
# rows whose `tags` array contains QUERY_TAG and bring them into pandas.
QUERY_TAG = 'throwing snowballs'
df = (
    video_tags
    .where(array_contains(video_tags['tags'], QUERY_TAG))
    .toPandas()
)
df

Unnamed: 0,_id,tags,video_id,video_path
0,"(638824df291310e6c7f06b84,)","[bikini, movie theater, front curtain, belly d...",hqs8a5e7kPs,data/v/hqs8a5e7kPs.mp4
1,"(638824df291310e6c7f06bb9,)","[car mirror, seat belt, sleeping bag, beatboxi...",sgx9W6VGJjk,data/v/sgx9W6VGJjk.mp4
2,"(638824df291310e6c7f06c38,)","[backpack, candle, cliff, base jumping, diving...",Wd47jL6Hwn4,data/v/Wd47jL6Hwn4.mp4


### Conditional queries

#### Conditional OR

In [None]:
# Videos tagged with QUERY_TAG1 OR QUERY_TAG2.
QUERY_TAG1 = 'car mirror'
QUERY_TAG2 = 'driving car'
has_tag1 = array_contains(video_tags.tags, QUERY_TAG1)
has_tag2 = array_contains(video_tags.tags, QUERY_TAG2)
# Lazy single-tag frames, kept under their original names so each side can
# still be inspected on its own.
df1 = video_tags.filter(has_tag1)
df2 = video_tags.filter(has_tag2)
# Combine the predicates in one filter instead of df1.union(df2): union keeps
# duplicates, so a video carrying both tags would be listed twice.
df = video_tags.filter(has_tag1 | has_tag2).toPandas()
df

Unnamed: 0,_id,tags,video_id,video_path
0,"(638824df291310e6c7f06b24,)","[killer whale, car mirror, hook, pajamas, soap...",sHn8LBYNm_o,data/v/sHn8LBYNm_o.mp4
1,"(638824df291310e6c7f06b31,)","[car mirror, measuring cup, guacamole, cabbage...",ffrLyIKGAjE,data/v/ffrLyIKGAjE.mp4
2,"(638824df291310e6c7f06b67,)","[high-speed train, car mirror, microwave oven,...",-8zyilOZHLU,data/v/-8zyilOZHLU.mp4
3,"(638824df291310e6c7f06b80,)","[car mirror, car wheel, hair spray, jeep, prin...",Qtu7C8_kl5I,data/v/Qtu7C8_kl5I.mp4
4,"(638824df291310e6c7f06b93,)","[car mirror, eating burger, hoverboarding, put...",I4K2G76pdpo,data/v/I4K2G76pdpo.mp4
5,"(638824df291310e6c7f06b95,)","[killer whale, car mirror, paddle, motorboat, ...",n1KCf9ijDD4,data/v/n1KCf9ijDD4.mp4
6,"(638824df291310e6c7f06b9f,)","[car mirror, cliff, blasting sand, diving clif...",clofw7v0tfo,data/v/clofw7v0tfo.mp4
7,"(638824df291310e6c7f06bb9,)","[car mirror, seat belt, sleeping bag, beatboxi...",sgx9W6VGJjk,data/v/sgx9W6VGJjk.mp4
8,"(638824df291310e6c7f06bcf,)","[car mirror, tray, consomme, exercising arm, p...",jK0lPR3Djjo,data/v/jK0lPR3Djjo.mp4
9,"(638824df291310e6c7f06be0,)","[electric ray, flatworm, chiton, car mirror, a...",yjfEJVXn9A0,data/v/yjfEJVXn9A0.mp4


#### Conditional AND

In [None]:
# Videos tagged with QUERY_TAG1 AND QUERY_TAG2, plus their YouTube links.
QUERY_TAG1 = 'car mirror'
QUERY_TAG2 = 'driving car'
has_tag1 = array_contains(video_tags.tags, QUERY_TAG1)
has_tag2 = array_contains(video_tags.tags, QUERY_TAG2)
# Lazy single-tag frames, kept under their original names so each side can
# still be inspected on its own.
df1 = video_tags.filter(has_tag1)
df2 = video_tags.filter(has_tag2)
# One conjunctive filter replaces df1.intersect(df2): it selects the same
# videos in a single pass and keeps the source order, whereas intersect
# compares whole rows across a shuffle and returns them in arbitrary order.
df = video_tags.filter(has_tag1 & has_tag2).toPandas()
print([f'https://www.youtube.com/watch?v={v_id}' for v_id in df['video_id'].tolist()])
df

['https://www.youtube.com/watch?v=1kvdhlA_BXY', 'https://www.youtube.com/watch?v=sHn8LBYNm_o', 'https://www.youtube.com/watch?v=QamPzj0Xzek', 'https://www.youtube.com/watch?v=SCpXjVRQ3sI']


Unnamed: 0,_id,tags,video_id,video_path
0,"(638824df291310e6c7f06cba,)","[car mirror, breathing fire, driving car, news...",1kvdhlA_BXY,data/v/1kvdhlA_BXY.mp4
1,"(638824df291310e6c7f06b24,)","[killer whale, car mirror, hook, pajamas, soap...",sHn8LBYNm_o,data/v/sHn8LBYNm_o.mp4
2,"(638824df291310e6c7f06bf5,)","[car mirror, changing gear in car, driving car...",QamPzj0Xzek,data/v/QamPzj0Xzek.mp4
3,"(638824df291310e6c7f06c2b,)","[car mirror, minivan, necklace, changing gear ...",SCpXjVRQ3sI,data/v/SCpXjVRQ3sI.mp4


car mirror,breathing fire,driving car,news anchoring,smoking,smoking pipe; 1kvdhlA_BXY

car mirror,changing gear in car,driving car,pushing car,vacuuming car; QamPzj0Xzek