In [1]:
sc

In [4]:
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext, functions as F
from pyspark.sql.functions import *

# Connection string for the MongoDB instance; the path names database "ca2"
# and collection "tweets".
mongo_uri = "mongodb://hadoop-vm.internal.cloudapp.net:27017/ca2.tweets"

# Versions this notebook was run against:
#   Spark 3.2.3
#   MongoDB 6.0.5
#   Java 11

# Maven coordinates resolved through spark.jars.packages. Each artifact is
# listed exactly once (the original string repeated mongodb-driver-core).
mongo_packages = ",".join([
    "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1",
    "org.mongodb:mongodb-driver-core:4.9.1",
    "org.mongodb:bson:4.9.1",
    "org.mongodb:mongodb-driver-sync:4.9.1",
])

# Create (or reuse) the Spark session configured for the MongoDB connector.
# NOTE(review): the first cell evaluates `sc`, so a SparkContext may already be
# running; getOrCreate() would then reuse it and spark.jars.packages may not be
# applied -- confirm the connector is on the classpath in that case.
spark = (
    SparkSession.builder
    .appName('Tweets')
    .config("spark.mongodb.read.connection.uri", mongo_uri)
    .config("spark.mongodb.write.connection.uri", mongo_uri)
    .config("spark.jars.packages", mongo_packages)
    .getOrCreate()
)

# Read the MongoDB collection "tweets" (database "ca2") into the dataframe "df".
df = (
    spark.read
    .format("mongodb")
    .option("connection.uri", mongo_uri)
    .option("database", "ca2")
    .option("collection", "tweets")
    .load()
)

df.printSchema()

{"@version":1,"source_host":"rmsryu-vm","message":"MongoClient with metadata {\"driver\": {\"name\": \"mongo-java-driver|sync|mongo-spark-connector|source\", \"version\": \"4.9.1|10.1.1\"}, \"os\": {\"type\": \"Linux\", \"name\": \"Linux\", \"architecture\": \"amd64\", \"version\": \"5.15.0-1035-azure\"}, \"platform\": \"Java/Ubuntu/11.0.18+10-post-Ubuntu-0ubuntu120.04.1|Scala/2.12.15/Spark/3.2.3\"} created with settings MongoClientSettings{readPreference=primary, writeConcern=WriteConcern{w=null, wTimeout=null ms, journal=null}, retryWrites=true, retryReads=true, readConcern=ReadConcern{level=null}, credential=null, streamFactoryFactory=null, commandListeners=[], codecRegistry=ProvidersCodecRegistry{codecProviders=[ValueCodecProvider{}, BsonValueCodecProvider{}, DBRefCodecProvider{}, DBObjectCodecProvider{}, DocumentCodecProvider{}, CollectionCodecProvider{}, IterableCodecProvider{}, MapCodecProvider{}, GeoJsonCodecProvider{}, GridFSFileCodecProvider{}, Jsr310CodecProvider{}, JsonObje

# Create a time series collection in MongoDB


## Step 1: Create the `timeseries_tweets` collection in the Mongo shell
```javascript
// Switch the shell to the "ca2" database.
use ca2;

// Create "timeseries_tweets" as a time series collection whose time field is
// "timestamp" -- the field that Step 2 adds to the documents in "tweets".
db.createCollection("timeseries_tweets", {
  "timeseries": {
    "timeField": "timestamp",
    "metaField": null, // Set to null if you don't have any metadata fields
    "granularity": "seconds" // Choose the appropriate granularity, e.g., seconds, minutes, hours
  }
});
```


## Step 2: Add a `timestamp` field to the existing `tweets` collection

```javascript
// Derive a BSON Date field "timestamp" from "timestamp_ms" (epoch milliseconds)
// for every tweet, in a single server-side update instead of one updateOne
// round trip per document.
//
// - The filter skips documents that have no "timestamp_ms", so they are left
//   untouched rather than being given an invalid date.
// - $toLong accepts both numeric and numeric-string values; $toDate then
//   interprets the long as milliseconds since the Unix epoch.
// - A "timestamp_ms" value that cannot be converted to a long aborts the
//   update with an error instead of silently storing a bad date.
db.tweets.updateMany(
  { "timestamp_ms": { $exists: true, $ne: null } },
  [
    { $set: { "timestamp": { $toDate: { $toLong: "$timestamp_ms" } } } }
  ]
);
```



## Step 3: Migrate the `tweets` collection to `timeseries_tweets`

```javascript
// Copy every document of "tweets" into "timeseries_tweets",
// issuing one insertOne per source document.
db.tweets
  .find()
  .forEach((tweet) => {
    db.timeseries_tweets.insertOne(tweet);
  });

```

In [None]:
# Read the "timeseries_tweets" collection of database "ca2" into `df`
# (rebinding the name used for the earlier "tweets" read), then print its schema.
timeseries_reader = (
    spark.read
    .format("mongodb")
    .option("connection.uri", mongo_uri)
    .option("database", "ca2")
    .option("collection", "timeseries_tweets")
)
df = timeseries_reader.load()

df.printSchema()