<a href="https://colab.research.google.com/github/skyprince999/100-Days-Of-ML/blob/master/Day%2029%23%20Hydrating_Streaming_AWS_Kinesis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Tweet IDs from the repository are hydrated and, rather than only being stored on disk, are piped to an AWS Kinesis data stream.

From there they are read into a Spark streaming context for processing.

The following attributes are extracted from the tweets -

1. User ID
2. User mentions
3. Hashtags
4. Full text of the tweet
5. Sentiment analysis using TextBlob



In [None]:
!git clone https://github.com/echen102/COVID-19-TweetIDs

In [None]:
# Twarc is used to hydrate the tweets - don't run this cell
!pip install twarc
!twarc configure

In [None]:
 
import boto3
import json
import time

from random import random

In [None]:
import gzip
import json

from tqdm import tqdm
from twarc import Twarc
from pathlib import Path

# Client whose hydrate() method is used to turn tweet IDs into full tweets.
twarc = Twarc()

# Month directories of the cloned COVID-19-TweetIDs repository that will be
# walked: 2020-01 through 2020-06.
data_dirs = ['COVID-19-TweetIDs/2020-{:02d}'.format(month) for month in range(1, 7)]
    

In [None]:
def main():
    """
    Hydrate every tweet-ID file found in the configured directories.

    Each directory listed in ``data_dirs`` is scanned (non-recursively) and
    every entry whose name ends in ``.txt`` is handed to ``hydrate``.
    """
    for directory in data_dirs:
        # Lazily filter the directory listing down to the ID text files.
        id_files = (entry for entry in Path(directory).iterdir()
                    if entry.name.endswith('.txt'))
        for id_file in id_files:
            hydrate(id_file)


In [None]:
def _reader_generator(reader):
    b = reader(1024 * 1024)
    while b:
        yield b
        b = reader(1024 * 1024)


In [None]:
def raw_newline_count(fname):
    """
    Count the newline bytes in a file.

    The file is read in binary mode in large chunks via ``_reader_generator``
    so that it never has to fit in memory at once. Because only ``\\n`` bytes
    are counted, a final line without a trailing newline is not included.

    Parameters
    ----------
    fname : str or path-like
        File to scan.

    Returns
    -------
    int
        Number of ``\\n`` bytes found in the file.
    """
    # The context manager guarantees the handle is closed even if reading fails;
    # the sum is fully evaluated before the ``with`` block exits.
    with open(fname, 'rb') as f:
        return sum(buf.count(b'\n') for buf in _reader_generator(f.raw.read))


In [None]:
def get_record(tweet):
  """
  Serialise a tweet to JSON and wrap the UTF-8 bytes as a Firehose record.

  Returns a dict with a single 'Data' key holding the encoded JSON.
  """
  payload = json.dumps(tweet).encode('utf-8')

  # When sending to a Kinesis Data Stream instead of a Firehose, the record
  # would be a list holding a dict with both 'Data' and 'PartitionKey' entries.
  return {'Data': payload}

In [None]:
def hydrate(id_file):
    """
    Hydrate one tweet-ID file, archive the tweets and forward them to Firehose.

    Every tweet yielded by ``twarc.hydrate`` is written as one JSON line to a
    gzip archive next to ``id_file`` (same name with a ``.jsonl.gz`` suffix)
    and sent as a single record to the ``covid-stream`` delivery stream.

    Output is first written to a ``.part`` file and renamed only after the
    whole ID file has been processed. An interrupted run therefore never
    leaves a truncated archive that a later run would mistake for a finished
    one; the trade-off is that re-running after a failure sends that file's
    tweets to the stream again.

    Parameters
    ----------
    id_file : pathlib.Path
        Text file of tweet IDs, one per line (the newline count is used as the
        progress-bar total).

    Returns
    -------
    None
        If the ``.jsonl.gz`` archive already exists the file is skipped.

    Raises
    ------
    Exception
        Anything raised while hydrating, writing or sending is propagated
        after the partial archive has been removed.
    """
    print('hydrating {}'.format(id_file))

    gzip_path = id_file.with_suffix('.jsonl.gz')
    if gzip_path.is_file():
        print('skipping json file already exists: {}'.format(gzip_path))
        return

    # Create the client only when there is work to do. No keys are passed here:
    # boto3 resolves credentials from its default chain (for example the
    # AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables or the
    # shared credentials file), so secrets never have to be pasted into the
    # notebook.
    kinesis_client = boto3.client('firehose',  # Change this to kinesis if you are using a Data stream
                                  region_name='us-east-1')  # enter the region

    num_ids = raw_newline_count(id_file)

    # Scratch file that only becomes the real archive once hydration succeeds.
    partial_path = gzip_path.with_name(gzip_path.name + '.part')

    try:
        with gzip.open(partial_path, 'w') as output, \
                id_file.open() as id_lines, \
                tqdm(total=num_ids) as pbar:
            for tweet in twarc.hydrate(id_lines):
                output.write(json.dumps(tweet).encode('utf8') + b"\n")
                record = get_record(tweet)
                # This call is for a Firehose. A Kinesis Data Stream client
                # takes different arguments (StreamName / Data / PartitionKey).
                kinesis_client.put_record(DeliveryStreamName="covid-stream", Record=record)
                pbar.update(1)
    except BaseException:
        # Also covers KeyboardInterrupt: drop the incomplete archive so the
        # next run starts this ID file from scratch, then re-raise unchanged.
        if partial_path.exists():
            partial_path.unlink()
        raise

    # Atomically publish the finished archive under its final name.
    partial_path.replace(gzip_path)


In [None]:
# Entry point: call main() only when this code runs as the top-level module.
if __name__ == "__main__":
    main()

In [None]:
# !rm -f COVID-19-TweetIDs/2020-01/coronavirus-tweet-id-2020-01-23-04.jsonl.gz
# !ls COVID-19-TweetIDs/2020-01/coronavirus-tweet-id-2020-01-23-04.*

In [None]:
!ls COVID-19-TweetIDs/2020-01/*.jsonl*
