In [1]:
!pip install kafka-python
!pip install redis



In [2]:
import os
import requests
import json
import redis
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from datetime import datetime
import uuid

In [4]:
os.environ['PYSPARK_SUBMIT_ARGS'] = '--master local[2] pyspark-shell'
conf = SparkConf().set("spark.jars","/home/jovyan/work/spark-streaming-kafka-0-8-assembly_2.11-2.4.4.jar")
sc = SparkContext(conf=conf)
sc.setLogLevel("WARN")
ssc = StreamingContext(sc,60) # 60 is the batch interval :)
debug = True
saveToLocalDisk = True
spark = SparkSession(sc)

In [5]:
past_data = spark.read.option("header", "false").csv("/home/jovyan/work/NewsData.csv/*.csv")
past_data.show(10)

+-------------+--------------------+--------------------+--------------------+
|          _c0|                 _c1|                 _c2|                 _c3|
+-------------+--------------------+--------------------+--------------------+
|1.577809259E9|401(k) changes co...|https://finance.y...|These provisions ...|
|1.577979776E9|Aeroflot Named Wo...|https://www.nasda...|(RTTNews) - Aerof...|
|1.577979703E9|Jumia Faces an Un...|https://www.nasda...|Jumia Technologie...|
|1.577979479E9|BUZZ-U.S. STOCKS ...|https://www.nasda...|A compilation of ...|
|1.577979472E9|Argentina's Ferna...|https://www.nasda...|Argentina's new P...|
|1.577979174E9|Thursday 1/2 Insi...|https://www.nasda...|As the saying goe...|
|1.577979148E9|First Week of Aug...|https://www.nasda...|Investors in Kilr...|
|1.577979092E9|The Biggest Influ...|https://www.nasda...|The new year is a...|
|1.577979081E9|Nasdaq 100 Movers...|https://www.nasda...|In early trading ...|
| 1.57797906E9|4 Retirement Plan...|https://www.nasd

In [116]:
from pyspark.sql import Window,WindowSpec;
from pyspark.sql.functions import *

window=Window.orderBy(col("Timestamp").desc());
df = past_data.toDF("Timestamp","Title","Link","Summary1");
ranked = df.withColumn("Rank",dense_rank().over(window)).cache().drop_duplicates();
ranked.show()
#WindowSpec window=Window.partitionBy(col("dep")).orderBy(col("sal").desc());
#Column column_dense_rank=dense_rank().over(window);
#df.select(col("name"),col("dep"),col("sal"),column_dense_rank.as("rank")).where(col("rank").leq(2)).show();

+-------------+--------------------+--------------------+--------------------+----+
|    Timestamp|               Title|                Link|            Summary1|Rank|
+-------------+--------------------+--------------------+--------------------+----+
|1.578196637E9|The Week Ahead – ...|https://www.nasda...|FXEmpire.com - On...|   1|
|  1.5781932E9|Why Huazhu Group ...|https://www.nasda...|What happened\nSh...|   2|
|  1.5781896E9|Why Baozun Stock ...|https://www.nasda...|What happened\nSh...|   3|
|  1.5781887E9|Why iQiyi Stock C...|https://www.nasda...|What happened\nSh...|   4|
|  1.5781878E9|Why Elastic Stock...|https://www.nasda...|What happened\nSh...|   5|
|   1.578186E9|Here's Why Mirati...|https://www.nasda...|What happened\nSh...|   6|
|  1.5781869E9|Here's Why Shares...|https://www.nasda...|What happened\nBo...|   7|
|  1.5781848E9|Fed Has Many Tool...|https://www.wsj.c...|The Federal Reser...|   8|
| 1.57818048E9|Why Your 2020 Res...|https://www.nasda...|As has been the c..

In [133]:
import math
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, StringType, MapType

def get_bucket(rank):
    return math.floor(rank/10)

def get_json(summary, title, link, timestamp):
    #return "{\"Summary\":\""+str(summary)+"\",\"Title\":\""+str(title)+"\",\"Link\":\""+str(link)+"\",\"Timestamp\":\""+str(timestamp)+"\"}"
    return {"Summary":summary, "Title":title, "Link":link, "Timestamp":timestamp}

def get_unicode_escaped_string(text):
    if(text is None):
        return ""
    else:
        return text.encode("ascii").decode("unicode_escape")

get_bucket_udf = udf(get_bucket, IntegerType())
get_json_udf = udf(get_json, MapType(StringType(),StringType()))
get_unicode_escaped_string_udf = udf(get_unicode_escaped_string, StringType())

branked = ranked.withColumn("Bucket", get_bucket_udf('Rank')) \
                .withColumn("Summary", get_unicode_escaped_string_udf("Summary1")) \
                .withColumn("JSON",get_json_udf("Summary","Title","Link","Timestamp")) \
                .groupBy("Bucket").agg(collect_list("JSON").alias("Dictionary"))
branked.select("Dictionary").show()

+--------------------+
|          Dictionary|
+--------------------+
|[[Title -> Bankru...|
|[[Title -> How Bu...|
|[[Title -> The Hi...|
|[[Title -> Thursd...|
|[[Title -> Treasu...|
|[[Title -> Top 10...|
|[[Title -> Chines...|
|[[Title -> Turkis...|
|[[Title -> CES 20...|
|[[Title -> Japane...|
|[[Title -> Mexica...|
|[[Title -> China ...|
|[[Title -> Study ...|
|[[Title -> Lower-...|
|[[Title -> It's t...|
|[[Title -> The We...|
+--------------------+



In [134]:
def SetVal(x):
    r = redis.StrictRedis(host = 'redis-store', port = 6379)
    idVal = str(uuid.uuid4())
    if (r.get('LatestNews') != None):
        try:
            batchSet = json.loads(r.get('LatestNews'))
            if ("id" in batchSet):
                r.set('LatestNews',"{\"tasks\":"+json.dumps(x)+", \"id\":\""+idVal+"\", \"next\":\""+batchSet["id"]+"\"}")
                r.set(idVal,"{\"tasks\":"+json.dumps(x)+", \"id\":\""+idVal+"\", \"next\":\""+batchSet["id"]+"\"}")
            else:
                r.set('LatestNews',"{\"tasks\":"+json.dumps(x)+", \"id\":\""+idVal+"\", \"next\":\"\"}")
                r.set(idVal,"{\"tasks\":"+json.dumps(x)+", \"id\":\""+idVal+"\", \"next\":\"\"}")
        except Exception:
            r.set('LatestNews',"{\"tasks\":"+json.dumps(x)+", \"id\":\""+idVal+"\", \"next\":\"\"}")
            r.set(idVal,"{\"tasks\":"+json.dumps(x)+", \"id\":\""+idVal+"\", \"next\":\"\"}")
    else:
        r.set('LatestNews',"{\"tasks\":"+json.dumps(x)+", \"id\":\""+idVal+"\", \"next\":\"\"}")
        r.set(idVal,"{\"tasks\":"+json.dumps(x)+", \"id\":\""+idVal+"\", \"next\":\"\"}")
    
for val in branked.select("Dictionary").rdd.collect():
    SetVal([item for sublist in val for item in sublist])

In [135]:
r = redis.StrictRedis(host = 'redis-store', port = 6379)
print(r.get('LatestNews'))

b'{"tasks":[{"Title": "The Week Ahead \\u2013 Stats, Impeachment, and Brexit to Take a Back Seat as Iran Takes Center Stage", "Summary": "FXEmpire.com - On the Macro\\nIt\\u2019s a busy week ahead on the economic calendar, with 51 stats to monitor. In the previous week, just 27 stats had been in focus.", "Timestamp": "1.578196637E9", "Link": "https://www.nasdaq.com/articles/the-week-ahead-stats-impeachment-and-brexit-to-take-a-back-seat-as-iran-takes-center-stage"}, {"Title": "Why Huazhu Group Stock Climbed 17.1% in December", "Summary": "What happened\\nShares of Huazhu Group\\u00a0(NASDAQ: HTHT) rose 17.1% in December, according to data from\\u00a0S&P Global Market Intelligence\\u00a0. The hotel company\'s stock posted big gains as the broader Chinese stock market advanced thanks to an improved trade outlook.", "Timestamp": "1.5781932E9", "Link": "https://www.nasdaq.com/articles/why-huazhu-group-stock-climbed-17.1-in-december-2020-01-05"}, {"Title": "Why Baozun Stock Fell 12.8% in De

In [60]:
branked.select("Dictionary").rdd.flatMap(list).collect()

[['{"Summary":"A California bankruptcy court has sided with electric utility PG&E in its fight with bondholders over the interest rate that it must pay on its debts while under bankruptcy court protection.","Title":"Bankruptcy Court Rules Against PG&E Bondholders in Interest-Rate Fight","Link":"https://www.wsj.com/articles/pg-e-bankruptcy-court-rules-against-bondholders-in-interest-rate-fight-11577829961","Timestamp":"1.57789242E9"}',
  '{"Summary":"A California bankruptcy court has sided with electric utility PG&E in its fight with bondholders over the interest rate that it must pay on its debts while under bankruptcy court protection.","Title":"Bankruptcy Court Rules Against PG&E Bondholders in Interest-Rate Fight","Link":"https://www.wsj.com/articles/pg-e-bankruptcy-court-rules-against-bondholders-in-interest-rate-fight-11577829961","Timestamp":"1.57789242E9"}',
  '{"Summary":"When unemployment goes down, inflation picks up, and vice versa. That has been a central tenet of economics

In [91]:
for task in json.loads(r.get('LatestNews'))["tasks"]:
    print(task["Summary"]+"\n")

TypeError: list indices must be integers or slices, not str

In [89]:
batchSet = json.loads(r.get('LatestNews'))
if("id" in batchSet):
    print("Success:"+batchSet["id"])
else:
    print("Failure")
        

Success:835e4008-1614-443e-8f79-cfe482b7743c


In [129]:
text = "The Week Ahead \\u2013 Stats, Impeachment"

In [132]:
text.encode("ascii").decode("unicode_escape")

'The Week Ahead – Stats, Impeachment'