In [1]:
from pyspark import SparkContext, SparkConf # Spark
from pyspark.sql import SparkSession # Spark SQL
from pyspark.sql.types import *

#additional 
from pyspark.sql.functions import *

# Spark settings for this notebook.
# `master`: `local[k]` runs Spark locally with k worker threads; `local[*]` would use
# as many worker threads as there are logical cores on the machine.
# `appName`: the name shown on the Spark cluster UI.
conf = SparkConf().setMaster("local[3]").setAppName("Introduction to Apache Spark")

# getOrCreate() always returns a context (it never returns None), so the settings
# have to be handed to it directly; they take effect only when no context exists
# yet and a new one has to be created.
sc = SparkContext.getOrCreate(conf=conf)

# Reuse the active session if there is one instead of constructing a second one
# around the same context every time this cell is re-run.
spark = SparkSession.builder.config(conf=conf).getOrCreate()


In [2]:
from kafka import KafkaConsumer
from json import loads
import matplotlib.pyplot as plt


# Connect a consumer to the 'dfTest' topic on the local broker.
# Each message value is decoded from UTF-8 bytes and parsed as JSON.
consumer = KafkaConsumer(
    'dfTest',
    group_id='my_group',
    bootstrap_servers=['localhost:9092'],
    enable_auto_commit=True,
    auto_offset_reset='earliest',
    value_deserializer=lambda raw: loads(raw.decode('utf-8')),
)

In [3]:
%matplotlib inline
from random import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
import matplotlib.pyplot as plt
import numpy as np
from numpy import polyfit
from pyspark.sql.types import *

import matplotlib
matplotlib.use('TkAgg')

# --- Sliding-window state ---------------------------------------------------
period = []        # time index of each sample currently held in the window
window_size = 30   # number of messages to consume before the first fit
total_time = 0     # messages consumed so far; also used as the time index
#AU = []
JPY = []           # JPY value of each sample currently held in the window

plt.rcParams['animation.html'] = 'jshtml'
fig = plt.figure()
ax = fig.add_subplot(111)
fig.show()

# Refit the model and redraw the graph once every `term` messages
term = 5

# Make temporary schema for data from producer
dataSchema = StructType([StructField("JPY", FloatType()),
                         StructField("period", IntegerType())])

# These two objects do not depend on the data, so build them once instead of on
# every refit.
assembler = VectorAssembler().setInputCols(['period',]).setOutputCol('features')
# Define Linear Regression training step with 10 iterations
lr = LinearRegression(maxIter=10)

for message in consumer:
    message = message.value
    print(message['row'])

    # A row is space-separated; index 2 is read as the JPY value
    # (index 1 was the AU value in the disabled code).
    fields = message['row'].split(' ')
    #AU.append(float(fields[1]))
    JPY.append(float(fields[2]))
    period.append(total_time)

    if total_time >= window_size:

        if total_time % term == 0:

            # Make Pyspark Dataframe: one [JPY, period] row per sample in the window
            data_list = [[jpy_value, time_index]
                         for jpy_value, time_index in zip(JPY, period)]

            df = spark.createDataFrame(data_list, schema=dataSchema)
            df = df.select(df.period, df.JPY.alias('label'))

            # Split Dataset training : 70%, test : 30%
            training, test = df.randomSplit([0.7, 0.3], seed=100)

            # Make Linear Regression Model
            trainingSet = assembler.transform(training)
            trainingSet.show()
            trainingSet2 = trainingSet.select("features", "label")
            lr_Model = lr.fit(trainingSet2)

            testSet1 = assembler.transform(test)
            testSet2 = testSet1.select("features", "label")

            # Test the model using testing dataset
            testSet3 = lr_Model.transform(testSet2)

            result = testSet3.select("features", "label", "prediction",
                                     (testSet3.prediction - testSet3.label).alias("err"))
            result.show(truncate=False)

            # Highlight the rows that were actually used for training, rather
            # than an unrelated random sample of the window.
            training_rows = training.collect()
            training_p = [row['period'] for row in training_rows]
            training_j = [row['label'] for row in training_rows]

            # Make Graph
            # Clear first: ax.clear() also resets the axis labels, so everything
            # that should be visible has to be drawn after it.
            ax.clear()
            p1 = polyfit(period, JPY, 1)
            ax.scatter(period, JPY, color='g')
            ax.scatter(training_p, training_j, color='r')
            ax.plot(period, np.polyval(p1, period), 'g-')
            ax.set_xlabel("Time")
            ax.set_ylabel("JPY")
            fig.canvas.draw()

        # Slide the window: drop the oldest sample for every new one, so the
        # window holds window_size + 1 samples each time a fit runs.
        period.pop(0)
        JPY.pop(0)

    total_time = total_time + 1

4/3/1995 0.7343 87.2
4/4/1995 0.7324 86.0
4/5/1995 0.7387 86.4
4/6/1995 0.7393 86.1
4/7/1995 0.7683 84.6
4/10/1995 0.7402 83.0
4/11/1995 0.7391 84.4
4/12/1995 0.7423 83.4
4/13/1995 0.742 83.45
4/14/1995 0.7683 83.8
4/17/1995 0.7683 82.3
4/18/1995 0.7456 81.8
4/19/1995 0.7407 80.3
4/20/1995 0.7357 81.6
4/21/1995 0.7363 83.3
4/24/1995 0.7683 82.35
4/25/1995 0.7683 83.3
4/26/1995 0.7683 82.6
4/27/1995 0.7271 83.7
4/28/1995 0.7299 83.75
5/1/1995 0.7282 84.0
5/2/1995 0.7285 83.7
5/3/1995 0.7295 107.97
5/4/1995 0.7348 107.97
5/5/1995 0.7426 107.97
5/8/1995 0.7411 83.1
5/9/1995 0.7382 83.3
5/10/1995 0.7286 83.4
5/11/1995 0.7295 83.95
5/12/1995 0.735 85.85
1/2/1995 0.7683 107.97
+------+------+--------+
|period| label|features|
+------+------+--------+
|     0|  87.2|   [0.0]|
|     1|  86.0|   [1.0]|
|     2|  86.4|   [2.0]|
|     3|  86.1|   [3.0]|
|     5|  83.0|   [5.0]|
|     6|  84.4|   [6.0]|
|    10|  82.3|  [10.0]|
|    11|  81.8|  [11.0]|
|    12|  80.3|  [12.0]|
|    13|  81.6|  [13

+------+------+--------+
|period| label|features|
+------+------+--------+
|    25|  83.1|  [25.0]|
|    26|  83.3|  [26.0]|
|    27|  83.4|  [27.0]|
|    28| 83.95|  [28.0]|
|    30|107.97|  [30.0]|
|    31|107.97|  [31.0]|
|    35|101.05|  [35.0]|
|    36|100.18|  [36.0]|
|    37| 99.85|  [37.0]|
|    38| 100.0|  [38.0]|
|    39| 98.85|  [39.0]|
|    41|  99.0|  [41.0]|
|    42| 98.92|  [42.0]|
|    45|  99.9|  [45.0]|
|    47|  99.6|  [47.0]|
|    48| 99.53|  [48.0]|
|    49|  99.3|  [49.0]|
|    52| 99.35|  [52.0]|
|    53| 99.33|  [53.0]|
|    54| 99.55|  [54.0]|
+------+------+--------+
only showing top 20 rows

+--------+------+------------------+--------------------+
|features|label |prediction        |err                 |
+--------+------+------------------+--------------------+
|[29.0]  |85.85 |93.41748236328372 |7.567483889162631   |
|[32.0]  |100.98|94.49188686258154 |-6.488116494352056  |
|[33.0]  |101.0 |94.85002169568081 |-6.149978304319191  |
|[34.0]  |100.95|95.208156

+--------+-----+-----------------+--------------------+
|features|label|prediction       |err                 |
+--------+-----+-----------------+--------------------+
|[54.0]  |99.55|99.74916852414498|0.19916547238716475 |
|[57.0]  |99.4 |98.87992135221181|-0.5200801736670968 |
|[58.0]  |98.8 |98.59017229490075|-0.209830756857059  |
|[59.0]  |98.78|98.30042323758968|-0.47957554170719163|
|[65.0]  |96.95|96.56192889372333|-0.38806805451885396|
|[68.0]  |97.1 |95.69268172179017|-1.407316752330928  |
|[69.0]  |96.6 |95.4029326644791 |-1.1970658096419982 |
|[71.0]  |97.05|94.82343454985698|-2.226568501900829  |
|[75.0]  |93.4 |93.66443832061275|0.2644367947338395  |
|[76.0]  |92.85|93.37468926330169|0.5246907891805961  |
+--------+-----+-----------------+--------------------+

3/14/1995 0.7459 90.75
3/15/1995 0.7465 90.45
3/16/1995 0.7466 89.75
3/17/1995 0.7418 90.0
3/20/1995 0.7293 89.45
+------+-----+--------+
|period|label|features|
+------+-----+--------+
|    55|99.68|  [55.0]|
|    

4/18/1995 0.7456 81.8
4/19/1995 0.7407 80.3
4/20/1995 0.7357 81.6
4/21/1995 0.7363 83.3
4/24/1995 0.7683 82.35
+------+------+--------+
|period| label|features|
+------+------+--------+
|    80|  90.2|  [80.0]|
|    81| 90.75|  [81.0]|
|    82| 90.45|  [82.0]|
|    83| 89.75|  [83.0]|
|    85| 89.45|  [85.0]|
|    86|107.97|  [86.0]|
|    90| 88.62|  [90.0]|
|    91| 89.65|  [91.0]|
|    92|  88.5|  [92.0]|
|    93| 88.25|  [93.0]|
|    94| 89.35|  [94.0]|
|    96|  86.0|  [96.0]|
|    97|  86.4|  [97.0]|
|   100|  83.0| [100.0]|
|   102|  83.4| [102.0]|
|   103| 83.45| [103.0]|
|   104|  83.8| [104.0]|
|   107|  80.3| [107.0]|
|   108|  81.6| [108.0]|
|   109|  83.3| [109.0]|
+------+------+--------+
only showing top 20 rows

+--------+-----+-----------------+-------------------+
|features|label|prediction       |err                |
+--------+-----+-----------------+-------------------+
|[84.0]  |90.0 |91.95391338967654|1.9539133896765435 |
|[87.0]  |89.03|90.71602456520912|1.6860257

+--------+-----+-----------------+------------------+
|features|label|prediction       |err               |
+--------+-----+-----------------+------------------+
|[109.0] |83.3 |84.62507683801367|1.3250737862558566|
|[112.0] |82.6 |86.2557328349222 |3.6557343608011053|
|[113.0] |83.7 |86.7992848338917 |3.0992878856495167|
|[114.0] |83.75|87.34283683286122|3.5928368328612237|
|[120.0] |83.1 |90.60414882667828|7.50415035255719  |
|[123.0] |83.95|92.23480482358681|8.284807875344626 |
|[124.0] |85.85|92.77835682255632|6.928358348435225 |
|[126.0] |86.4 |93.86546082049534|7.465459294616437 |
|[130.0] |87.02|96.03966881637339|9.019672173306986 |
|[131.0] |87.35|96.5832208153429 |9.233222341221804 |
+--------+-----+-----------------+------------------+

1/5/1995 0.7693 101.0
1/6/1995 0.7699 100.95
1/9/1995 0.7658 101.05
1/10/1995 0.7643 100.18
1/11/1995 0.767 99.85
+------+------+--------+
|period| label|features|
+------+------+--------+
|   110| 82.35| [110.0]|
|   111|  83.3| [111.0]|
|   

2/9/1995 0.7419 98.8
2/10/1995 0.7437 98.78
2/13/1995 0.747 98.77
2/14/1995 0.7426 98.6
2/15/1995 0.7466 98.55
+------+------+--------+
|period| label|features|
+------+------+--------+
|   135|100.98| [135.0]|
|   136| 101.0| [136.0]|
|   137|100.95| [137.0]|
|   138|101.05| [138.0]|
|   140| 99.85| [140.0]|
|   141| 100.0| [141.0]|
|   145| 98.92| [145.0]|
|   146| 99.45| [146.0]|
|   147| 99.15| [147.0]|
|   148|  99.9| [148.0]|
|   149| 99.75| [149.0]|
|   151| 99.53| [151.0]|
|   152|  99.3| [152.0]|
|   155| 99.35| [155.0]|
|   157| 99.55| [157.0]|
|   158| 99.68| [158.0]|
|   159| 99.25| [159.0]|
|   162| 98.78| [162.0]|
|   163| 98.77| [163.0]|
|   164|  98.6| [164.0]|
+------+------+--------+
only showing top 20 rows

+--------+------+------------------+--------------------+
|features|label |prediction        |err                 |
+--------+------+------------------+--------------------+
|[139.0] |100.18|100.38193751337839|0.20193720820260808 |
|[142.0] |98.85 |100.1767756727

+--------+-----+-----------------+--------------------+
|features|label|prediction       |err                 |
+--------+-----+-----------------+--------------------+
|[164.0] |98.6 |98.26418150626753|-0.3358169678535603 |
|[167.0] |97.5 |97.52272551866079|0.022725518660791977|
|[168.0] |96.95|97.27557352279187|0.3255765745496859  |
|[169.0] |97.35|97.02842152692295|-0.3215769471981389 |
|[175.0] |96.5 |95.54550955170947|-0.9544904482905281 |
|[178.0] |93.4 |94.80405356410272|1.40405203822381    |
|[179.0] |92.85|94.5569015682338 |1.706903094112704   |
|[181.0] |91.1 |94.06259757649596|2.962599102374867   |
|[185.0] |90.45|93.07398959302031|2.623992644778127   |
|[186.0] |89.75|92.82683759715138|3.076837597151382   |
+--------+-----+-----------------+--------------------+

3/23/1995 0.7226 88.7
3/24/1995 0.7265 88.55
3/27/1995 0.7275 88.62
3/28/1995 0.7268 89.65
3/29/1995 0.7271 88.5
+------+------+--------+
|period| label|features|
+------+------+--------+
|   165| 98.55| [165.0]|
| 

4/28/1995 0.7299 83.75
5/1/1995 0.7282 84.0
5/2/1995 0.7285 83.7
5/3/1995 0.7295 107.97
+------+-----+--------+
|period|label|features|
+------+-----+--------+
|   190|89.03| [190.0]|
|   191| 88.7| [191.0]|
|   192|88.55| [192.0]|
|   193|88.62| [193.0]|
|   195| 88.5| [195.0]|
|   196|88.25| [196.0]|
|   200| 86.4| [200.0]|
|   201| 86.1| [201.0]|
|   202| 84.6| [202.0]|
|   203| 83.0| [203.0]|
|   204| 84.4| [204.0]|
|   206|83.45| [206.0]|
|   207| 83.8| [207.0]|
|   210| 80.3| [210.0]|
|   212| 83.3| [212.0]|
|   213|82.35| [213.0]|
|   214| 83.3| [214.0]|
|   217|83.75| [217.0]|
|   218| 84.0| [218.0]|
|   219| 83.7| [219.0]|
+------+-----+--------+
only showing top 20 rows

+--------+-----+-----------------+-------------------+
|features|label|prediction       |err                |
+--------+-----+-----------------+-------------------+
|[194.0] |89.65|86.55271068268084|-3.097290843198067 |
|[197.0] |89.35|86.48017465195493|-2.8698238221661683|
|[198.0] |87.2 |86.45599597504628|-

+--------+------+-----------------+---------------------+
|features|label |prediction       |err                  |
+--------+------+-----------------+---------------------+
|[219.0] |83.7  |89.17835745271111|5.478360504468924    |
|[222.0] |107.97|88.57068686988967|-19.399314350813455  |
|[223.0] |83.1  |88.36813000894918|5.268131534828086    |
|[224.0] |83.3  |88.16557314800869|4.865570096250877    |
|[230.0] |86.3  |86.95023198236578|0.650228930607966    |
|[233.0] |87.02 |86.34256139954434|-0.6774352435220692  |
|[234.0] |87.35 |86.14000453860385|-1.2099939355172467  |
|[236.0] |86.35 |85.73489081672287|-0.6151076573982266  |
|[240.0] |83.2  |84.92466337296094|1.7246664247187482   |
|[241.0] |84.75 |84.72210651202045|-0.027893487979554266|
+--------+------+-----------------+---------------------+

6/8/1995 0.7204 85.1
6/9/1995 0.7206 84.75
6/12/1995 0.7683 84.3
6/13/1995 0.7174 83.8
6/14/1995 0.7214 84.6
+------+------+--------+
|period| label|features|
+------+------+--------+
|  

7/14/1995 0.7302 87.8
7/17/1995 0.7309 88.7
7/18/1995 0.7333 88.75
7/19/1995 0.7319 87.5
+------+-----+--------+
|period|label|features|
+------+-----+--------+
|   245| 84.7| [245.0]|
|   246| 85.1| [246.0]|
|   247|84.75| [247.0]|
|   248| 84.3| [248.0]|
|   250| 84.6| [250.0]|
|   251| 84.4| [251.0]|
|   255| 84.4| [255.0]|
|   256| 84.0| [256.0]|
|   257| 84.5| [257.0]|
|   258|84.25| [258.0]|
|   259|84.25| [259.0]|
|   261| 85.4| [261.0]|
|   262| 84.6| [262.0]|
|   265|84.85| [265.0]|
|   267| 85.8| [267.0]|
|   268|87.45| [268.0]|
|   269|87.07| [269.0]|
|   272| 87.8| [272.0]|
|   273| 88.7| [273.0]|
|   274|88.75| [274.0]|
+------+-----+--------+
only showing top 20 rows

+--------+-----+-----------------+--------------------+
|features|label|prediction       |err                 |
+--------+-----+-----------------+--------------------+
|[249.0] |83.8 |84.16290127068545|0.36289821892763996 |
|[252.0] |84.65|84.55262709799399|-0.09737442788491535|
|[253.0] |84.35|84.6825357070

8/25/1995 0.7426 96.65
8/28/1995 0.7426 96.5
8/29/1995 0.7498 96.7
8/30/1995 0.7523 98.3
+------+-----+--------+
|period|label|features|
+------+-----+--------+
|   275| 87.5| [275.0]|
|   276|87.65| [276.0]|
|   277| 88.7| [277.0]|
|   278| 88.1| [278.0]|
|   280|87.85| [280.0]|
|   281| 87.8| [281.0]|
|   285| 88.9| [285.0]|
|   286|90.75| [286.0]|
|   287| 90.8| [287.0]|
|   288| 91.1| [288.0]|
|   289|91.25| [289.0]|
|   291| 92.1| [291.0]|
|   292| 93.5| [292.0]|
|   295| 98.4| [295.0]|
|   297| 97.5| [297.0]|
|   298| 97.0| [298.0]|
|   299|96.75| [299.0]|
|   302|96.65| [302.0]|
|   303| 96.5| [303.0]|
|   304| 96.7| [304.0]|
+------+-----+--------+
only showing top 20 rows

+--------+-----+-----------------+--------------------+
|features|label|prediction       |err                 |
+--------+-----+-----------------+--------------------+
|[279.0] |87.6 |88.28484826304305|0.6848497889219516  |
|[282.0] |88.15|89.46150576777475|1.3115042418958467  |
|[283.0] |88.43|89.8537249360

+--------+------+------------------+-------------------+
|features|label |prediction        |err                |
+--------+------+------------------+-------------------+
|[304.0] |96.7  |98.36928619277077 |1.669289244528585  |
|[307.0] |97.6  |98.79328193020115 |1.193283456080053  |
|[308.0] |97.2  |98.93461384267795 |1.73461689443576   |
|[309.0] |98.0  |99.07594575515472 |1.07594575515472   |
|[315.0] |101.9 |99.92393723001547 |-1.9760642958634378|
|[318.0] |104.2 |100.34793296744584|-3.852063980796345 |
|[319.0] |103.5 |100.48926487992264|-3.0107351200773564|
|[321.0] |102.45|100.77192870487622|-1.6780682433659706|
|[325.0] |100.65|101.33725635478339|0.6872548289044857 |
|[326.0] |100.4 |101.47858826726016|1.0785867413812582 |
+--------+------+------------------+-------------------+

10/5/1995 0.7638 100.3
10/6/1995 0.7597 100.1
10/9/1995 0.7642 100.3
10/10/1995 0.7592 107.97
10/11/1995 0.7618 100.7
+------+------+--------+
|period| label|features|
+------+------+--------+
|   305|

KeyboardInterrupt: 

In [None]:
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession

# Scratch check: build a small two-column DataFrame from two plain lists.
# NOTE: this rebinds `dataSchema` with an integer JPY column.
dataSchema = StructType([StructField("JPY", IntegerType()),
                         StructField("period", IntegerType())])

a = [1, 2, 3, 4]
b = [5, 6, 7, 8]

# One [JPY, period] row per pair of values
data_list = [[jpy_value, time_index] for jpy_value, time_index in zip(a, b)]

# Build the DataFrame once, after all rows are collected (it used to be rebuilt
# on every loop iteration).
df = spark.createDataFrame(data_list, schema=dataSchema)
#SQLContext.createDataFrame((np.array([a, b])).T, schema = ['a', 'b']).show()

In [None]:
# Show the list of rows that is passed to spark.createDataFrame
print(data_list)

In [None]:
 # Print the contents of `df` as a text table
 df.show()

In [None]:
# Keep `period` as-is and expose `JPY` under the name `label`
df1 = df.select(df["period"], df["JPY"].alias("label"))
df1.show()

# Split Dataset training : 70%, test : 30% (fixed seed)
training, test = df1.randomSplit(weights=[0.7, 0.3], seed=100)
training.show()
test.show()

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
import matplotlib.pyplot as plt
import numpy as np
from numpy import polyfit
from pyspark.sql.types import *
%matplotlib inline

# Preview the input frame before modelling
df.show()

# Make Linear Regression Model
# Assemble `period` into a vector column named `features`
assembler = VectorAssembler(inputCols=['period'], outputCol='features')
trainingSet = assembler.transform(training)
trainingSet2 = trainingSet.select("features", "label")

# Define Linear Regression training step with 10 iterations
lr = LinearRegression(maxIter=10)
lr_Model = lr.fit(trainingSet2)
print(lr_Model)

# Test the model using testing dataset
testSet1 = assembler.transform(test)
testSet1.show(truncate=False)
testSet2 = testSet1.select("features", "label")
testSet3 = lr_Model.transform(testSet2)
testSet3.show(truncate=False)

# Add the signed error (prediction - label) next to each prediction
result1 = testSet3.select(
    "features",
    "label",
    "prediction",
    (testSet3["prediction"] - testSet3["label"]).alias("err"),
)

result1.show(truncate=False)

In [None]:
# Make Graph
# `training` is a Spark DataFrame, so training['period'] is a Column expression
# rather than data. Bring the rows to the driver before handing them to
# matplotlib / numpy.
training_rows = training.collect()
train_period = [row['period'] for row in training_rows]
train_label = [row['label'] for row in training_rows]

fig_train, ax_train = plt.subplots()
ax_train.scatter(train_period, train_label, c='C0', label='Linear Regression')
ax_train.set_xlabel("Time")
ax_train.set_ylabel("JPY")

# A degree-1 fit needs at least two points; skip the line for a smaller split
if len(train_period) >= 2:
    p1 = polyfit(train_period, train_label, 1)
    ax_train.plot(train_period, np.polyval(p1, train_period), 'g-')

plt.show()