In [4]:
from pyspark import SQLContext, SparkConf, SparkContext
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession
from pyspark.sql.functions import mean
from pyspark.sql.functions import min
from pyspark.sql.functions import max

# Build (or reuse) the Spark session through the public builder API.
# SparkContext.getOrCreate() never returns None, so the old `if sc is None`
# fallback that set the master and application name could never run; the
# builder applies those settings when it has to create a new context and
# otherwise reuses the one that is already active.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Linear Regression")
    .getOrCreate()
)
sc = spark.sparkContext

# Legacy entry point, kept only so cells that still reference `sqlcontext` work.
sqlcontext = SQLContext(sc)

# Load the exchange-rate CSV: the first line is the header and column types
# are inferred from the data.
data = spark.read.csv('yen-au.csv', header=True, inferSchema=True)

In [5]:
# Collect the average of each column; these are used to fill missing values.
# Every entry is a single-field Row whose value (the mean) sits at index 0.
column_means = {
    name: data.select(mean(name)).collect()[0]
    for name in ('AUD', 'JPY')
}
au = column_means['AUD']
jp = column_means['JPY']

In [6]:
# Fill missing values with the column means
# (AUD rounded to 4 decimal places, JPY to 2).
fill_values = {
    'AUD': round(au[0], 4),
    'JPY': round(jp[0], 2),
}
data = data.fillna(fill_values)

# Display the schema and the first rows of the filled data
data.printSchema()
data.show()

root
 |-- Date: string (nullable = true)
 |-- AUD: double (nullable = false)
 |-- JPY: double (nullable = false)

+---------+------+------+
|     Date|   AUD|   JPY|
+---------+------+------+
| 1/2/1995|0.7683|107.97|
| 1/3/1995|0.7683|107.97|
| 1/4/1995|0.7704|100.98|
| 1/5/1995|0.7693| 101.0|
| 1/6/1995|0.7699|100.95|
| 1/9/1995|0.7658|101.05|
|1/10/1995|0.7643|100.18|
|1/11/1995| 0.767| 99.85|
|1/12/1995|0.7706| 100.0|
|1/13/1995|0.7613| 98.85|
|1/16/1995|0.7593|107.97|
|1/17/1995|0.7616|  99.0|
|1/18/1995|0.7596| 98.92|
|1/19/1995|0.7613| 99.45|
|1/20/1995|0.7674| 99.15|
|1/23/1995|0.7701|  99.9|
|1/24/1995|0.7698| 99.75|
|1/25/1995|0.7711|  99.6|
|1/26/1995|0.7683| 99.53|
|1/27/1995|0.7625|  99.3|
+---------+------+------+
only showing top 20 rows



In [7]:
from time import sleep
from json import dumps
from kafka import KafkaProducer
import random

def _to_json_bytes(payload):
    """Serialize a message value to UTF-8 encoded JSON bytes."""
    return dumps(payload).encode('utf-8')


# Producer connected to the local broker; every value sent through it is
# JSON-encoded by `_to_json_bytes`.
producer = KafkaProducer(
    bootstrap_servers=['localhost:9092'],
    value_serializer=_to_json_bytes,
)

In [8]:
from pyspark.sql import Window
from pyspark.sql.functions import monotonically_increasing_id, row_number

# monotonically_increasing_id() yields unique, increasing ids, but they are
# not consecutive once the data spans more than one partition. Ranking the
# rows by that id gives a gap-free, zero-based "index" column that is safe to
# look up with 0, 1, 2, ...
# Note: a window without partitionBy moves all rows into a single partition,
# which is acceptable for a small dataset like this one.
row_order = Window.orderBy(monotonically_increasing_id())
df_index = data.withColumn("index", row_number().over(row_order) - 1)

In [10]:
# Preview the first rows of the indexed frame (all columns)
df_index.show()

+---------+------+------+-----+
|     Date|   AUD|   JPY|index|
+---------+------+------+-----+
| 1/2/1995|0.7683|107.97|    0|
| 1/3/1995|0.7683|107.97|    1|
| 1/4/1995|0.7704|100.98|    2|
| 1/5/1995|0.7693| 101.0|    3|
| 1/6/1995|0.7699|100.95|    4|
| 1/9/1995|0.7658|101.05|    5|
|1/10/1995|0.7643|100.18|    6|
|1/11/1995| 0.767| 99.85|    7|
|1/12/1995|0.7706| 100.0|    8|
|1/13/1995|0.7613| 98.85|    9|
|1/16/1995|0.7593|107.97|   10|
|1/17/1995|0.7616|  99.0|   11|
|1/18/1995|0.7596| 98.92|   12|
|1/19/1995|0.7613| 99.45|   13|
|1/20/1995|0.7674| 99.15|   14|
|1/23/1995|0.7701|  99.9|   15|
|1/24/1995|0.7698| 99.75|   16|
|1/25/1995|0.7711|  99.6|   17|
|1/26/1995|0.7683| 99.53|   18|
|1/27/1995|0.7625|  99.3|   19|
+---------+------+------+-----+
only showing top 20 rows



In [21]:
# Stream every row, in index order, to the 'dfTest' Kafka topic at roughly one
# message per second.
#
# The rows are fetched from Spark once instead of running three
# filter-and-collect jobs per row, and the loop now includes the last row
# (the previous `range(count() - 1)` skipped it). The payload is bound to
# `message` so the `data` DataFrame is no longer overwritten by a dict.
rows = df_index.orderBy('index').select('Date', 'AUD', 'JPY').collect()

try:
    for row in rows:
        # Payload format: "<Date> <AUD> <JPY>"
        message = {'row': " ".join(str(row[name]) for name in ('Date', 'AUD', 'JPY'))}
        producer.send('dfTest', value=message)
        sleep(1)  # throttle to simulate a live feed
finally:
    # Push out anything still buffered, even if the cell is interrupted.
    producer.flush()

KeyboardInterrupt: 