In [47]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, expr, when
sqlContext = SQLContext(sc)

# Prepare training data from a list of (label, features) tuples.
training = spark.createDataFrame([
    (1.0, Vectors.dense([0.0, 1.1, 0.1])),
    (0.0, Vectors.dense([2.0, 1.0, -1.0])),
    (0.0, Vectors.dense([2.0, 1.3, 1.0])),
    (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])

training.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [48]:
import pandas as pd
import numpy as np

In [49]:
def chunks(l, n):
    # For item i in a range that is a length of l,
    for i in range(0, len(l), n):
        # Create an index range for l of n items:
        yield l[i:i+n]

In [50]:
data = pd.read_csv("aac.us.txt")
dataList = data['Close'].tolist()
dataListFormatted = [ '%.4f' % elem for elem in dataList ]
dataListFormatted

matrix = list(chunks(dataListFormatted, 6))
matrix

[['18.5000', '18.6500', '19.2400', '19.2400', '19.1300', '19.2500'],
 ['19.0000', '19.0000', '18.1000', '19.4300', '19.4000', '19.0800'],
 ['19.0300', '19.3200', '20.5200', '21.0500', '22.0000', '22.0300'],
 ['22.0000', '21.3900', '21.2000', '21.7800', '22.0000', '21.8500'],
 ['21.6000', '23.3500', '25.5000', '26.4800', '24.9600', '25.0000'],
 ['26.7600', '26.4600', '28.0300', '29.3500', '29.1400', '27.0200'],
 ['26.0400', '26.6000', '28.4400', '28.7800', '28.8600', '29.5900'],
 ['31.6100', '31.1100', '30.4600', '31.7100', '31.0000', '32.4100'],
 ['29.5900', '29.7800', '29.9600', '29.7000', '29.4000', '29.7500'],
 ['29.7900', '29.5000', '29.6000', '29.5000', '29.3500', '29.7600'],
 ['29.8900', '30.9600', '30.9200', '30.2100', '29.7300', '29.3800'],
 ['28.6400', '29.3700', '30.1300', '29.2700', '29.7300', '29.6500'],
 ['28.7000', '30.1300', '29.0600', '28.9600', '29.0700', '27.8900'],
 ['28.3000', '27.7200', '26.6200', '26.2700', '25.5200', '25.5900'],
 ['27.4600', '27.3400', '28.5000',

In [51]:
df = pd.DataFrame(matrix, columns =['n5', 'n4', 'n3', 'n2', 'n1', 'n'])
df["label"] = np.where((df['n'] > df['n1']), 1.0, 0.0)
print(df.shape)
df.drop(df.tail(1).index,inplace=True)
print(df.shape)
df.tail()
print(df.shape)
df

(131, 7)
(130, 7)
(130, 7)


Unnamed: 0,n5,n4,n3,n2,n1,n,label
0,18.5000,18.6500,19.2400,19.2400,19.1300,19.2500,1.0
1,19.0000,19.0000,18.1000,19.4300,19.4000,19.0800,0.0
2,19.0300,19.3200,20.5200,21.0500,22.0000,22.0300,1.0
3,22.0000,21.3900,21.2000,21.7800,22.0000,21.8500,0.0
4,21.6000,23.3500,25.5000,26.4800,24.9600,25.0000,1.0
5,26.7600,26.4600,28.0300,29.3500,29.1400,27.0200,0.0
6,26.0400,26.6000,28.4400,28.7800,28.8600,29.5900,1.0
7,31.6100,31.1100,30.4600,31.7100,31.0000,32.4100,1.0
8,29.5900,29.7800,29.9600,29.7000,29.4000,29.7500,1.0
9,29.7900,29.5000,29.6000,29.5000,29.3500,29.7600,1.0


In [52]:
from pyspark.sql.types import StructField, StructType, DoubleType
from pyspark.sql.functions import col
import pyspark.sql.functions as F

spark_df = sqlContext.createDataFrame(df)
df_train = spark_df.select([col(c).cast("double") for c in spark_df.columns])
df_train.show()

+-----+-----+-----+-----+-----+-----+-----+
|   n5|   n4|   n3|   n2|   n1|    n|label|
+-----+-----+-----+-----+-----+-----+-----+
| 18.5|18.65|19.24|19.24|19.13|19.25|  1.0|
| 19.0| 19.0| 18.1|19.43| 19.4|19.08|  0.0|
|19.03|19.32|20.52|21.05| 22.0|22.03|  1.0|
| 22.0|21.39| 21.2|21.78| 22.0|21.85|  0.0|
| 21.6|23.35| 25.5|26.48|24.96| 25.0|  1.0|
|26.76|26.46|28.03|29.35|29.14|27.02|  0.0|
|26.04| 26.6|28.44|28.78|28.86|29.59|  1.0|
|31.61|31.11|30.46|31.71| 31.0|32.41|  1.0|
|29.59|29.78|29.96| 29.7| 29.4|29.75|  1.0|
|29.79| 29.5| 29.6| 29.5|29.35|29.76|  1.0|
|29.89|30.96|30.92|30.21|29.73|29.38|  0.0|
|28.64|29.37|30.13|29.27|29.73|29.65|  0.0|
| 28.7|30.13|29.06|28.96|29.07|27.89|  0.0|
| 28.3|27.72|26.62|26.27|25.52|25.59|  1.0|
|27.46|27.34| 28.5|28.66|28.15|27.92|  0.0|
|27.91| 28.2|29.11|29.36|29.45|29.38|  0.0|
|30.39| 32.1|33.26|37.33|36.61|35.99|  0.0|
|34.27|30.73|32.66| 32.1| 29.3| 26.2|  0.0|
| 25.8|24.65|26.13|27.74|28.79|29.54|  1.0|
|29.77|30.19|30.71| 28.8|28.14|2

In [53]:
assembler = VectorAssembler(
    inputCols=["n5", "n4", "n3", "n2", "n1"],
    outputCol="features")
df_train_assembled = assembler.transform(df_train)
df_train_assembled.show()

+-----+-----+-----+-----+-----+-----+-----+--------------------+
|   n5|   n4|   n3|   n2|   n1|    n|label|            features|
+-----+-----+-----+-----+-----+-----+-----+--------------------+
| 18.5|18.65|19.24|19.24|19.13|19.25|  1.0|[18.5,18.65,19.24...|
| 19.0| 19.0| 18.1|19.43| 19.4|19.08|  0.0|[19.0,19.0,18.1,1...|
|19.03|19.32|20.52|21.05| 22.0|22.03|  1.0|[19.03,19.32,20.5...|
| 22.0|21.39| 21.2|21.78| 22.0|21.85|  0.0|[22.0,21.39,21.2,...|
| 21.6|23.35| 25.5|26.48|24.96| 25.0|  1.0|[21.6,23.35,25.5,...|
|26.76|26.46|28.03|29.35|29.14|27.02|  0.0|[26.76,26.46,28.0...|
|26.04| 26.6|28.44|28.78|28.86|29.59|  1.0|[26.04,26.6,28.44...|
|31.61|31.11|30.46|31.71| 31.0|32.41|  1.0|[31.61,31.11,30.4...|
|29.59|29.78|29.96| 29.7| 29.4|29.75|  1.0|[29.59,29.78,29.9...|
|29.79| 29.5| 29.6| 29.5|29.35|29.76|  1.0|[29.79,29.5,29.6,...|
|29.89|30.96|30.92|30.21|29.73|29.38|  0.0|[29.89,30.96,30.9...|
|28.64|29.37|30.13|29.27|29.73|29.65|  0.0|[28.64,29.37,30.1...|
| 28.7|30.13|29.06|28.96|

In [54]:
lr = LogisticRegression(maxIter=10, regParam=0.01)
model = lr.fit(df_train_assembled)

paramMap = {lr.maxIter: 30}
paramMap.update({lr.regParam: 0.1, lr.threshold: 0.55})

model2 = lr.fit(df_train_assembled, paramMap)

In [55]:
test = spark.createDataFrame([
    (1.0, Vectors.dense(6.3378,6.3963,6.3295,6.2041, 6.087,6.1621))], ["label", "features"])
test.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  1.0|[6.3378,6.3963,6....|
+-----+--------------------+



In [60]:
prediction = model2.transform(df_train_assembled)
model2.save("LRModel")
prediction.collect()
prediction.show()

+-----+-----+-----+-----+-----+-----+-----+--------------------+--------------------+--------------------+----------+
|   n5|   n4|   n3|   n2|   n1|    n|label|            features|       rawPrediction|         probability|prediction|
+-----+-----+-----+-----+-----+-----+-----+--------------------+--------------------+--------------------+----------+
| 18.5|18.65|19.24|19.24|19.13|19.25|  1.0|[18.5,18.65,19.24...|[-0.2957583121454...|[0.42659472500629...|       1.0|
| 19.0| 19.0| 18.1|19.43| 19.4|19.08|  0.0|[19.0,19.0,18.1,1...|[-0.2956651537178...|[0.42661751280024...|       1.0|
|19.03|19.32|20.52|21.05| 22.0|22.03|  1.0|[19.03,19.32,20.5...|[-0.2675617486956...|[0.43350577981689...|       1.0|
| 22.0|21.39| 21.2|21.78| 22.0|21.85|  0.0|[22.0,21.39,21.2,...|[-0.2341630427037...|[0.44172527454785...|       1.0|
| 21.6|23.35| 25.5|26.48|24.96| 25.0|  1.0|[21.6,23.35,25.5,...|[-0.1730644488039...|[0.45684155503768...|       0.0|
|26.76|26.46|28.03|29.35|29.14|27.02|  0.0|[26.76,26.46,

In [61]:
result = prediction.select("features", "label", "probability", "prediction").collect()
for row in result:
    print("features=%s, label=%s -> prob=%s, prediction=%s"
          % (row.features, row.label, row.probability, row.prediction))

features=[18.5,18.65,19.24,19.24,19.13], label=1.0 -> prob=[0.42659472500629164,0.5734052749937084], prediction=1.0
features=[19.0,19.0,18.1,19.43,19.4], label=0.0 -> prob=[0.4266175128002429,0.5733824871997572], prediction=1.0
features=[19.03,19.32,20.52,21.05,22.0], label=1.0 -> prob=[0.43350577981689326,0.5664942201831067], prediction=1.0
features=[22.0,21.39,21.2,21.78,22.0], label=0.0 -> prob=[0.44172527454785426,0.5582747254521457], prediction=1.0
features=[21.6,23.35,25.5,26.48,24.96], label=1.0 -> prob=[0.45684155503768914,0.5431584449623108], prediction=0.0
features=[26.76,26.46,28.03,29.35,29.14], label=0.0 -> prob=[0.4766449200317283,0.5233550799682717], prediction=0.0
features=[26.04,26.6,28.44,28.78,28.86], label=1.0 -> prob=[0.4754351977113651,0.5245648022886349], prediction=0.0
features=[31.61,31.11,30.46,31.71,31.0], label=1.0 -> prob=[0.49597612750332853,0.5040238724966714], prediction=0.0
features=[29.59,29.78,29.96,29.7,29.4], label=1.0 -> prob=[0.4874626061000948,0.