In [1]:
import findspark

In [2]:
# Must run before the `import pyspark` in the next cell.
# NOTE(review): presumably this is what makes the local Spark install
# importable in this kernel — confirm against the findspark docs.
findspark.init()

In [3]:
import pyspark
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, when

In [4]:
# Reuse the running SparkContext if this cell is executed again; constructing
# a second SparkContext in the same kernel raises
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(
    pyspark.SparkConf().setAppName("ModelTraining")
)

In [5]:
# SQLContext is deprecated since Spark 3.0; a SparkSession is the supported
# entry point and exposes the same createDataFrame API used below.
# getOrCreate() attaches to the SparkContext created in the previous cell.
spark = SparkSession.builder.getOrCreate()

In [6]:
import pandas as pd
import numpy as np

In [7]:
def chunks(l, n):
    """Lazily split a sequence into consecutive slices of length ``n``.

    Args:
        l: Any sliceable sequence with a length (list, str, tuple, ...).
        n: Slice length; also the step between slice start offsets.

    Yields:
        ``l[start:start + n]`` for start = 0, n, 2n, ...  The final slice is
        shorter than ``n`` when ``len(l)`` is not a multiple of ``n``.
    """
    starts = range(0, len(l), n)
    yield from (l[start:start + n] for start in starts)

In [8]:
# Load the price file and take the 'Close' column as a plain Python list.
data = pd.read_csv("ibm.us.txt")
dataList = data['Close'].tolist()

# Render every close as a 4-decimal string. Note these are strings, not
# numbers, from here on.
dataListFormatted = ['%.4f' % close for close in dataList]

# Group the series into consecutive windows of 6 values; the last window is
# shorter when the series length is not a multiple of 6.
matrix = [window for window in chunks(dataListFormatted, 6)]
matrix

[['6.3378', '6.3963', '6.3295', '6.2041', '6.0870', '6.1621'],
 ['6.1707', '6.2376', '6.2543', '6.2792', '6.2128', '6.1125'],
 ['6.1291', '6.1374', '6.1208', '6.0624', '6.0956', '6.0287'],
 ['5.9951', '5.8952', '5.8201', '6.0036', '6.1457', '6.1875'],
 ['6.1040', '6.1125', '6.1208', '6.1208', '6.1707', '6.1707'],
 ['6.0703', '6.0624', '6.0373', '6.0373', '5.9703', '5.9787'],
 ['5.9285', '5.9537', '5.9285', '5.9370', '5.9537', '6.0208'],
 ['6.0703', '6.0538', '6.0538', '6.0790', '6.0870', '6.1291'],
 ['6.1208', '6.1208', '6.1040', '6.0956', '6.1208', '6.1374'],
 ['6.1040', '6.1040', '6.0373', '6.0624', '6.0624', '5.9454'],
 ['5.9787', '5.9622', '5.9037', '5.8282', '5.6781', '5.6528'],
 ['5.7782', '5.7698', '5.7782', '5.7949', '5.7197', '5.6614'],
 ['5.7453', '5.7367', '5.7616', '5.7533', '5.7533', '5.7453'],
 ['5.7453', '5.6781', '5.5361', '5.3774', '5.0353', '5.1189'],
 ['5.2356', '5.3441', '5.3858', '5.2689', '5.0937', '5.0017'],
 ['5.0184', '4.9431', '5.0353', '5.0099', '5.0685', '4.

In [9]:
# One row per 6-value window: five earlier closes (n5..n1) followed by the
# next close (n), which is only used to derive the label.
df = pd.DataFrame(matrix, columns=['n5', 'n4', 'n3', 'n2', 'n1', 'n'])

# `matrix` holds '%.4f'-formatted strings. Convert to numbers first so the
# comparison below is numeric rather than lexicographic
# (as strings, '10.0000' > '9.5000' is False).
df = df.apply(pd.to_numeric)

# label = 1.0 when the close after n1 is strictly higher than n1, else 0.0.
df["label"] = np.where(df['n'] > df['n1'], 1.0, 0.0)
print(df.shape)

# Drop the final row unconditionally (same as before). When the series length
# is not a multiple of 6, this is the incomplete window padded with missing
# values.
df = df.drop(df.tail(1).index)
print(df.shape)

# `n` was only needed for the label; keeping it would leak the answer.
df = df.drop(columns=['n'])
df.tail()


(2344, 7)
(2343, 7)


Unnamed: 0,n5,n4,n3,n2,n1,label
2338,143.64,145.21,145.33,145.03,145.27,0.0
2339,145.93,147.03,146.16,145.57,145.64,0.0
2340,145.09,157.95,159.31,160.47,157.97,0.0
2341,151.98,152.08,152.16,152.83,152.54,0.0
2342,151.83,150.08,149.35,149.85,150.07,1.0


In [10]:
from pyspark.sql.types import StructField, StructType, DoubleType
from pyspark.sql.functions import col
import pyspark.sql.functions as F

# Hand the pandas frame to Spark, then cast every column to double.
spark_df = spark.createDataFrame(df)
double_columns = [col(name).cast("double") for name in spark_df.columns]
df_train = spark_df.select(double_columns)
df_train.show()

+------+------+------+------+------+-----+
|    n5|    n4|    n3|    n2|    n1|label|
+------+------+------+------+------+-----+
|6.3378|6.3963|6.3295|6.2041| 6.087|  1.0|
|6.1707|6.2376|6.2543|6.2792|6.2128|  0.0|
|6.1291|6.1374|6.1208|6.0624|6.0956|  0.0|
|5.9951|5.8952|5.8201|6.0036|6.1457|  1.0|
| 6.104|6.1125|6.1208|6.1208|6.1707|  0.0|
|6.0703|6.0624|6.0373|6.0373|5.9703|  1.0|
|5.9285|5.9537|5.9285| 5.937|5.9537|  1.0|
|6.0703|6.0538|6.0538| 6.079| 6.087|  1.0|
|6.1208|6.1208| 6.104|6.0956|6.1208|  1.0|
| 6.104| 6.104|6.0373|6.0624|6.0624|  0.0|
|5.9787|5.9622|5.9037|5.8282|5.6781|  0.0|
|5.7782|5.7698|5.7782|5.7949|5.7197|  0.0|
|5.7453|5.7367|5.7616|5.7533|5.7533|  0.0|
|5.7453|5.6781|5.5361|5.3774|5.0353|  1.0|
|5.2356|5.3441|5.3858|5.2689|5.0937|  0.0|
|5.0184|4.9431|5.0353|5.0099|5.0685|  0.0|
|5.0017|4.9849|4.8182|4.6008|4.4089|  1.0|
|3.9997|4.2417|4.3504| 4.267|3.9913|  1.0|
|4.0497|   4.1|4.1169|3.9829|3.7159|  0.0|
|3.3901|3.6992|3.6572|3.7241|3.6321|  0.0|
+------+---

In [11]:
# Check the column types after the cast to double in the previous cell.
df_train.printSchema()

root
 |-- n5: double (nullable = true)
 |-- n4: double (nullable = true)
 |-- n3: double (nullable = true)
 |-- n2: double (nullable = true)
 |-- n1: double (nullable = true)
 |-- label: double (nullable = true)



In [12]:
# Pack the five lagged-close columns into a single "features" vector column.
feature_columns = ['n5', 'n4', 'n3', 'n2', 'n1']
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [13]:
# Logistic regression that reads the assembled "features" column and is
# trained against "label".
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
    regParam=0.01,
)

# Two-stage pipeline: assemble the feature vector, then classify.
stages = [assembler, lr]
pipeline = Pipeline(stages=stages)

In [14]:
# Fit both pipeline stages (assembler, then logistic regression) on the full
# training frame; no hold-out split is made in this notebook.
model = pipeline.fit(df_train)
model

PipelineModel_d71c425ebdd0

In [15]:
# Score the same rows the model was trained on and inspect the added columns.
# NOTE(review): this rebinds `df`, which until now was the pandas frame from
# cell 9, to a Spark DataFrame. Re-running cell 10 after this cell would pass
# a Spark DataFrame to createDataFrame; consider a distinct name.
df = model.transform(df_train)
df.printSchema()

root
 |-- n5: double (nullable = true)
 |-- n4: double (nullable = true)
 |-- n3: double (nullable = true)
 |-- n2: double (nullable = true)
 |-- n1: double (nullable = true)
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [16]:
# Persist the fitted pipeline under "FINAL_MODEL" (relative path), replacing
# any model previously saved there.
model.write().overwrite().save("FINAL_MODEL")