In [2]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.sql.types import *
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
import pandas as pd

In [3]:
import mleap.pyspark

In [4]:
spark = SparkSession.builder.master("local").enableHiveSupport().getOrCreate()

In [5]:
df = pd.read_csv("auto-miles-per-gallon.csv",sep=",")

In [6]:
df.head(40)

Unnamed: 0,MPG,CYLINDERS,DISPLACEMENT,HORSEPOWER,WEIGHT,ACCELERATION,MODELYEAR,NAME
0,118.0,8,307.0,130,3504,12.0,70,chevrolet chevelle malibu
1,115.0,8,350.0,165,3693,11.5,70,buick skylark 320
2,118.0,8,318.0,150,3436,11.0,70,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,ford torino
5,15.0,8,429.0,198,4341,10.0,70,ford galaxie 500
6,14.0,8,454.0,220,4354,9.0,70,chevrolet impala
7,14.0,8,440.0,215,4312,8.5,70,plymouth fury iii
8,14.0,8,455.0,225,4425,10.0,70,pontiac catalina
9,15.0,8,390.0,190,3850,8.5,70,amc ambassador dpl


In [7]:
df.replace('?' , 10).head(40)

Unnamed: 0,MPG,CYLINDERS,DISPLACEMENT,HORSEPOWER,WEIGHT,ACCELERATION,MODELYEAR,NAME
0,118.0,8,307.0,130,3504,12.0,70,chevrolet chevelle malibu
1,115.0,8,350.0,165,3693,11.5,70,buick skylark 320
2,118.0,8,318.0,150,3436,11.0,70,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,ford torino
5,15.0,8,429.0,198,4341,10.0,70,ford galaxie 500
6,14.0,8,454.0,220,4354,9.0,70,chevrolet impala
7,14.0,8,440.0,215,4312,8.5,70,plymouth fury iii
8,14.0,8,455.0,225,4425,10.0,70,pontiac catalina
9,15.0,8,390.0,190,3850,8.5,70,amc ambassador dpl


In [8]:
schema = StructType([
StructField("mpg", FloatType(), False),
StructField("cylinders", IntegerType(), False),
StructField("displacement", FloatType(), False),
StructField("horsepower", IntegerType(), False),
StructField("weight", IntegerType(), False),
StructField("acceleration", FloatType(), False),
StructField("modelyear", IntegerType(), False),
StructField("name", StringType(), False)])

In [9]:
dataset=spark.createDataFrame(df, schema)

In [10]:
categoricalColumns = ["name"]
stages = [] # stages in our Pipeline
for categoricalCol in categoricalColumns:
  # Category Indexing with StringIndexer
  stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol+"Index")
  # Use OneHotEncoder to convert categorical variables into binary SparseVectors
  encoder = OneHotEncoder(inputCol=categoricalCol+"Index", outputCol=categoricalCol+"classVec")
  # Add stages.  These are not run here, but will run all at once later on.
  stages += [stringIndexer, encoder]

In [12]:
numericCols = ["cylinders","displacement","horsepower", "weight", "acceleration","modelyear"]
assemblerInputs = list(map(lambda c: c + "classVec", categoricalColumns)) + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [13]:
# Create a StandardScaler for the features

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",withStd=True, withMean=False)
stages += [scaler]

In [14]:
# Convert label into label indices using the StringIndexer
label_stringIdx = StringIndexer(inputCol = "mpg", outputCol = "label")
stages += [label_stringIdx]

In [15]:
lr = LinearRegression(labelCol="label",featuresCol="scaledFeatures",maxIter=10, regParam=0.3, elasticNetParam=0.8)
stages += [lr]

In [16]:
# Define Pipeline
lr_pipeline = Pipeline(stages=stages)
lrModel = lr_pipeline.fit(dataset)
prediction=lrModel.transform(dataset)
prediction.toPandas()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,modelyear,name,nameIndex,nameclassVec,features,scaledFeatures,label,prediction
0,118.0,8,307.0,130,3504,12.0,70,chevrolet chevelle malibu,53.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",54.0,29.115193
1,115.0,8,350.0,165,3693,11.5,70,buick skylark 320,280.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",77.0,71.318188
2,118.0,8,318.0,150,3436,11.0,70,plymouth satellite,174.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",54.0,49.565179
3,16.0,8,304.0,150,3433,12.0,70,amc rebel sst,102.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",5.0,-0.934539
4,17.0,8,302.0,140,3449,10.5,70,ford torino,61.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",17.0,11.207021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5965,27.0,4,140.0,86,2790,15.6,82,ford mustang gl,268.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",13.0,19.436733
5966,44.0,4,97.0,52,2130,24.6,82,vw pickup,236.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",110.0,105.507974
5967,32.0,4,135.0,84,2295,11.6,82,dodge rampage,62.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",20.0,25.559452
5968,28.0,4,120.0,79,2625,18.6,82,ford ranger,266.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",9.0,14.960101


In [17]:
# Define Pipeline
lr_pipeline = Pipeline(stages=stages)
lrModel = lr_pipeline.fit(dataset)
prediction=lrModel.transform(dataset)
prediction.toPandas()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,modelyear,name,nameIndex,nameclassVec,features,scaledFeatures,label,prediction
0,118.0,8,307.0,130,3504,12.0,70,chevrolet chevelle malibu,53.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",54.0,29.115193
1,115.0,8,350.0,165,3693,11.5,70,buick skylark 320,280.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",77.0,71.318188
2,118.0,8,318.0,150,3436,11.0,70,plymouth satellite,174.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",54.0,49.565179
3,16.0,8,304.0,150,3433,12.0,70,amc rebel sst,102.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",5.0,-0.934539
4,17.0,8,302.0,140,3449,10.5,70,ford torino,61.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",17.0,11.207021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5965,27.0,4,140.0,86,2790,15.6,82,ford mustang gl,268.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",13.0,19.436733
5966,44.0,4,97.0,52,2130,24.6,82,vw pickup,236.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",110.0,105.507974
5967,32.0,4,135.0,84,2295,11.6,82,dodge rampage,62.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",20.0,25.559452
5968,28.0,4,120.0,79,2625,18.6,82,ford ranger,266.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",9.0,14.960101


In [18]:
predictionAndLabels = prediction.select("prediction","label")
predictionAndLabels.show()

+-------------------+-----+
|         prediction|label|
+-------------------+-----+
| 29.115193416927696| 54.0|
|  71.31818778238159| 77.0|
|  49.56517867808071| 54.0|
|-0.9345386123603703|  5.0|
| 11.207020980495372| 17.0|
|  -4.54419636310422|  2.0|
|-2.1057396992501367|  1.0|
| -3.867210494566166|  1.0|
| -5.862998985399713|  1.0|
| -3.016574225923762|  2.0|
|-3.5485512611838317|  2.0|
| -1.850516992776022|  1.0|
|-3.5452234288663362|  2.0|
|  37.76500280273194|  1.0|
|  2.665888225624883|  8.0|
|-2.2590767514711843| 11.0|
|  18.08213505216753|  3.0|
| -5.081823094394963| 14.0|
|  6.533703637219105| 13.0|
|-1.4477706055261024|  4.0|
+-------------------+-----+
only showing top 20 rows



In [19]:
lrModel.write().overwrite().save("hdfs://localhost/home/opentext/bda/ML_Model/LinearRegression")