In [1]:
import pyspark as spark
from pyspark import SparkContext
# initialize a new Spark Context to use for the execution of the script
sc = SparkContext(appName="MY-APP-NAME", master="local[*]")
# prevent useless logging messages
sc.setLogLevel("ERROR")

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, FloatType, IntegerType
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor, LinearRegression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator, MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.classification import FMClassifier
import numpy as np
from pyspark.mllib.clustering import KMeans
import matplotlib.pyplot as plt
from pyspark.sql.functions import col
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
import matplotlib.pyplot as plt

spark = SparkSession.builder \
    .master("local") \
    .appName("appName") \
    .getOrCreate()

In [3]:
def performance_metrics(predicted_values):
    eval = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")

    #Root Mean Square Error
    rmse = eval.evaluate(predicted_values)
    print("RMSE: %.3f" % rmse)

    # Mean Square Error
    mse = eval.evaluate(predicted_values, {eval.metricName: "mse"})
    print("MSE: %.3f" % mse)

    # Mean Absolute Error
    mae = eval.evaluate(predicted_values, {eval.metricName: "mae"})
    print("MAE: %.3f" % mae)

    # r2 - coefficient of determination
    r2 = eval.evaluate(predicted_values, {eval.metricName: "r2"})
    print("r2: %.3f" %r2)

In [4]:
rf = DecisionTreeRegressor(seed = 69)

paramGrid = ParamGridBuilder()\
.addGrid(rf.maxBins, list(np.arange(2, 50, 2)))\
.addGrid(rf.maxDepth, list(np.arange(2, 28, 2)))\
.build()

tvs = TrainValidationSplit(estimator=rf,
                           estimatorParamMaps=paramGrid,
                           evaluator=RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2"), 
                           trainRatio=0.8)

In [5]:
data = spark.read.options(header='true', inferschema='true', delimiter=',').csv("data/df.csv")
data.show(5)

+----------+---+---+-------------------+------------------+------------------+------------------+------------------+------------------+
|breath_ids|  R|  C|             m_u_in|            q_u_in|           m_u_out|           q_u_out|        m_pressure|        q_pressure|
+----------+---+---+-------------------+------------------+------------------+------------------+------------------+------------------+
|       148| 50| 10| -0.627586555937512| 3.279187855644647| 3.583992459539968|-4.387057482814503| 1.712044550044049| 4.787830588906654|
|       463| 50| 10| -2.730341695467989|11.496348651324867|3.5980943797891185|-4.875283862177274| 5.452311677836793|14.032851614737082|
|       471| 20| 20|-1.1012242363556006|10.255015772333854| 3.589597364806948|-4.547209213742605|2.9888939958281715|  7.08478856513968|
|       496| 20| 20| -17.10135369945869| 54.39589825365896|3.5900412844154244| -4.54881661950235| 2.781362935991756| 18.63630522318597|
|      1238| 50| 50| -4.046307639606464|34.68797

# Regression #1 
## Regress m and q parameters for the pressure attribute indepentently

## Regression m_pressure

In [6]:
features = ['breath_ids', 'R', 'C', 'm_u_in', 'q_u_in', 'm_u_out', 'q_u_out']
lr_data = data.select(col('m_pressure').alias("label"), *features)

In [7]:
train, test = lr_data.randomSplit([0.8, 0.2], seed=69)

In [8]:
train.show(5)

+------------------+----------+---+---+-------------------+------------------+------------------+------------------+
|             label|breath_ids|  R|  C|             m_u_in|            q_u_in|           m_u_out|           q_u_out|
+------------------+----------+---+---+-------------------+------------------+------------------+------------------+
|-7.866856074833947|     88590| 50| 20| -25.41676240979036| 65.80159905394707| 3.594318209183017|-4.709952037521332|
|-7.663688146587497|    116649| 50| 20|-21.639086214137617| 54.87565125780993| 3.594773597953297|-4.709796481642885|
|-7.554157606521539|     20210| 50| 20| -21.45036564244344|53.970380395461575|  3.59429508158818|-4.713761437371403|
|-7.507951297643809|    124292| 50| 20|-21.908552023117497| 55.37320489748355| 3.594386690014932|-4.714122611281731|
|-7.476816741717444|      9323| 50| 20| -21.41138677291981| 53.63256385162418|3.5943172325696486|-4.712043390372973|
+------------------+----------+---+---+-------------------+-----

In [9]:
#prepare the data and the pipeline
for_prediction = features.copy()
for_prediction.remove('breath_ids')
vectorAssembler = VectorAssembler(inputCols=for_prediction, outputCol="unscaled_features")
standardScaler = StandardScaler(inputCol="unscaled_features", outputCol="features")
stages = [vectorAssembler, standardScaler, tvs]
pipeline = Pipeline(stages=stages)

### Models performances for m_pressure

In [10]:
model = pipeline.fit(train)
prediction = model.transform(test)

In [11]:
performance_metrics(prediction)

RMSE: 1.144
MSE: 1.308
MAE: 0.780
r2: 0.816


In [12]:
print('Best Param (MaxBins): ',rf.getMaxBins())
print('Best Param (MaxDepth): ',rf.getMaxDepth())

Best Param (MaxBins):  32
Best Param (MaxDepth):  5


### Get predicted m to the original dataset

In [13]:
m_pressure = model.transform(lr_data)
m_press_predicted = m_pressure.select([col("breath_ids").alias("b_id"), 'prediction'])

## Regression for q_pressure

In [14]:
lr_data = data.select(col('q_pressure').alias("label"), *features)

In [15]:
train, test = lr_data.randomSplit([0.8, 0.2], seed=69)

In [16]:
#prepare the data and the pipeline
vectorAssembler = VectorAssembler(inputCols=for_prediction, outputCol="unscaled_features")
standardScaler = StandardScaler(inputCol="unscaled_features", outputCol="features")
stages = [vectorAssembler, standardScaler, tvs]
pipeline = Pipeline(stages=stages)

### Test performances on q_pressure

In [17]:
model = pipeline.fit(train)
prediction = model.transform(test)

In [18]:
print(for_prediction)

['R', 'C', 'm_u_in', 'q_u_in', 'm_u_out', 'q_u_out']


In [19]:
performance_metrics(prediction)

RMSE: 1.340
MSE: 1.797
MAE: 0.908
r2: 0.954


In [20]:
print('Best Param (MaxBins): ',rf.getMaxBins())
print('Best Param (MaxDepth): ',rf.getMaxDepth())

Best Param (MaxBins):  32
Best Param (MaxDepth):  5


### Get predicted q to the original dataset 

In [21]:
q_pressure = model.transform(lr_data)
q_press_predicted = q_pressure.select([col("breath_ids").alias("b_id"), 'prediction'])

# Output from the first regression technique

In [22]:
df_regressed_1 = data.select(features)
df_regressed_1 = df_regressed_1.join(m_press_predicted, m_press_predicted.b_id == df_regressed_1.breath_ids).select(*df_regressed_1.columns, col('prediction').alias('m_pressure'))
df_regressed_1 = df_regressed_1.join(q_press_predicted, q_press_predicted.b_id == df_regressed_1.breath_ids).select(*df_regressed_1.columns, col('prediction').alias('q_pressure'))
df_regressed_1.show(5)

+----------+---+---+-------------------+------------------+------------------+------------------+------------------+------------------+
|breath_ids|  R|  C|             m_u_in|            q_u_in|           m_u_out|           q_u_out|        m_pressure|        q_pressure|
+----------+---+---+-------------------+------------------+------------------+------------------+------------------+------------------+
|       148| 50| 10| -0.627586555937512| 3.279187855644647| 3.583992459539968|-4.387057482814503| 3.215810819051385|  7.85657577102829|
|       463| 50| 10| -2.730341695467989|11.496348651324867|3.5980943797891185|-4.875283862177274| 5.485038861247911|15.784057462340838|
|       471| 20| 20|-1.1012242363556006|10.255015772333854| 3.589597364806948|-4.547209213742605|2.9408138835315714| 7.889961357068623|
|       496| 20| 20| -17.10135369945869| 54.39589825365896|3.5900412844154244| -4.54881661950235| 3.791803684217059|22.404974025738664|
|      1238| 50| 50| -4.046307639606464|34.68797

# Regression #2
### 1. Regress the m parameter for pressure
### 2. Include the predicted value of m in the training set when predicting the q

In [23]:
data.show(5)

+----------+---+---+-------------------+------------------+------------------+------------------+------------------+------------------+
|breath_ids|  R|  C|             m_u_in|            q_u_in|           m_u_out|           q_u_out|        m_pressure|        q_pressure|
+----------+---+---+-------------------+------------------+------------------+------------------+------------------+------------------+
|       148| 50| 10| -0.627586555937512| 3.279187855644647| 3.583992459539968|-4.387057482814503| 1.712044550044049| 4.787830588906654|
|       463| 50| 10| -2.730341695467989|11.496348651324867|3.5980943797891185|-4.875283862177274| 5.452311677836793|14.032851614737082|
|       471| 20| 20|-1.1012242363556006|10.255015772333854| 3.589597364806948|-4.547209213742605|2.9888939958281715|  7.08478856513968|
|       496| 20| 20| -17.10135369945869| 54.39589825365896|3.5900412844154244| -4.54881661950235| 2.781362935991756| 18.63630522318597|
|      1238| 50| 50| -4.046307639606464|34.68797

In [24]:
# Append to the original dataset the m_pressure field calculated from the step before
df_m = data.select(features)
df_m = data.join(m_press_predicted, m_press_predicted.b_id == df_m.breath_ids).select(*df_m.columns, col('prediction').alias('m_pressure'))
df_m.show(5)

+----------+---+---+-------------------+------------------+------------------+------------------+------------------+
|breath_ids|  R|  C|             m_u_in|            q_u_in|           m_u_out|           q_u_out|        m_pressure|
+----------+---+---+-------------------+------------------+------------------+------------------+------------------+
|       148| 50| 10| -0.627586555937512| 3.279187855644647| 3.583992459539968|-4.387057482814503| 3.215810819051385|
|       463| 50| 10| -2.730341695467989|11.496348651324867|3.5980943797891185|-4.875283862177274| 5.485038861247911|
|       471| 20| 20|-1.1012242363556006|10.255015772333854| 3.589597364806948|-4.547209213742605|2.9408138835315714|
|       496| 20| 20| -17.10135369945869| 54.39589825365896|3.5900412844154244| -4.54881661950235| 3.791803684217059|
|      1238| 50| 50| -4.046307639606464|34.687972375171306|3.5981683352545537|-4.872868892058087|2.0285150589526952|
+----------+---+---+-------------------+------------------+-----

In [25]:
m_features = ['breath_ids', 'R', 'C', 'm_u_in', 'q_u_in', 'm_u_out', 'q_u_out', 'm_pressure']
q = data.select('q_pressure', col('breath_ids').alias('b_id'))
lr_data = df_m.join(q, q.b_id == df_m.breath_ids).select(*df_m.columns, col('q_pressure').alias("label"))
lr_data.show(5)

+----------+---+---+-------------------+------------------+------------------+------------------+------------------+------------------+
|breath_ids|  R|  C|             m_u_in|            q_u_in|           m_u_out|           q_u_out|        m_pressure|             label|
+----------+---+---+-------------------+------------------+------------------+------------------+------------------+------------------+
|       148| 50| 10| -0.627586555937512| 3.279187855644647| 3.583992459539968|-4.387057482814503| 3.215810819051385| 4.787830588906654|
|       463| 50| 10| -2.730341695467989|11.496348651324867|3.5980943797891185|-4.875283862177274| 5.485038861247911|14.032851614737082|
|       471| 20| 20|-1.1012242363556006|10.255015772333854| 3.589597364806948|-4.547209213742605|2.9408138835315714|  7.08478856513968|
|       496| 20| 20| -17.10135369945869| 54.39589825365896|3.5900412844154244| -4.54881661950235| 3.791803684217059| 18.63630522318597|
|      1238| 50| 50| -4.046307639606464|34.68797

In [26]:
train, test = lr_data.randomSplit([0.8, 0.2], seed=69)

In [27]:
#prepare the data and the pipeline
for_prediction = m_features.copy()
for_prediction.remove('breath_ids')
vectorAssembler = VectorAssembler(inputCols=for_prediction, outputCol="unscaled_features")
standardScaler = StandardScaler(inputCol="unscaled_features", outputCol="features")
stages = [vectorAssembler, standardScaler, tvs]
pipeline = Pipeline(stages=stages)

## Regression for q_pressure

In [28]:
model = pipeline.fit(train)
prediction = model.transform(test)

In [29]:
performance_metrics(prediction)

RMSE: 1.406
MSE: 1.976
MAE: 0.917
r2: 0.949


In [30]:
prediction.show(5)

+----------+---+---+-------------------+-----------------+------------------+------------------+--------------------+------------------+--------------------+--------------------+------------------+
|breath_ids|  R|  C|             m_u_in|           q_u_in|           m_u_out|           q_u_out|          m_pressure|             label|   unscaled_features|            features|        prediction|
+----------+---+---+-------------------+-----------------+------------------+------------------+--------------------+------------------+--------------------+--------------------+------------------+
|       133|  5| 10|  -19.6014207492924|46.98668700460304| 3.594566274579692| -4.71231980483322|   5.038300569949913|18.158814395307186|[5.0,10.0,-19.601...|[0.25558208403815...|17.067438719442084|
|       177|  5| 50| -23.32346938584597|71.05051858227964|3.5843312532246725|-4.381444835800677|   3.044746695101259|11.163875805026365|[5.0,50.0,-23.323...|[0.25558208403815...|11.073593311822473|
|       19

## Output from the second regression technique

In [31]:
q_pressure.show(5)

+------------------+----------+---+---+-------------------+------------------+------------------+------------------+--------------------+--------------------+------------------+
|             label|breath_ids|  R|  C|             m_u_in|            q_u_in|           m_u_out|           q_u_out|   unscaled_features|            features|        prediction|
+------------------+----------+---+---+-------------------+------------------+------------------+------------------+--------------------+--------------------+------------------+
| 4.787830588906654|       148| 50| 10| -0.627586555937512| 3.279187855644647| 3.583992459539968|-4.387057482814503|[50.0,10.0,-0.627...|[2.55557426626199...|  7.85657577102829|
|14.032851614737082|       463| 50| 10| -2.730341695467989|11.496348651324867|3.5980943797891185|-4.875283862177274|[50.0,10.0,-2.730...|[2.55557426626199...|15.784057462340838|
|  7.08478856513968|       471| 20| 20|-1.1012242363556006|10.255015772333854| 3.589597364806948|-4.5472092137

In [32]:
q_pressure = model.transform(lr_data)
q_press_predicted = q_pressure.select([col("breath_ids").alias("b_id"), 'prediction'])

In [33]:
df_regressed_2 = data.select(m_features)
df_regressed_2 = df_regressed_2.join(q_press_predicted, q_press_predicted.b_id == df_regressed_2.breath_ids).select(*df_regressed_2.columns, col('prediction').alias('q_pressure'))
df_regressed_2.show(5)

+----------+---+---+-------------------+------------------+------------------+------------------+------------------+-----------------+
|breath_ids|  R|  C|             m_u_in|            q_u_in|           m_u_out|           q_u_out|        m_pressure|       q_pressure|
+----------+---+---+-------------------+------------------+------------------+------------------+------------------+-----------------+
|       148| 50| 10| -0.627586555937512| 3.279187855644647| 3.583992459539968|-4.387057482814503| 1.712044550044049|4.518401548024195|
|       463| 50| 10| -2.730341695467989|11.496348651324867|3.5980943797891185|-4.875283862177274| 5.452311677836793|16.73096260832713|
|       471| 20| 20|-1.1012242363556006|10.255015772333854| 3.589597364806948|-4.547209213742605|2.9888939958281715|7.584112548758221|
|       496| 20| 20| -17.10135369945869| 54.39589825365896|3.5900412844154244| -4.54881661950235| 2.781362935991756|18.64874221118432|
|      1238| 50| 50| -4.046307639606464|34.687972375171

# Regression #3 
### 1. Regress the q parameter for pressure 
### 2. Include the predicted value of q in the trianning set when predicting the m

In [34]:
# Append to the original dataset the q_pressure field calculated from the step before
df_q = data.select(features)
df_q = df_q.join(q_press_predicted, q_press_predicted.b_id == df_m.breath_ids).select(*df_q.columns, col('prediction').alias('q_pressure'))
df_q.show(5)

+----------+---+---+-------------------+------------------+------------------+------------------+-----------------+
|breath_ids|  R|  C|             m_u_in|            q_u_in|           m_u_out|           q_u_out|       q_pressure|
+----------+---+---+-------------------+------------------+------------------+------------------+-----------------+
|       148| 50| 10| -0.627586555937512| 3.279187855644647| 3.583992459539968|-4.387057482814503|4.518401548024195|
|       463| 50| 10| -2.730341695467989|11.496348651324867|3.5980943797891185|-4.875283862177274|16.73096260832713|
|       471| 20| 20|-1.1012242363556006|10.255015772333854| 3.589597364806948|-4.547209213742605|7.584112548758221|
|       496| 20| 20| -17.10135369945869| 54.39589825365896|3.5900412844154244| -4.54881661950235|18.64874221118432|
|      1238| 50| 50| -4.046307639606464|34.687972375171306|3.5981683352545537|-4.872868892058087|25.16749778959425|
+----------+---+---+-------------------+------------------+-------------

In [35]:
q_features = ['breath_ids', 'R', 'C', 'm_u_in', 'q_u_in', 'm_u_out', 'q_u_out', 'q_pressure']
m = data.select('m_pressure', col('breath_ids').alias('b_id'))
lr_data = df_q.join(m, m.b_id == df_q.breath_ids).select(*df_q.columns, col('m_pressure').alias('label'))
lr_data.show(5)

+----------+---+---+-------------------+------------------+------------------+------------------+-----------------+------------------+
|breath_ids|  R|  C|             m_u_in|            q_u_in|           m_u_out|           q_u_out|       q_pressure|             label|
+----------+---+---+-------------------+------------------+------------------+------------------+-----------------+------------------+
|       148| 50| 10| -0.627586555937512| 3.279187855644647| 3.583992459539968|-4.387057482814503|4.518401548024195| 1.712044550044049|
|       463| 50| 10| -2.730341695467989|11.496348651324867|3.5980943797891185|-4.875283862177274|16.73096260832713| 5.452311677836793|
|       471| 20| 20|-1.1012242363556006|10.255015772333854| 3.589597364806948|-4.547209213742605|7.584112548758221|2.9888939958281715|
|       496| 20| 20| -17.10135369945869| 54.39589825365896|3.5900412844154244| -4.54881661950235|18.64874221118432| 2.781362935991756|
|      1238| 50| 50| -4.046307639606464|34.687972375171

In [36]:
train, test = lr_data.randomSplit([0.8, 0.2], seed=69)

In [37]:
#prepare the data and the pipeline
for_prediction = q_features.copy()
for_prediction.remove('breath_ids')
vectorAssembler = VectorAssembler(inputCols=for_prediction, outputCol="unscaled_features")
standardScaler = StandardScaler(inputCol="unscaled_features", outputCol="features")
stages = [vectorAssembler, standardScaler, tvs]
pipeline = Pipeline(stages=stages)

## Regression for m_pressure

In [38]:
model = pipeline.fit(train)
prediction = model.transform(test)

In [39]:
print(for_prediction)

['R', 'C', 'm_u_in', 'q_u_in', 'm_u_out', 'q_u_out', 'q_pressure']


In [40]:
performance_metrics(prediction)

RMSE: 1.123
MSE: 1.261
MAE: 0.759
r2: 0.814


In [41]:
prediction.show(5)

+----------+---+---+-------------------+-----------------+------------------+------------------+------------------+-------------------+--------------------+--------------------+------------------+
|breath_ids|  R|  C|             m_u_in|           q_u_in|           m_u_out|           q_u_out|        q_pressure|              label|   unscaled_features|            features|        prediction|
+----------+---+---+-------------------+-----------------+------------------+------------------+------------------+-------------------+--------------------+--------------------+------------------+
|       133|  5| 10|  -19.6014207492924|46.98668700460304| 3.594566274579692| -4.71231980483322|17.067438719442084|  4.694679995581807|[5.0,10.0,-19.601...|[0.25558208403815...| 5.814947611896589|
|       177|  5| 50| -23.32346938584597|71.05051858227964|3.5843312532246725|-4.381444835800677|11.073593311822473| 3.0100866207852106|[5.0,50.0,-23.323...|[0.25558208403815...|3.0335666522515696|
|       193| 20

## Output from the second regression technique

In [42]:
m_pressure = model.transform(lr_data)
m_press_predicted = m_pressure.select([col("breath_ids").alias("b_id"), 'prediction'])

In [43]:
df_regressed_3 = data.select(q_features)
df_regressed_3 = df_regressed_3.join(m_press_predicted, m_press_predicted.b_id == df_regressed_3.breath_ids).select(*df_regressed_3.columns, col('prediction').alias('m_pressure'))
df_regressed_3.show(5)

+----------+---+---+-------------------+------------------+------------------+------------------+------------------+------------------+
|breath_ids|  R|  C|             m_u_in|            q_u_in|           m_u_out|           q_u_out|        q_pressure|        m_pressure|
+----------+---+---+-------------------+------------------+------------------+------------------+------------------+------------------+
|       148| 50| 10| -0.627586555937512| 3.279187855644647| 3.583992459539968|-4.387057482814503| 4.787830588906654| 1.712044550044049|
|       463| 50| 10| -2.730341695467989|11.496348651324867|3.5980943797891185|-4.875283862177274|14.032851614737082|6.0633168471857415|
|       471| 20| 20|-1.1012242363556006|10.255015772333854| 3.589597364806948|-4.547209213742605|  7.08478856513968| 2.957190054457414|
|       496| 20| 20| -17.10135369945869| 54.39589825365896|3.5900412844154244| -4.54881661950235| 18.63630522318597| 4.006267324414826|
|      1238| 50| 50| -4.046307639606464|34.68797