In [1]:
from pyspark.sql.session import SparkSession 
from pyspark.ml import Pipeline 
from pyspark.ml.regression import LinearRegression 
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel
import pyspark.sql.functions as fn
from pyspark.ml.feature import Imputer
import config as c

In [2]:
#Reading dataset
data = spark.read.csv(c.input+"/dataset", header=True, inferSchema=True)

In [3]:
data = data.drop("source_bunkers")

In [4]:
#Rounding all columns
for col in data.drop("country").columns:
    data = data.withColumn(col,fn.round(col,1))

In [5]:
#Getting feature columns
nonFeatureCols = ['country','temperature','year']
featureCols = [item for item in data.columns if item not in nonFeatureCols]

In [6]:
#Imputing nulls
imputer = Imputer(
            inputCols = featureCols,
            outputCols = featureCols)

In [7]:
data_imputed = imputer.fit(data).transform(data)

In [8]:
data_imputed.show()

+--------------------+----+-----------+--------------------------+-----------------------+--------------------------+----------------+-------------+-------------+---------------+--------------+----------------+------------------+---------------+-------------+--------------+
|             country|year|temperature|annual_co2_emission_tonnes|annual_co2_emission_gdp|annual_co2_emission_capita|annual_co2_share|source_others| source_waste|source_industry|source_res_com|source_transport|source_agriculture|source_forestry|  source_land| source_energy|
+--------------------+----+-----------+--------------------------+-----------------------+--------------------------+----------------+-------------+-------------+---------------+--------------+----------------+------------------+---------------+-------------+--------------+
|             Algeria|2002|       24.3|              8.94067212E7|                    0.3|                       2.8|             0.4|     352842.1|    4845611.2|      6755974

In [9]:
#Getting assembler to generate feature column
assembler = (VectorAssembler()
           .setInputCols(featureCols)
           .setOutputCol("features"))

In [10]:
data_feat = assembler.transform(data_imputed)

In [11]:
#Inintialing linear regression model
linearModel = (LinearRegression()
              .setLabelCol("temperature") 
              .setFeaturesCol("features") 
              .setPredictionCol("prediction")) 

In [12]:
#Generating a pipelined model
# pipelineModel = Pipeline(stages=[assembler,linearModel])

In [13]:
#Making a parameter Grid for Hyper Parameter tuning
paramGrid = (ParamGridBuilder()
            .addGrid(linearModel.maxIter, [5,10,100,500,1000])
            .addGrid(linearModel.regParam, [0.01,0.1,1.0,10.0])
            .addGrid(linearModel.elasticNetParam,[0.1,0.5,1.0])
            .build())

In [14]:
#Initialization of parameterized model
crossValidator = (CrossValidator() #building our model
                 .setEstimator(linearModel) #Model we will be using
                 .setEstimatorParamMaps(paramGrid) #Hyper parameters specified
                 .setEvaluator(RegressionEvaluator().setLabelCol("temperature"))) #prediction

In [15]:
#Splitting training and test dataset
training, test = data_feat.drop('country','year').randomSplit([0.8,0.2])

In [16]:
model = crossValidator.fit(training) #Running the creation of model

In [17]:
predictions = model.transform(test) #Get the predictions

In [18]:
rmse = (RegressionEvaluator()
                 .setPredictionCol("prediction") #prediction column
                 .setLabelCol("temperature") #column which we wanted to predict
                 .setMetricName("rmse")) #Root mean square error

In [19]:
rmse_value = rmse.evaluate(predictions)

In [20]:
r2 = (RegressionEvaluator()
                 .setPredictionCol("prediction")
                 .setLabelCol("temperature")
                 .setMetricName("r2"))

In [21]:
r2_value = r2.evaluate(predictions)

In [22]:
print("Root mean square error value : ",rmse_value)
print("R square value : ",r2_value)

# Root mean square error value :  7.805393534253261
# R square value :  0.18991747249989088

Root mean square error value :  7.5889764052314375
R square value :  0.21688153517745956


In [23]:
model.write().overwrite().save(c.output+"/model")