In [1]:
import os

import pyspark.sql.functions as fn
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler, Imputer
from pyspark.ml.tuning import CrossValidatorModel
from pyspark.sql.session import SparkSession

import config as c

In [2]:
#Reading dataset
# Reuse the kernel's Spark session when one exists, otherwise start one, so the
# notebook also runs on a fresh kernel that does not predefine `spark`.
spark = SparkSession.builder.getOrCreate()
# Database credentials come from the environment instead of being stored in the
# notebook: MYSQL_USER falls back to "hduser"; MYSQL_PASSWORD must be set
# (a KeyError is raised here otherwise).
jdbc_properties = {"user": os.environ.get("MYSQL_USER", "hduser"),
                   "password": os.environ["MYSQL_PASSWORD"]}
# Load the table "work.data" from the local MySQL server into a DataFrame.
dataset = spark.read.jdbc("jdbc:mysql://localhost:3306", "work.data",
            properties=jdbc_properties)

In [3]:
# Discard every row that contains a null in any column
dataset = dataset.dropna()

In [4]:
# Columns that are left out of the feature vector
# (presumably identifiers plus the 'temperature' label — confirm against training)
nonFeatureCols = ['country', 'temperature', 'year']
# Every remaining column, in the dataset's own column order, becomes a feature
featureCols = list(filter(lambda name: name not in nonFeatureCols, dataset.columns))

In [5]:
# Assembler that packs the feature columns into a single vector column "features"
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")

In [6]:
# Add the assembled "features" column to the cleaned dataset
data_feat = assembler.transform(dataset)
# Preview the first 10 rows of the result
data_feat.show(n=10)

+------------+----+------------------+--------------------------+-----------------------+--------------------------+----------------+------------------+--------------------+------------------+--------------------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+
|     country|year|       temperature|annual_co2_emission_tonnes|annual_co2_emission_gdp|annual_co2_emission_capita|annual_co2_share|     source_others|        source_waste|   source_industry|      source_res_com|    source_transport|source_agriculture|    source_forestry|         source_land|       source_energy|            features|
+------------+----+------------------+--------------------------+-----------------------+--------------------------+----------------+------------------+--------------------+------------------+--------------------+--------------------+------------------+-------------------+--------------------+--------------------+-----------

In [7]:
# Load the saved cross-validated model from the "model" folder under the
# output location given by the config module
model_path = c.output + "/model"
model = CrossValidatorModel.load(model_path)

In [8]:
# Run the loaded model over the assembled rows; transform() returns a new DataFrame
# (presumably with a prediction column appended — confirm against the saved model).
predictions = model.transform(data_feat)

In [9]:
# Database credentials come from the environment instead of being stored in the
# notebook: MYSQL_USER falls back to "hduser"; MYSQL_PASSWORD must be set
# (a KeyError is raised here otherwise).
jdbc_properties = {"user": os.environ.get("MYSQL_USER", "hduser"),
                   "password": os.environ["MYSQL_PASSWORD"]}
# Write the predictions, minus the assembled "features" column, to the MySQL
# table "work.predictions", replacing any existing contents.
predictions.drop('features').write.mode("overwrite").jdbc("jdbc:mysql://localhost:3306","work.predictions",
                    properties=jdbc_properties)

### Future Predictions

In [10]:
#Reading future dataset
# Database credentials come from the environment instead of being stored in the
# notebook: MYSQL_USER falls back to "hduser"; MYSQL_PASSWORD must be set
# (a KeyError is raised here otherwise).
jdbc_properties = {"user": os.environ.get("MYSQL_USER", "hduser"),
                   "password": os.environ["MYSQL_PASSWORD"]}
# Load the table "work.future_data" from the local MySQL server into a DataFrame.
dataset_fut = spark.read.jdbc("jdbc:mysql://localhost:3306", "work.future_data",
            properties=jdbc_properties)

In [11]:
# Columns of the future data that are left out of the feature vector
nonFeatureCols = ['country', 'year']
# Every remaining column, in dataset_fut's own column order, becomes a feature.
# NOTE(review): the order comes from dataset_fut.columns — confirm it matches the
# feature order the model was trained with.
featureCols = list(filter(lambda name: name not in nonFeatureCols, dataset_fut.columns))

In [12]:
# Rebuild the assembler for the future data's feature columns; output column is "features"
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")

In [13]:
# Add the assembled "features" vector column to the future dataset
data_feat1 = assembler.transform(dataset_fut)

In [14]:
# Run the loaded model over the assembled future rows; transform() returns a new DataFrame
# (presumably with a prediction column appended — confirm against the saved model).
predictions1 = model.transform(data_feat1)

In [15]:
# Database credentials come from the environment instead of being stored in the
# notebook: MYSQL_USER falls back to "hduser"; MYSQL_PASSWORD must be set
# (a KeyError is raised here otherwise).
jdbc_properties = {"user": os.environ.get("MYSQL_USER", "hduser"),
                   "password": os.environ["MYSQL_PASSWORD"]}
# Write the future predictions, minus the assembled "features" column, to the
# MySQL table "work.predictions_future", replacing any existing contents.
predictions1.drop('features').write.mode("overwrite").jdbc("jdbc:mysql://localhost:3306","work.predictions_future",
                    properties=jdbc_properties)