In [1]:
import os
from getpass import getpass

import pyspark.sql.functions as fn
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql.session import SparkSession

import config as c

In [2]:
# Obtain the active SparkSession, or create one, so the notebook also runs on a
# fresh kernel where `spark` has not been pre-injected by the environment.
spark = SparkSession.builder.getOrCreate()

# Load the raw CSV found under the configured input directory. The first line is
# treated as the header; no schema inference is requested, so numeric columns
# are cast explicitly further down.
dataset = spark.read.csv(c.input+"/dataset",header=True)

In [3]:
# Discard the "source_bunkers" column and turn "year" into an integer column.
dataset = (
    dataset
    .drop("source_bunkers")
    .withColumn("year", fn.col("year").cast("Integer"))
)

In [4]:
# Every column except these three is later used as a regression label
# (one model per entry of `cols`).
non_label_columns = ("country", "year", "temperature")
cols = dataset.drop(*non_label_columns).columns

In [5]:
# Keep only rows that have no null in any column.
dataset = dataset.dropna()

In [6]:
# Map each country name to a numeric index in "indexed_country".
stringIndexer = (StringIndexer()
                 .setInputCol("country")
                 .setOutputCol("indexed_country"))

# Learn the country -> index mapping from the data, then apply it to the same rows.
indexedModel = stringIndexer.fit(dataset)
indexed = indexedModel.transform(dataset)

In [7]:
# One-hot encode the country index into the vector column "country_coded".
encoder = OneHotEncoder(inputCol="indexed_country",outputCol="country_coded")

# On Spark >= 3.0 OneHotEncoder is an Estimator and has to be fitted before it
# can transform; on Spark 2.x it is a plain Transformer without fit().
# Support both so the notebook does not break on a Spark upgrade.
encoderModel = encoder.fit(indexed) if hasattr(encoder, "fit") else encoder
encoded = encoderModel.transform(indexed)

In [8]:
# Combine the year and the one-hot encoded country into a single "features" vector.
assembler = VectorAssembler(
    inputCols=["year", "country_coded"],
    outputCol="features",
)

In [9]:
# Add the "features" vector column (year + one-hot country) to the encoded rows.
data_feat = assembler.transform(encoded)

In [10]:
# Base frame for the forecasts: shift every year ten years ahead, keep only the
# shifted years after 2013, and retain just the (country, year) key columns.
data_final = (
    data_feat
    .withColumn("year", fn.col("year") + 10)
    .filter("year > 2013")
    .select("country", "year")
)

In [11]:
# Forecast every label column ten years ahead, one linear regression per column.
#
# The rows to predict on are the historical rows with "year" shifted by 10
# (keeping shifted years after 2013). Their feature vector must be rebuilt from
# the shifted year: the existing "features" column still encodes the original
# year, so predicting on it would only reproduce in-sample fitted values under
# a later year label. The existing "country_coded" column is reused as-is so
# the one-hot positions match the ones the models are trained on.
future_rows = (
    data_feat
    .drop("features")  # stale vector, built from the un-shifted year
    .withColumn("year", fn.col("year") + 10)
    .where("year > 2013")
)
future_feat = (
    assembler
    .transform(future_rows)
    .select("country", "year", "features")
)

# Each fitted model appends its own "<label>_pred" column to the same frame.
# This avoids one join per label and makes the cell safe to re-run, because the
# result is rebuilt from scratch every time.
predictions = future_feat

for col in cols:
    linearModel = (LinearRegression()
                   .setLabelCol(col)
                   .setFeaturesCol("features")
                   .setPredictionCol(col + "_pred"))

    # The label has to be numeric for the regression, so cast it to double.
    training = data_feat.select(
        "country", "year", "features",
        fn.col(col).cast("Double").alias(col),
    )

    model = linearModel.fit(training)

    predictions = model.transform(predictions)

# Final layout: country, year, then one "<label>_pred" column per label.
data_final = predictions.drop("features")

In [12]:
# Write the forecasts to the MySQL table work.future_data in "overwrite" mode.
#
# Credentials are not stored in the notebook: the user comes from MYSQL_USER
# (defaulting to "hduser"), and the password from MYSQL_PASSWORD or, when that
# is unset, from an interactive prompt.
jdbc_properties = {
    "user": os.environ.get("MYSQL_USER", "hduser"),
    "password": os.environ.get("MYSQL_PASSWORD") or getpass("MySQL password: "),
}

data_final.write.mode("overwrite").jdbc(
    "jdbc:mysql://localhost:3306",
    "work.future_data",
    properties=jdbc_properties,
)