In [1]:
# importing spark library to start the session
from pyspark.sql import SparkSession
import random

In [2]:
#Building spark session to start it
spark = SparkSession.builder.appName("project").getOrCreate()

In [3]:
# Load the data set
data = spark.read.csv("project.csv",inferSchema=True,header=True)
data.show()

+-----------+------+---------+------------------+---------+----+-----+-----------+------------+------+-------------+---------+--------+----------------+
|    Country|mkt_id|   Market|    CommodityGroup|Commodity|Year|Month| PriceTrend|        PEWI|  ALPS|UnitOfMeasure|PriceType|Currency|              sn|
+-----------+------+---------+------------------+---------+----+-----+-----------+------------+------+-------------+---------+--------+----------------+
|Afghanistan|   266| Fayzabad|cereals and tubers|    Wheat|2003|    1|5.686562538|  1.60303044| Alert|           KG|   Retail|     AFN|1_266_84_15_5_87|
|Afghanistan|   267|    Mazar|cereals and tubers|    Wheat|2003|    1|5.714583397|  2.71366787|Crisis|           KG|   Retail|     AFN|1_267_84_15_5_87|
|Afghanistan|   270|    Hirat|cereals and tubers|    Wheat|2003|    1|5.758124828| 0.357086003|Stress|           KG|   Retail|     AFN|1_270_84_15_5_87|
|Afghanistan|   271|    Kabul|cereals and tubers|    Wheat|2003|    1|6.407604218|

In [4]:
#Schema of attributes
data.printSchema()

root
 |-- Country: string (nullable = true)
 |-- mkt_id: integer (nullable = true)
 |-- Market: string (nullable = true)
 |-- CommodityGroup: string (nullable = true)
 |-- Commodity: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- PriceTrend: double (nullable = true)
 |-- PEWI: double (nullable = true)
 |-- ALPS: string (nullable = true)
 |-- UnitOfMeasure: string (nullable = true)
 |-- PriceType: string (nullable = true)
 |-- Currency: string (nullable = true)
 |-- sn: string (nullable = true)



In [5]:
#Schema of attributes
data.columns

['Country',
 'mkt_id',
 'Market',
 'CommodityGroup',
 'Commodity',
 'Year',
 'Month',
 'PriceTrend',
 'PEWI',
 'ALPS',
 'UnitOfMeasure',
 'PriceType',
 'Currency',
 'sn']

In [6]:
#Checking the missing values in all the attributes
from pyspark.sql.functions import *
from pyspark.sql.functions import when, count, col
df = data.select(['Country',
 'mkt_id',
 'Market',
 'CommodityGroup',
 'Commodity',
 'Year',
 'Month',
 'PriceTrend',
 'PEWI',
 'ALPS',
 'UnitOfMeasure',
 'PriceType',
 'Currency',
 'sn'])
df.select([count(when(isnull(c), c)).alias(c) for c in df.columns]).show()

+-------+------+------+--------------+---------+----+-----+----------+----+----+-------------+---------+--------+---+
|Country|mkt_id|Market|CommodityGroup|Commodity|Year|Month|PriceTrend|PEWI|ALPS|UnitOfMeasure|PriceType|Currency| sn|
+-------+------+------+--------------+---------+----+-----+----------+----+----+-------------+---------+--------+---+
|      0|     0|     0|             0|        0|   0|    0|         0|   0|   0|            0|        0|       0|  0|
+-------+------+------+--------------+---------+----+-----+----------+----+----+-------------+---------+--------+---+



In [7]:
#Dropping of Somalia country as it has a very volatile currency and it's affecting the entire prediction. 
data=data.where(data.Country!="Somalia")
data=data.where(data.Year!="1990")
data=data.where(data.Year!="1991")
data=data.where(data.Year!="1992")
data=data.where(data.Year!="1993")
data=data.where(data.Year!="1994")
data=data.where(data.Year!="1995")
data=data.where(data.Year!="1996")
data=data.where(data.Year!="1997")
data=data.where(data.Year!="1998")
data=data.where(data.Year!="1999")
data=data.where(data.Year!="2000")
data=data.where(data.Year!="2001")
data=data.where(data.Year!="2002")
data.show()

+-----------+------+---------+------------------+---------+----+-----+-----------+------------+------+-------------+---------+--------+----------------+
|    Country|mkt_id|   Market|    CommodityGroup|Commodity|Year|Month| PriceTrend|        PEWI|  ALPS|UnitOfMeasure|PriceType|Currency|              sn|
+-----------+------+---------+------------------+---------+----+-----+-----------+------------+------+-------------+---------+--------+----------------+
|Afghanistan|   266| Fayzabad|cereals and tubers|    Wheat|2003|    1|5.686562538|  1.60303044| Alert|           KG|   Retail|     AFN|1_266_84_15_5_87|
|Afghanistan|   267|    Mazar|cereals and tubers|    Wheat|2003|    1|5.714583397|  2.71366787|Crisis|           KG|   Retail|     AFN|1_267_84_15_5_87|
|Afghanistan|   270|    Hirat|cereals and tubers|    Wheat|2003|    1|5.758124828| 0.357086003|Stress|           KG|   Retail|     AFN|1_270_84_15_5_87|
|Afghanistan|   271|    Kabul|cereals and tubers|    Wheat|2003|    1|6.407604218|

In [8]:
#Removing negative value in our dependent variable PriceTrend
import pyspark.sql.functions as F

data = data.withColumn("only_positive_price_trend", F.when(F.col("PriceTrend") > 0, F.col("PriceTrend")).otherwise(0))

In [9]:
data.show()

+-----------+------+---------+------------------+---------+----+-----+-----------+------------+------+-------------+---------+--------+----------------+-------------------------+
|    Country|mkt_id|   Market|    CommodityGroup|Commodity|Year|Month| PriceTrend|        PEWI|  ALPS|UnitOfMeasure|PriceType|Currency|              sn|only_positive_price_trend|
+-----------+------+---------+------------------+---------+----+-----+-----------+------------+------+-------------+---------+--------+----------------+-------------------------+
|Afghanistan|   266| Fayzabad|cereals and tubers|    Wheat|2003|    1|5.686562538|  1.60303044| Alert|           KG|   Retail|     AFN|1_266_84_15_5_87|              5.686562538|
|Afghanistan|   267|    Mazar|cereals and tubers|    Wheat|2003|    1|5.714583397|  2.71366787|Crisis|           KG|   Retail|     AFN|1_267_84_15_5_87|              5.714583397|
|Afghanistan|   270|    Hirat|cereals and tubers|    Wheat|2003|    1|5.758124828| 0.357086003|Stress|   

In [10]:
# dimensions of the dataframe
print("Number of Rows: ",data.count() ,"   Number of Columns: ", len(data.columns))

Number of Rows:  461752    Number of Columns:  15


In [11]:
#All the PriceTrend are in the local currencies. Therefore we need to convert them into a single currency i.e. USD. Using the fixer.io API to scrape the latest rates of different currencies
import requests

url = 'http://data.fixer.io/api/latest?access_key=f874f54530d940862712d9c04ba6be4f&base=USD'
response = requests.get(url)
response=response.json()
print(response)

{'success': True, 'timestamp': 1606956245, 'base': 'USD', 'date': '2020-12-03', 'rates': {'AED': 3.6732, 'AFN': 76.950092, 'ALL': 102.28207, 'AMD': 509.140333, 'ANG': 1.795836, 'AOA': 654.816971, 'ARS': 81.498497, 'AUD': 1.350755, 'AWG': 1.8, 'AZN': 1.698212, 'BAM': 1.623486, 'BBD': 2.020106, 'BDT': 84.83476, 'BGN': 1.614595, 'BHD': 0.377059, 'BIF': 1945, 'BMD': 1, 'BND': 1.340766, 'BOB': 6.898192, 'BRL': 5.217601, 'BSD': 0.999959, 'BTC': 5.2093968e-05, 'BTN': 73.817916, 'BWP': 11.085856, 'BYN': 2.593573, 'BYR': 19600, 'BZD': 2.016669, 'CAD': 1.29235, 'CDF': 1969.999538, 'CHF': 0.895195, 'CLF': 0.027398, 'CLP': 756.000255, 'CNY': 6.5634, 'COP': 3522.02, 'CRC': 604.120836, 'CUC': 1, 'CUP': 26.5, 'CVE': 91.374968, 'CZK': 21.797203, 'DJF': 177.720042, 'DKK': 6.14825, 'DOP': 58.30433, 'DZD': 129.164977, 'EGP': 15.661296, 'ERN': 15.000224, 'ETB': 38.250119, 'EUR': 0.825955, 'FJD': 2.069199, 'FKP': 0.748609, 'GBP': 0.748615, 'GEL': 3.325, 'GGP': 0.748609, 'GHS': 5.844959, 'GIP': 0.748609, 'G

In [12]:
# importing the currency value of each currency in United States Dollar (USD)
rate = spark.read.csv("currency_rate_USD.csv",inferSchema=True,header=True)
rate.show()

+--------+----------------------------+
|Currency|Rate ( 1USD = Currency Rate)|
+--------+----------------------------+
|     AED|                    3.673042|
|     AFN|                   77.000368|
|     ALL|                  103.650403|
|     AMD|                  508.210403|
|     ANG|                    1.794919|
|     AOA|                  652.645041|
|     ARS|                    80.99534|
|     AUD|                     1.35373|
|     AWG|                         1.8|
|     AZN|                     1.70397|
|     BAM|                     1.64037|
|     BBD|                    2.019022|
|     BDT|                   84.792881|
|     BGN|                    1.635255|
|     BHD|                    0.377056|
|     BIF|                      1942.0|
|     BMD|                         1.0|
|     BND|                     1.33841|
|     BOB|                    6.904832|
|     BRL|                    5.344104|
+--------+----------------------------+
only showing top 20 rows



In [13]:
#Converting United States Dollar (USD) in Local Currency i.e. 1 Currency Rate = (1/Currency Rate)USD
import pyspark.sql.functions as F

rate = rate.withColumn("rate_USD_in_local_currency", 1/F.col("Rate ( 1USD = Currency Rate)"))
rate.show()

+--------+----------------------------+--------------------------+
|Currency|Rate ( 1USD = Currency Rate)|rate_USD_in_local_currency|
+--------+----------------------------+--------------------------+
|     AED|                    3.673042|       0.27225389745066897|
|     AFN|                   77.000368|      0.012986950919507296|
|     ALL|                  103.650403|       0.00964781584110194|
|     AMD|                  508.210403|      0.001967688961298...|
|     ANG|                    1.794919|        0.5571282046710743|
|     AOA|                  652.645041|      0.001532226458761...|
|     ARS|                    80.99534|      0.012346389310792448|
|     AUD|                     1.35373|         0.738699740716391|
|     AWG|                         1.8|        0.5555555555555556|
|     AZN|                     1.70397|        0.5868647922205203|
|     BAM|                     1.64037|        0.6096185616659656|
|     BBD|                    2.019022|        0.4952893034350

In [14]:
#Joining the data table and the rate conversion table by defining respective variable
ta = data.alias('ta')
tb = rate.alias('tb')

In [15]:
#using join to join the two tables
data = ta.join(tb, ta.Currency == tb.Currency)
data.show()

+-----------+------+---------+------------------+---------+----+-----+-----------+------------+------+-------------+---------+--------+----------------+-------------------------+--------+----------------------------+--------------------------+
|    Country|mkt_id|   Market|    CommodityGroup|Commodity|Year|Month| PriceTrend|        PEWI|  ALPS|UnitOfMeasure|PriceType|Currency|              sn|only_positive_price_trend|Currency|Rate ( 1USD = Currency Rate)|rate_USD_in_local_currency|
+-----------+------+---------+------------------+---------+----+-----+-----------+------------+------+-------------+---------+--------+----------------+-------------------------+--------+----------------------------+--------------------------+
|Afghanistan|   266| Fayzabad|cereals and tubers|    Wheat|2003|    1|5.686562538|  1.60303044| Alert|           KG|   Retail|     AFN|1_266_84_15_5_87|              5.686562538|     AFN|                   77.000368|      0.012986950919507296|
|Afghanistan|   267|    

In [16]:
data.printSchema()

root
 |-- Country: string (nullable = true)
 |-- mkt_id: integer (nullable = true)
 |-- Market: string (nullable = true)
 |-- CommodityGroup: string (nullable = true)
 |-- Commodity: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- PriceTrend: double (nullable = true)
 |-- PEWI: double (nullable = true)
 |-- ALPS: string (nullable = true)
 |-- UnitOfMeasure: string (nullable = true)
 |-- PriceType: string (nullable = true)
 |-- Currency: string (nullable = true)
 |-- sn: string (nullable = true)
 |-- only_positive_price_trend: double (nullable = true)
 |-- Currency: string (nullable = true)
 |-- Rate ( 1USD = Currency Rate): double (nullable = true)
 |-- rate_USD_in_local_currency: double (nullable = true)



In [17]:
#Creating a new column to convert the existing attribute PriceTrend in USD
data = data.withColumn("price_trend_USD", F.col("only_positive_price_trend") * F.col("rate_USD_in_local_currency"))
data.show()

+-----------+------+---------+------------------+---------+----+-----+-----------+------------+------+-------------+---------+--------+----------------+-------------------------+--------+----------------------------+--------------------------+-------------------+
|    Country|mkt_id|   Market|    CommodityGroup|Commodity|Year|Month| PriceTrend|        PEWI|  ALPS|UnitOfMeasure|PriceType|Currency|              sn|only_positive_price_trend|Currency|Rate ( 1USD = Currency Rate)|rate_USD_in_local_currency|    price_trend_USD|
+-----------+------+---------+------------------+---------+----+-----+-----------+------------+------+-------------+---------+--------+----------------+-------------------------+--------+----------------------------+--------------------------+-------------------+
|Afghanistan|   266| Fayzabad|cereals and tubers|    Wheat|2003|    1|5.686562538|  1.60303044| Alert|           KG|   Retail|     AFN|1_266_84_15_5_87|              5.686562538|     AFN|                   77

In [18]:
#Dropping the columns which are of no use now
columns_to_drop = ['UnitOfMeasure','Currency','sn','PriceTrend','only_positive_price_trend','Rate ( 1USD = Currency Rate)','rate_USD_in_local_currency']
data = data.drop(*columns_to_drop)
data.show()

+-----------+------+---------+------------------+---------+----+-----+------------+------+---------+-------------------+
|    Country|mkt_id|   Market|    CommodityGroup|Commodity|Year|Month|        PEWI|  ALPS|PriceType|    price_trend_USD|
+-----------+------+---------+------------------+---------+----+-----+------------+------+---------+-------------------+
|Afghanistan|   266| Fayzabad|cereals and tubers|    Wheat|2003|    1|  1.60303044| Alert|   Retail|0.07385110858171484|
|Afghanistan|   267|    Mazar|cereals and tubers|    Wheat|2003|    1|  2.71366787|Crisis|   Retail|0.07421501410227027|
|Afghanistan|   270|    Hirat|cereals and tubers|    Wheat|2003|    1| 0.357086003|Stress|   Retail|0.07478048452963239|
|Afghanistan|   271|    Kabul|cereals and tubers|    Wheat|2003|    1| 0.716033936|Stress|   Retail|0.08321524149079393|
|Afghanistan|   272| Kandahar|cereals and tubers|    Wheat|2003|    1|-0.842860222|Normal|   Retail|0.10809607677459414|
|Afghanistan|   273|Jalalabad|ce

In [19]:
#Statistical data description
data.describe().show()

+-------+-----------+-----------------+-------+--------------------+-------------+-----------------+------------------+--------------------+------+---------+------------------+
|summary|    Country|           mkt_id| Market|      CommodityGroup|    Commodity|             Year|             Month|                PEWI|  ALPS|PriceType|   price_trend_USD|
+-------+-----------+-----------------+-------+--------------------+-------------+-----------------+------------------+--------------------+------+---------+------------------+
|  count|     461752|           461752| 461752|              461752|       461752|           461752|            461752|              461752|461752|   461752|            461752|
|   mean|       null|886.2232973544241|   null|                null|         null|2013.347836067846|6.4094665534745925|-0.01090965834247...|  null|     null| 5.476171841697306|
| stddev|       null|658.1508771875784|   null|                null|         null|4.023248752925489|3.4480120528264

In [20]:
#Building pipeline and converting Categorical Variables to Numerical Variables
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(data) for column in list(set(data.columns)-set(['Country','mkt_id','Market','Commodity','Year','Month','PEWI','price_trend_USD'])) ]


pipeline = Pipeline(stages=indexers)
data = pipeline.fit(data).transform(data)

data.show()

+-----------+------+---------+------------------+---------+----+-----+------------+------+---------+-------------------+----------+--------------------+---------------+
|    Country|mkt_id|   Market|    CommodityGroup|Commodity|Year|Month|        PEWI|  ALPS|PriceType|    price_trend_USD|ALPS_index|CommodityGroup_index|PriceType_index|
+-----------+------+---------+------------------+---------+----+-----+------------+------+---------+-------------------+----------+--------------------+---------------+
|Afghanistan|   266| Fayzabad|cereals and tubers|    Wheat|2003|    1|  1.60303044| Alert|   Retail|0.07385110858171484|       2.0|                 0.0|            0.0|
|Afghanistan|   267|    Mazar|cereals and tubers|    Wheat|2003|    1|  2.71366787|Crisis|   Retail|0.07421501410227027|       3.0|                 0.0|            0.0|
|Afghanistan|   270|    Hirat|cereals and tubers|    Wheat|2003|    1| 0.357086003|Stress|   Retail|0.07478048452963239|       1.0|                 0.0|   

In [21]:
data.describe().show()

+-------+-----------+-----------------+-------+--------------------+-------------+-----------------+------------------+--------------------+------+---------+------------------+------------------+--------------------+-------------------+
|summary|    Country|           mkt_id| Market|      CommodityGroup|    Commodity|             Year|             Month|                PEWI|  ALPS|PriceType|   price_trend_USD|        ALPS_index|CommodityGroup_index|    PriceType_index|
+-------+-----------+-----------------+-------+--------------------+-------------+-----------------+------------------+--------------------+------+---------+------------------+------------------+--------------------+-------------------+
|  count|     461752|           461752| 461752|              461752|       461752|           461752|            461752|              461752|461752|   461752|            461752|            461752|              461752|             461752|
|   mean|       null|886.2232973544241|   null|     

In [22]:
#dropping of columns to prepare for predictions
columns_to_drop = ['Commodity', 'PriceTrend','ALPS','PriceType','Month','id','Rate ( 1USD = Currency Rate)', 'rate_USD_in_local_currency']
data = data.drop(*columns_to_drop)
data.show()

+-----------+------+---------+------------------+----+------------+-------------------+----------+--------------------+---------------+
|    Country|mkt_id|   Market|    CommodityGroup|Year|        PEWI|    price_trend_USD|ALPS_index|CommodityGroup_index|PriceType_index|
+-----------+------+---------+------------------+----+------------+-------------------+----------+--------------------+---------------+
|Afghanistan|   266| Fayzabad|cereals and tubers|2003|  1.60303044|0.07385110858171484|       2.0|                 0.0|            0.0|
|Afghanistan|   267|    Mazar|cereals and tubers|2003|  2.71366787|0.07421501410227027|       3.0|                 0.0|            0.0|
|Afghanistan|   270|    Hirat|cereals and tubers|2003| 0.357086003|0.07478048452963239|       1.0|                 0.0|            0.0|
|Afghanistan|   271|    Kabul|cereals and tubers|2003| 0.716033936|0.08321524149079393|       1.0|                 0.0|            0.0|
|Afghanistan|   272| Kandahar|cereals and tubers

In [23]:
data.describe().show()

+-------+-----------+-----------------+-------+--------------------+-----------------+--------------------+------------------+------------------+--------------------+-------------------+
|summary|    Country|           mkt_id| Market|      CommodityGroup|             Year|                PEWI|   price_trend_USD|        ALPS_index|CommodityGroup_index|    PriceType_index|
+-------+-----------+-----------------+-------+--------------------+-----------------+--------------------+------------------+------------------+--------------------+-------------------+
|  count|     461752|           461752| 461752|              461752|           461752|              461752|            461752|            461752|              461752|             461752|
|   mean|       null|886.2232973544241|   null|                null|2013.347836067846|-0.01090965834247...| 5.476171841697306|0.5977884232228555|  0.3688971569154005|0.11431244477555051|
| stddev|       null|658.1508771875784|   null|                nu

## Assembling the attributes

In [24]:
#importing the assembly libraries
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [25]:
#Checking the columns
data.columns

['Country',
 'mkt_id',
 'Market',
 'CommodityGroup',
 'Year',
 'PEWI',
 'price_trend_USD',
 'ALPS_index',
 'CommodityGroup_index',
 'PriceType_index']

In [26]:
#Creating a features attribute to assemble all input columns into one features column
assembler = VectorAssembler(
    inputCols=['mkt_id','PEWI','Year','CommodityGroup_index',
 'ALPS_index',
 'PriceType_index'],
    outputCol="features")
data = assembler.transform(data)

In [27]:
data.describe().show()

+-------+-----------+-----------------+-------+--------------------+-----------------+--------------------+------------------+------------------+--------------------+-------------------+
|summary|    Country|           mkt_id| Market|      CommodityGroup|             Year|                PEWI|   price_trend_USD|        ALPS_index|CommodityGroup_index|    PriceType_index|
+-------+-----------+-----------------+-------+--------------------+-----------------+--------------------+------------------+------------------+--------------------+-------------------+
|  count|     461752|           461752| 461752|              461752|           461752|              461752|            461752|            461752|              461752|             461752|
|   mean|       null|886.2232973544241|   null|                null|2013.347836067846|-0.01090965834247...| 5.476171841697306|0.5977884232228555|  0.3688971569154005|0.11431244477555051|
| stddev|       null|658.1508771875784|   null|                nu

### Splitting the train and test data

In [28]:
#Splitting training and testing dataset
train_set, test_set = data.randomSplit([0.7,.3])

## Linear Regression

In [29]:
#importing the machine learning package
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [30]:
# Fit training data of linear regression model
lreg_model = LinearRegression(featuresCol='features', labelCol='price_trend_USD', predictionCol='prediction')
fitlreg = lreg_model.fit(train_set)

In [31]:
# Print the equation of the developed linear regression
print("Coefficients: {} Intercept: {}".format(fitlreg.coefficients,fitlreg.intercept))

Coefficients: [-0.0012770436457039435,-0.6766638419299567,0.4371965668207353,0.6410056762714059,0.6069219013574398,40.54904502387377] Intercept: -878.8609573554515


In [32]:
#fitting testing data of linear regression model
test_pred = fitlreg.evaluate(test_set)
test_pred.residuals.show()

+------------------+
|         residuals|
+------------------+
| 3.140759872718061|
|3.4848015345195282|
|3.7373020803687993|
|2.9237879260370057|
| 2.741282524992972|
|1.7012074481418415|
| 2.802037666889345|
|1.4648636285694634|
|1.8431994362748476|
|1.8687029156284447|
| 2.196117237552765|
| 2.460006698790107|
|1.8138226159478945|
|1.9067993914453374|
|1.7649995746695208|
| 2.233659338904829|
|1.9987396587916892|
| 2.325134210154372|
|1.5676399811807966|
|1.5311583101591721|
+------------------+
only showing top 20 rows



In [33]:
#Checking the prediction of the model on test dataset
test_prediction = fitlreg.transform(test_set)
test_prediction.show()

+-----------+------+--------+------------------+----+------------+-------------------+----------+--------------------+---------------+--------------------+-------------------+
|    Country|mkt_id|  Market|    CommodityGroup|Year|        PEWI|    price_trend_USD|ALPS_index|CommodityGroup_index|PriceType_index|            features|         prediction|
+-----------+------+--------+------------------+----+------------+-------------------+----------+--------------------+---------------+--------------------+-------------------+
|Afghanistan|   266|Fayzabad|cereals and tubers|2003|-0.673173606| 0.1003444879640056|       0.0|                 0.0|            0.0|[266.0,-0.6731736...|-3.0404153847540556|
|Afghanistan|   266|Fayzabad|cereals and tubers|2003|-0.148812696|0.08957008184688157|       0.0|                 0.0|            0.0|[266.0,-0.1488126...| -3.395231452672647|
|Afghanistan|   266|Fayzabad|cereals and tubers|2003| 0.225916818|0.08850471506837475|       0.0|                 0.0|  

In [34]:
test_prediction.select("Country", "market", "Year", "CommodityGroup", "features", "Price_trend_USD", "prediction").show()

+-----------+--------+----+------------------+--------------------+-------------------+-------------------+
|    Country|  market|Year|    CommodityGroup|            features|    Price_trend_USD|         prediction|
+-----------+--------+----+------------------+--------------------+-------------------+-------------------+
|Afghanistan|Fayzabad|2003|cereals and tubers|[266.0,-0.6731736...| 0.1003444879640056|-3.0404153847540556|
|Afghanistan|Fayzabad|2003|cereals and tubers|[266.0,-0.1488126...|0.08957008184688157| -3.395231452672647|
|Afghanistan|Fayzabad|2003|cereals and tubers|[266.0,0.22591681...|0.08850471506837475|-3.6487973653004246|
|Afghanistan|Fayzabad|2004|cereals and tubers|[266.0,1.40152335...|0.13054049637788745| -2.793247429659118|
|Afghanistan|Fayzabad|2004|cereals and tubers|[266.0,2.02803874...| 0.1310166872189494|-2.6102658377740227|
|Afghanistan|Fayzabad|2005|cereals and tubers|[266.0,-1.6009962...|0.16300923600780715|-1.5381982121340343|
|Afghanistan|Fayzabad|2005|c

In [35]:
#Creating a evaluator to calcuate Root Mean squared Error (RMSE) value using regression evaluator
evaluator = RegressionEvaluator(
    labelCol="price_trend_USD", predictionCol="prediction", metricName="rmse")

In [36]:
rmse = evaluator.evaluate(test_prediction)

In [37]:
#Printing RMSE value
print("Root Mean Squared Error (RMSE) for linear regression on test data = %s" % rmse)

Root Mean Squared Error (RMSE) for linear regression on test data = 39.222982019544254


In [38]:
print("Normalized Root Mean Squared Error for Linear Regresssor =", rmse/5.48)

Normalized Root Mean Squared Error for Linear Regresssor = 7.157478470719754


#Decision Tree Regression

In [39]:
#importing ml packages for decision tree regressor
from pyspark.ml.regression import DecisionTreeRegressor

In [40]:
#Running Decision Tree Regression model
dtree_model = DecisionTreeRegressor(featuresCol='features', labelCol='price_trend_USD', predictionCol='prediction')

In [41]:
#fitting decision tree regression model on training dataset
fitdtree = dtree_model.fit(train_set)

In [42]:
#transforming the fit training dataset on the testing dataset
test_prediction = fitdtree.transform(test_set)
test_prediction.show()

+-----------+------+--------+------------------+----+------------+-------------------+----------+--------------------+---------------+--------------------+-------------------+
|    Country|mkt_id|  Market|    CommodityGroup|Year|        PEWI|    price_trend_USD|ALPS_index|CommodityGroup_index|PriceType_index|            features|         prediction|
+-----------+------+--------+------------------+----+------------+-------------------+----------+--------------------+---------------+--------------------+-------------------+
|Afghanistan|   266|Fayzabad|cereals and tubers|2003|-0.673173606| 0.1003444879640056|       0.0|                 0.0|            0.0|[266.0,-0.6731736...|0.36005383321149653|
|Afghanistan|   266|Fayzabad|cereals and tubers|2003|-0.148812696|0.08957008184688157|       0.0|                 0.0|            0.0|[266.0,-0.1488126...|0.36005383321149653|
|Afghanistan|   266|Fayzabad|cereals and tubers|2003| 0.225916818|0.08850471506837475|       0.0|                 0.0|  

In [43]:
#Checking the prediction value on the test dataset
test_prediction.select("Country", "market", "Year", "CommodityGroup", "features", "Price_trend_USD", "prediction").show()

+-----------+--------+----+------------------+--------------------+-------------------+-------------------+
|    Country|  market|Year|    CommodityGroup|            features|    Price_trend_USD|         prediction|
+-----------+--------+----+------------------+--------------------+-------------------+-------------------+
|Afghanistan|Fayzabad|2003|cereals and tubers|[266.0,-0.6731736...| 0.1003444879640056|0.36005383321149653|
|Afghanistan|Fayzabad|2003|cereals and tubers|[266.0,-0.1488126...|0.08957008184688157|0.36005383321149653|
|Afghanistan|Fayzabad|2003|cereals and tubers|[266.0,0.22591681...|0.08850471506837475|0.36005383321149653|
|Afghanistan|Fayzabad|2004|cereals and tubers|[266.0,1.40152335...|0.13054049637788745|0.36005383321149653|
|Afghanistan|Fayzabad|2004|cereals and tubers|[266.0,2.02803874...| 0.1310166872189494|0.36005383321149653|
|Afghanistan|Fayzabad|2005|cereals and tubers|[266.0,-1.6009962...|0.16300923600780715|0.36005383321149653|
|Afghanistan|Fayzabad|2005|c

In [44]:
#Creating a evaluator to calcuate Root Mean squared Error (RMSE) value using regression evaluator
evaluator = RegressionEvaluator(
    labelCol="price_trend_USD", predictionCol="prediction", metricName="rmse")

In [45]:
#Calculating RMSE value
rmse = float(evaluator.evaluate(test_prediction))

In [46]:
#Printing RMSE value
print("Root Mean Squared Error (RMSE) for Decision Tree Regressor on test data = %g" % rmse)

Root Mean Squared Error (RMSE) for Decision Tree Regressor on test data = 24.8915


In [47]:
print("Normalized Root Mean Squared Error for Decision Tree Regresssor =", rmse/5.48)

Normalized Root Mean Squared Error for Decision Tree Regresssor = 4.542239419524407


## Random Forest Regression

In [48]:
#importing ml pacakges for Random Forest Regression
from pyspark.ml.regression import RandomForestRegressor

In [49]:
#Running Random Forest Regression model
rforest_model = RandomForestRegressor(labelCol="price_trend_USD", featuresCol="features", predictionCol='prediction')

In [50]:
#fitting Random Forest Regression model on training dataset
fitrforest = rforest_model.fit(train_set)

In [51]:
#transforming the fit training dataset on the testing dataset
test_prediction = fitrforest.transform(test_set)
test_prediction.show()

+-----------+------+--------+------------------+----+------------+-------------------+----------+--------------------+---------------+--------------------+------------------+
|    Country|mkt_id|  Market|    CommodityGroup|Year|        PEWI|    price_trend_USD|ALPS_index|CommodityGroup_index|PriceType_index|            features|        prediction|
+-----------+------+--------+------------------+----+------------+-------------------+----------+--------------------+---------------+--------------------+------------------+
|Afghanistan|   266|Fayzabad|cereals and tubers|2003|-0.673173606| 0.1003444879640056|       0.0|                 0.0|            0.0|[266.0,-0.6731736...| 1.440945509830318|
|Afghanistan|   266|Fayzabad|cereals and tubers|2003|-0.148812696|0.08957008184688157|       0.0|                 0.0|            0.0|[266.0,-0.1488126...|1.4590733308044688|
|Afghanistan|   266|Fayzabad|cereals and tubers|2003| 0.225916818|0.08850471506837475|       0.0|                 0.0|       

In [52]:
#Checking the prediction value on the test dataset
test_prediction.select("Country", "market", "Year", "CommodityGroup", "features", "Price_trend_USD", "prediction").show()

+-----------+--------+----+------------------+--------------------+-------------------+------------------+
|    Country|  market|Year|    CommodityGroup|            features|    Price_trend_USD|        prediction|
+-----------+--------+----+------------------+--------------------+-------------------+------------------+
|Afghanistan|Fayzabad|2003|cereals and tubers|[266.0,-0.6731736...| 0.1003444879640056| 1.440945509830318|
|Afghanistan|Fayzabad|2003|cereals and tubers|[266.0,-0.1488126...|0.08957008184688157|1.4590733308044688|
|Afghanistan|Fayzabad|2003|cereals and tubers|[266.0,0.22591681...|0.08850471506837475|1.4590733308044688|
|Afghanistan|Fayzabad|2004|cereals and tubers|[266.0,1.40152335...|0.13054049637788745|1.4564282392127113|
|Afghanistan|Fayzabad|2004|cereals and tubers|[266.0,2.02803874...| 0.1310166872189494|1.4590733308044688|
|Afghanistan|Fayzabad|2005|cereals and tubers|[266.0,-1.6009962...|0.16300923600780715|1.0517017426195867|
|Afghanistan|Fayzabad|2005|cereals an

In [53]:
#Creating a evaluator to calcuate Root Mean squared Error (RMSE) value using regression evaluator
evaluator = RegressionEvaluator(
    labelCol="price_trend_USD", predictionCol="prediction", metricName="rmse")

In [54]:
#Calculating RMSE value
rmse = float(evaluator.evaluate(test_prediction))

In [55]:
#Printing RMSE value
print("Root Mean Squared Error (RMSE) for Random Forest Regressor on test data = %g" % rmse)

Root Mean Squared Error (RMSE) for Random Forest Regressor on test data = 29.797


In [56]:
print("Normalized Root Mean Squared Error for Random Forest Regresssor =", rmse/5.48)

Normalized Root Mean Squared Error for Random Forest Regresssor = 5.437415999073002
