In [None]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# https://towardsdatascience.com/magic-commands-for-profiling-in-jupyter-notebook-d2ef00e29a63
!pip3 install memory_profiler
%load_ext memory_profiler

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


In [None]:
# spark libraries
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType, TimestampType
from pyspark.sql import functions as F
from pyspark.sql.functions import col,isnan,when,count,lit
from pyspark.ml.feature import Imputer
from pyspark.sql import Window
from pyspark.sql.functions import month,year,dayofmonth, hour, minute, second


In [None]:
spark = SparkSession.builder.appName('ghorPred').getOrCreate()
spark

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
crop= spark.read.options(delimiter=',').csv('/content/drive/MyDrive/vslab/final/ghorProd2.csv', header=True, inferSchema=True)

In [None]:
crop.printSchema()

root
 |-- Region: string (nullable = true)
 |-- Season: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Tomatoes: double (nullable = true)
 |-- Okra: double (nullable = true)
 |-- Onion dry: double (nullable = true)



In [None]:
crop.show()

+---------+------+----+--------+-----+---------+
|   Region|Season|Year|Tomatoes| Okra|Onion dry|
+---------+------+----+--------+-----+---------+
|Ghor Safi|Summer|2017| 58939.6|360.2|    767.5|
|Ghor Safi|Summer|2018| 45367.5|403.8|   9620.6|
|Ghor Safi|Summer|2019| 17317.6|146.3|    565.7|
|Ghor Safi|Summer|2020| 25336.8|184.8|   1308.7|
|Ghor Safi|Summer|2021| 28635.8| 78.7|   1515.5|
|Ghor Safi|Winter|2017| 75470.9|  0.0|  12819.2|
|Ghor Safi|Winter|2018|120710.8|  0.0|  18922.2|
|Ghor Safi|Winter|2019| 98611.6|488.0|  13983.8|
|Ghor Safi|Winter|2020| 81196.5|  0.0|  26088.8|
|Ghor Safi|Winter|2021|119528.9|  0.0|  15484.4|
+---------+------+----+--------+-----+---------+



In [None]:
crop.describe().show()

+-------+---------+------+------------------+-----------------+----------------+------------------+
|summary|   Region|Season|              Year|         Tomatoes|            Okra|         Onion dry|
+-------+---------+------+------------------+-----------------+----------------+------------------+
|  count|       10|    10|                10|               10|              10|                10|
|   mean|     null|  null|            2019.0|          67111.6|          166.18|10107.640000000001|
| stddev|     null|  null|1.4907119849998527|38188.87401953611|187.537544685502| 8907.867446900833|
|    min|Ghor Safi|Summer|              2017|          17317.6|             0.0|             565.7|
|    max|Ghor Safi|Winter|              2021|         120710.8|           488.0|           26088.8|
+-------+---------+------+------------------+-----------------+----------------+------------------+



In [None]:
ghorSchema = StructType([

 StructField('Region', StringType(), True),
 StructField('Date/Time', TimestampType(),True),
 StructField('AirDewPoint', IntegerType(),True),
 StructField('AirTemperature', IntegerType(),True),
 StructField('Humidity%', IntegerType(),True),
 StructField('ManualPresentWeather', StringType(),True),
 StructField('CloudType', StringType(),True),
 StructField('CloudsCover(Okta)', IntegerType(),True),
 StructField('CloudsCover%', IntegerType(),True),
 StructField('WindDirection(Degrees)', IntegerType(), True),
 StructField('WindSpeed', IntegerType(), True),
 StructField('WindType', StringType(), True)
 ])
ghorDf = spark.read.csv('/content/drive/MyDrive/vslab/final/GhorSafi.csv', header=True, schema=ghorSchema)

In [None]:
ghorDf.printSchema()

root
 |-- Region: string (nullable = true)
 |-- Date/Time: timestamp (nullable = true)
 |-- AirDewPoint: integer (nullable = true)
 |-- AirTemperature: integer (nullable = true)
 |-- Humidity%: integer (nullable = true)
 |-- ManualPresentWeather: string (nullable = true)
 |-- CloudType: string (nullable = true)
 |-- CloudsCover(Okta): integer (nullable = true)
 |-- CloudsCover%: integer (nullable = true)
 |-- WindDirection(Degrees): integer (nullable = true)
 |-- WindSpeed: integer (nullable = true)
 |-- WindType: string (nullable = true)



In [None]:
ghorDf.show()

+------------+-------------------+-----------+--------------+---------+--------------------+---------+-----------------+------------+----------------------+---------+--------+
|      Region|          Date/Time|AirDewPoint|AirTemperature|Humidity%|ManualPresentWeather|CloudType|CloudsCover(Okta)|CloudsCover%|WindDirection(Degrees)|WindSpeed|WindType|
+------------+-------------------+-----------+--------------+---------+--------------------+---------+-----------------+------------+----------------------+---------+--------+
|Ghor El Safi|2017-01-02 06:00:00|          6|            10|       75|                null|  cumulus|                3|          38|                  null|        0|    calm|
|Ghor El Safi|2017-01-02 12:00:00|          7|            20|       42|                null|  cumulus|                1|          13|                  null|        0|    calm|
|Ghor El Safi|2017-01-03 06:00:00|          6|            11|       71|                null|     null|             null|

In [None]:
ghorDf.describe().show()

+-------+------------+------------------+------------------+------------------+--------------------+-----------+------------------+-----------------+----------------------+------------------+--------+
|summary|      Region|       AirDewPoint|    AirTemperature|         Humidity%|ManualPresentWeather|  CloudType| CloudsCover(Okta)|     CloudsCover%|WindDirection(Degrees)|         WindSpeed|WindType|
+-------+------------+------------------+------------------+------------------+--------------------+-----------+------------------+-----------------+----------------------+------------------+--------+
|  count|        6959|               886|              6854|               886|                 467|        445|               444|             6959|                  4667|              6956|    6958|
|   mean|        null|14.705417607223476|27.048001167201633|51.635440180586905|                null|       null|2.6013513513513513|2.091248742635436|    187.74373259052925|1.6899079930994825|    n

In [None]:
numeric_vals = [col for col, dtype in ghorDf.dtypes if dtype != "string" and dtype != "timestamp"]
df_numeric = ghorDf.select(numeric_vals)
df_numeric.show(2)

+-----------+--------------+---------+-----------------+------------+----------------------+---------+
|AirDewPoint|AirTemperature|Humidity%|CloudsCover(Okta)|CloudsCover%|WindDirection(Degrees)|WindSpeed|
+-----------+--------------+---------+-----------------+------------+----------------------+---------+
|          6|            10|       75|                3|          38|                  null|        0|
|          7|            20|       42|                1|          13|                  null|        0|
+-----------+--------------+---------+-----------------+------------+----------------------+---------+
only showing top 2 rows



In [None]:
df_numeric.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_numeric.columns]).show()
print( df_numeric.count())

+-----------+--------------+---------+-----------------+------------+----------------------+---------+
|AirDewPoint|AirTemperature|Humidity%|CloudsCover(Okta)|CloudsCover%|WindDirection(Degrees)|WindSpeed|
+-----------+--------------+---------+-----------------+------------+----------------------+---------+
|       6073|           105|     6073|             6515|           0|                  2292|        3|
+-----------+--------------+---------+-----------------+------------+----------------------+---------+

6959


In [None]:
ghor_weather=ghorDf.copy()

In [None]:
ghor_weather = ghor_weather.na.drop(how='any', subset=['AirTemperature','WindSpeed'],  thresh=2)

In [None]:
ghor_weather.count()

6852

In [None]:
numeric_vals = [col for col, dtype in ghor_weather.dtypes if dtype != "string" and dtype != "timestamp"]
df_numeric = ghor_weather.select(numeric_vals)
df_numeric.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_numeric.columns]).show()
print( df_numeric.count())

+-----------+--------------+---------+-----------------+------------+----------------------+---------+
|AirDewPoint|AirTemperature|Humidity%|CloudsCover(Okta)|CloudsCover%|WindDirection(Degrees)|WindSpeed|
+-----------+--------------+---------+-----------------+------------+----------------------+---------+
|       5967|             0|     5967|             6409|           0|                  2190|        0|
+-----------+--------------+---------+-----------------+------------+----------------------+---------+

6852


In [None]:
ghor_weather = ghor_weather.fillna(0, subset=["WindDirection(Degrees)"])


In [None]:
numeric_vals = [col for col, dtype in ghor_weather.dtypes if dtype != "string" and dtype != "timestamp"]
df_numeric = ghor_weather.select(numeric_vals)
df_numeric.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_numeric.columns]).show()
print( df_numeric.count())

+-----------+--------------+---------+-----------------+------------+----------------------+---------+
|AirDewPoint|AirTemperature|Humidity%|CloudsCover(Okta)|CloudsCover%|WindDirection(Degrees)|WindSpeed|
+-----------+--------------+---------+-----------------+------------+----------------------+---------+
|       5967|             0|     5967|             6409|           0|                     0|        0|
+-----------+--------------+---------+-----------------+------------+----------------------+---------+

6852


In [None]:
ghor_weather=ghor_weather.drop('AirDewPoint','Humidity%','ManualPresentWeather','CloudType','CloudsCover(Okta)')

In [None]:
ghor_weather.show()

+------------+-------------------+--------------+------------+----------------------+---------+--------+
|      Region|          Date/Time|AirTemperature|CloudsCover%|WindDirection(Degrees)|WindSpeed|WindType|
+------------+-------------------+--------------+------------+----------------------+---------+--------+
|Ghor El Safi|2017-01-02 06:00:00|            10|          38|                     0|        0|    calm|
|Ghor El Safi|2017-01-02 12:00:00|            20|          13|                     0|        0|    calm|
|Ghor El Safi|2017-01-03 06:00:00|            11|           0|                     0|        0|    calm|
|Ghor El Safi|2017-01-03 12:00:00|            20|          13|                     0|        0|    calm|
|Ghor El Safi|2017-01-04 12:00:00|            20|          13|                     0|        0|    calm|
|Ghor El Safi|2017-01-05 06:00:00|             9|          25|                     0|        0|    calm|
|Ghor El Safi|2017-01-05 12:00:00|            21|      

In [None]:
ghor_weather = ghor_weather \
    .withColumn("Year", year("Date/Time")) \
    .withColumn("Month", month("Date/Time")) \
    .withColumn("Day", dayofmonth("Date/Time")) \


# Show the resulting DataFrame
ghor_weather.show()
ghor_weather.count()

+------------+-------------------+--------------+------------+----------------------+---------+--------+----+-----+---+
|      Region|          Date/Time|AirTemperature|CloudsCover%|WindDirection(Degrees)|WindSpeed|WindType|Year|Month|Day|
+------------+-------------------+--------------+------------+----------------------+---------+--------+----+-----+---+
|Ghor El Safi|2017-01-02 06:00:00|            10|          38|                     0|        0|    calm|2017|    1|  2|
|Ghor El Safi|2017-01-02 12:00:00|            20|          13|                     0|        0|    calm|2017|    1|  2|
|Ghor El Safi|2017-01-03 06:00:00|            11|           0|                     0|        0|    calm|2017|    1|  3|
|Ghor El Safi|2017-01-03 12:00:00|            20|          13|                     0|        0|    calm|2017|    1|  3|
|Ghor El Safi|2017-01-04 12:00:00|            20|          13|                     0|        0|    calm|2017|    1|  4|
|Ghor El Safi|2017-01-05 06:00:00|      

6852

In [None]:
temp=ghor_weather.withColumn("Season",
   F.when((F.col("Month") >=4) & (F.col("Month")  <= 9), 'Summer')
    .when((F.col("Month") <= 3) | (F.col("Month") >= 10), 'Winter')

)

temp.show(20)

+------------+-------------------+--------------+------------+----------------------+---------+--------+----+-----+---+------+
|      Region|          Date/Time|AirTemperature|CloudsCover%|WindDirection(Degrees)|WindSpeed|WindType|Year|Month|Day|Season|
+------------+-------------------+--------------+------------+----------------------+---------+--------+----+-----+---+------+
|Ghor El Safi|2017-01-02 06:00:00|            10|          38|                     0|        0|    calm|2017|    1|  2|Winter|
|Ghor El Safi|2017-01-02 12:00:00|            20|          13|                     0|        0|    calm|2017|    1|  2|Winter|
|Ghor El Safi|2017-01-03 06:00:00|            11|           0|                     0|        0|    calm|2017|    1|  3|Winter|
|Ghor El Safi|2017-01-03 12:00:00|            20|          13|                     0|        0|    calm|2017|    1|  3|Winter|
|Ghor El Safi|2017-01-04 12:00:00|            20|          13|                     0|        0|    calm|2017|  

In [None]:
temp=temp.withColumn("bad_wind",
   F.when((F.col("WindSpeed") >=5),1 )
    .otherwise(0)

)

temp.show(3)
temp.count()

+------------+-------------------+--------------+------------+----------------------+---------+--------+----+-----+---+------+--------+
|      Region|          Date/Time|AirTemperature|CloudsCover%|WindDirection(Degrees)|WindSpeed|WindType|Year|Month|Day|Season|bad_wind|
+------------+-------------------+--------------+------------+----------------------+---------+--------+----+-----+---+------+--------+
|Ghor El Safi|2017-01-02 06:00:00|            10|          38|                     0|        0|    calm|2017|    1|  2|Winter|       0|
|Ghor El Safi|2017-01-02 12:00:00|            20|          13|                     0|        0|    calm|2017|    1|  2|Winter|       0|
|Ghor El Safi|2017-01-03 06:00:00|            11|           0|                     0|        0|    calm|2017|    1|  3|Winter|       0|
+------------+-------------------+--------------+------------+----------------------+---------+--------+----+-----+---+------+--------+
only showing top 3 rows



6852

In [None]:
bad_wind_days=temp.groupBy('Day',"Month","Year",'Season').agg(F.max('bad_wind').alias('bad_wind')).orderBy('Year','Month','Day','Season', ascending=True)
bad_wind_days.show(3)
bad_wind_days.count()

+---+-----+----+------+--------+
|Day|Month|Year|Season|bad_wind|
+---+-----+----+------+--------+
|  2|    1|2017|Winter|       0|
|  3|    1|2017|Winter|       0|
|  4|    1|2017|Winter|       0|
+---+-----+----+------+--------+
only showing top 3 rows



2135

In [None]:
bad_windperyear=bad_wind_days.groupBy('year','season').agg(F.sum('bad_wind').alias("sum_badwind_days")).orderBy('year','season', ascending=True)
bad_windperyear.show()
bad_windperyear.count()

+----+------+----------------+
|year|season|sum_badwind_days|
+----+------+----------------+
|2017|Summer|               4|
|2017|Winter|               4|
|2018|Summer|              26|
|2018|Winter|              21|
|2019|Summer|              16|
|2019|Winter|              21|
|2020|Summer|              19|
|2020|Winter|               8|
|2021|Summer|              44|
|2021|Winter|              17|
|2022|Summer|              18|
|2022|Winter|              19|
|2023|Winter|               1|
+----+------+----------------+



13

In [None]:
#create funtion to map between selected weather column in year and season to the final crop
#better than doing it by hand for each single column
def map_values(test):
    def inner_map(year, season):
        return test.get((year, season), None)
    return F.udf(inner_map)


In [None]:
#defining the vales in aggregated df
test_values = {(row['year'], row['season']): row['sum_badwind_days'] for row in bad_windperyear.collect()}

#applying the funtion
mapping = map_values(test_values)

#adding new column in crop
crop_worstcase = crop.withColumn("harmful_winds", mapping(F.col("year"), F.col("season")))
crop_worstcase.show()



+---------+------+----+--------+-----+---------+-------------+
|   Region|Season|Year|Tomatoes| Okra|Onion dry|harmful_winds|
+---------+------+----+--------+-----+---------+-------------+
|Ghor Safi|Summer|2017| 58939.6|360.2|    767.5|            4|
|Ghor Safi|Summer|2018| 45367.5|403.8|   9620.6|           26|
|Ghor Safi|Summer|2019| 17317.6|146.3|    565.7|           16|
|Ghor Safi|Summer|2020| 25336.8|184.8|   1308.7|           19|
|Ghor Safi|Summer|2021| 28635.8| 78.7|   1515.5|           44|
|Ghor Safi|Winter|2017| 75470.9|  0.0|  12819.2|            4|
|Ghor Safi|Winter|2018|120710.8|  0.0|  18922.2|           21|
|Ghor Safi|Winter|2019| 98611.6|488.0|  13983.8|           21|
|Ghor Safi|Winter|2020| 81196.5|  0.0|  26088.8|            8|
|Ghor Safi|Winter|2021|119528.9|  0.0|  15484.4|           17|
+---------+------+----+--------+-----+---------+-------------+



In [None]:
#temporary df to hold extreme temp values, these values could affect crop production negatively
temp2 = temp.withColumn("extrm_temps",
   F.when((F.col("AirTemperature") <= 12) | (F.col("AirTemperature") > 32), 1)
    .otherwise(0)
)

temp2.show(3)
temp2.count()


+------------+-------------------+--------------+------------+----------------------+---------+--------+----+-----+---+------+--------+-----------+
|      Region|          Date/Time|AirTemperature|CloudsCover%|WindDirection(Degrees)|WindSpeed|WindType|Year|Month|Day|Season|bad_wind|extrm_temps|
+------------+-------------------+--------------+------------+----------------------+---------+--------+----+-----+---+------+--------+-----------+
|Ghor El Safi|2017-01-02 06:00:00|            10|          38|                     0|        0|    calm|2017|    1|  2|Winter|       0|          1|
|Ghor El Safi|2017-01-02 12:00:00|            20|          13|                     0|        0|    calm|2017|    1|  2|Winter|       0|          0|
|Ghor El Safi|2017-01-03 06:00:00|            11|           0|                     0|        0|    calm|2017|    1|  3|Winter|       0|          1|
+------------+-------------------+--------------+------------+----------------------+---------+--------+----+---

6852

In [None]:
#grouping by day
extrm_temps_days=temp2.groupBy('day',"month","year",'season').agg(F.max('extrm_temps').alias('extrm_temps')).orderBy('year','month','day','season', ascending=True)
extrm_temps_days.show(3)
extrm_temps_days.count()

+---+-----+----+------+-----------+
|day|month|year|season|extrm_temps|
+---+-----+----+------+-----------+
|  2|    1|2017|Winter|          1|
|  3|    1|2017|Winter|          1|
|  4|    1|2017|Winter|          0|
+---+-----+----+------+-----------+
only showing top 3 rows



2135

In [None]:
#grouping by year to match crop df
extrm_temps_years=extrm_temps_days.groupBy('year','season').agg(F.sum('extrm_temps').alias("sum_extrm_temps")).orderBy('Year', 'season',ascending=True)
extrm_temps_years.show()
extrm_temps_years.count()

+----+------+---------------+
|year|season|sum_extrm_temps|
+----+------+---------------+
|2017|Summer|            156|
|2017|Winter|             42|
|2018|Summer|            152|
|2018|Winter|             56|
|2019|Summer|            134|
|2019|Winter|             58|
|2020|Summer|            115|
|2020|Winter|             40|
|2021|Summer|            152|
|2021|Winter|             31|
|2022|Summer|            137|
|2022|Winter|             42|
|2023|Winter|              0|
+----+------+---------------+



13

In [None]:
temp_values = {(row['year'], row['season']): row['sum_extrm_temps'] for row in extrm_temps_years.collect()}

In [None]:
mapping = map_values(temp_values)
#adding new column in crop_worstacase
crop_worstcase = crop_worstcase.withColumn("extreme_temperatures", mapping(F.col("year"), F.col("season")))
crop_worstcase.show()

+---------+------+----+--------+-----+---------+-------------+--------------------+
|   Region|Season|Year|Tomatoes| Okra|Onion dry|harmful_winds|extreme_temperatures|
+---------+------+----+--------+-----+---------+-------------+--------------------+
|Ghor Safi|Summer|2017| 58939.6|360.2|    767.5|            4|                 156|
|Ghor Safi|Summer|2018| 45367.5|403.8|   9620.6|           26|                 152|
|Ghor Safi|Summer|2019| 17317.6|146.3|    565.7|           16|                 134|
|Ghor Safi|Summer|2020| 25336.8|184.8|   1308.7|           19|                 115|
|Ghor Safi|Summer|2021| 28635.8| 78.7|   1515.5|           44|                 152|
|Ghor Safi|Winter|2017| 75470.9|  0.0|  12819.2|            4|                  42|
|Ghor Safi|Winter|2018|120710.8|  0.0|  18922.2|           21|                  56|
|Ghor Safi|Winter|2019| 98611.6|488.0|  13983.8|           21|                  58|
|Ghor Safi|Winter|2020| 81196.5|  0.0|  26088.8|            8|              

In [None]:
#temporary df to hold cloud cover values over 40%, these values could affect crop production negatively
temp3 = temp2.withColumn("clouds",
   F.when(F.col("CloudsCover%") > 30, 1)
    .otherwise(0)
)

temp3.show(3)
temp3.count()


+------------+-------------------+--------------+------------+----------------------+---------+--------+----+-----+---+------+--------+-----------+------+
|      Region|          Date/Time|AirTemperature|CloudsCover%|WindDirection(Degrees)|WindSpeed|WindType|Year|Month|Day|Season|bad_wind|extrm_temps|clouds|
+------------+-------------------+--------------+------------+----------------------+---------+--------+----+-----+---+------+--------+-----------+------+
|Ghor El Safi|2017-01-02 06:00:00|            10|          38|                     0|        0|    calm|2017|    1|  2|Winter|       0|          1|     1|
|Ghor El Safi|2017-01-02 12:00:00|            20|          13|                     0|        0|    calm|2017|    1|  2|Winter|       0|          0|     0|
|Ghor El Safi|2017-01-03 06:00:00|            11|           0|                     0|        0|    calm|2017|    1|  3|Winter|       0|          1|     0|
+------------+-------------------+--------------+------------+--------

6852

In [None]:
bad_cloud_days=temp3.groupBy('day',"month","year",'season').agg(F.max('clouds').alias('clouds')).orderBy('year','month','day','season', ascending=True)
bad_cloud_days.show(3)
bad_cloud_days.count()

+---+-----+----+------+------+
|day|month|year|season|clouds|
+---+-----+----+------+------+
|  2|    1|2017|Winter|     1|
|  3|    1|2017|Winter|     0|
|  4|    1|2017|Winter|     0|
+---+-----+----+------+------+
only showing top 3 rows



2135

In [None]:
bad_cloudy_years=bad_cloud_days.groupBy('year','season').agg(F.sum('clouds').alias("sum_too_cloudy")).orderBy('Year','season', ascending=True)
bad_cloudy_years.show()
bad_cloudy_years.count()

+----+------+--------------+
|year|season|sum_too_cloudy|
+----+------+--------------+
|2017|Summer|            34|
|2017|Winter|            89|
|2018|Summer|            20|
|2018|Winter|            37|
|2019|Summer|             0|
|2019|Winter|             0|
|2020|Summer|             0|
|2020|Winter|             0|
|2021|Summer|             0|
|2021|Winter|             0|
|2022|Summer|             0|
|2022|Winter|             1|
|2023|Winter|             0|
+----+------+--------------+



13

In [None]:
cloud_vals={(row['year'], row['season']): row['sum_too_cloudy'] for row in bad_cloudy_years.collect()}

In [None]:
mapping = map_values(cloud_vals)
#adding new column in crop_worstacase
crop_worstcase = crop_worstcase.withColumn("too_cloudy", mapping(F.col("year"), F.col("season")))
crop_worstcase.show(30)
crop_worstcase.count()

+---------+------+----+--------+-----+---------+-------------+--------------------+----------+
|   Region|Season|Year|Tomatoes| Okra|Onion dry|harmful_winds|extreme_temperatures|too_cloudy|
+---------+------+----+--------+-----+---------+-------------+--------------------+----------+
|Ghor Safi|Summer|2017| 58939.6|360.2|    767.5|            4|                 156|        34|
|Ghor Safi|Summer|2018| 45367.5|403.8|   9620.6|           26|                 152|        20|
|Ghor Safi|Summer|2019| 17317.6|146.3|    565.7|           16|                 134|         0|
|Ghor Safi|Summer|2020| 25336.8|184.8|   1308.7|           19|                 115|         0|
|Ghor Safi|Summer|2021| 28635.8| 78.7|   1515.5|           44|                 152|         0|
|Ghor Safi|Winter|2017| 75470.9|  0.0|  12819.2|            4|                  42|        89|
|Ghor Safi|Winter|2018|120710.8|  0.0|  18922.2|           21|                  56|        37|
|Ghor Safi|Winter|2019| 98611.6|488.0|  13983.8|  

10

In [None]:
temp4 = temp3.withColumn("harmful_wind_direction",
   F.when((F.col("WindDirection(Degrees)") >= 45) , 1)
    .otherwise(0)
)

temp4.show(3)
temp4.count()

+------------+-------------------+--------------+------------+----------------------+---------+--------+----+-----+---+------+--------+-----------+------+----------------------+
|      Region|          Date/Time|AirTemperature|CloudsCover%|WindDirection(Degrees)|WindSpeed|WindType|Year|Month|Day|Season|bad_wind|extrm_temps|clouds|harmful_wind_direction|
+------------+-------------------+--------------+------------+----------------------+---------+--------+----+-----+---+------+--------+-----------+------+----------------------+
|Ghor El Safi|2017-01-02 06:00:00|            10|          38|                     0|        0|    calm|2017|    1|  2|Winter|       0|          1|     1|                     0|
|Ghor El Safi|2017-01-02 12:00:00|            20|          13|                     0|        0|    calm|2017|    1|  2|Winter|       0|          0|     0|                     0|
|Ghor El Safi|2017-01-03 06:00:00|            11|           0|                     0|        0|    calm|2017| 

6852

In [None]:
wind_dir_days=temp4.groupBy('day',"month","year",'season').agg(F.max('harmful_wind_direction').alias('harmful_wind_direction')).orderBy('year','month','day','season', ascending=True)
wind_dir_days.show(3)
wind_dir_days.count()

+---+-----+----+------+----------------------+
|day|month|year|season|harmful_wind_direction|
+---+-----+----+------+----------------------+
|  2|    1|2017|Winter|                     0|
|  3|    1|2017|Winter|                     0|
|  4|    1|2017|Winter|                     0|
+---+-----+----+------+----------------------+
only showing top 3 rows



2135

In [None]:
wind_dir_years=wind_dir_days.groupBy('year','season').agg(F.sum('harmful_wind_direction').alias("sum_harmful_windDir")).orderBy('Year','season', ascending=True)
wind_dir_years.show()
wind_dir_years.count()

+----+------+-------------------+
|year|season|sum_harmful_windDir|
+----+------+-------------------+
|2017|Summer|                 98|
|2017|Winter|                 59|
|2018|Summer|                156|
|2018|Winter|                149|
|2019|Summer|                138|
|2019|Winter|                161|
|2020|Summer|                132|
|2020|Winter|                145|
|2021|Summer|                164|
|2021|Winter|                151|
|2022|Summer|                136|
|2022|Winter|                133|
|2023|Winter|                 22|
+----+------+-------------------+



13

In [None]:
dir_vals={(row['year'], row['season']): row['sum_harmful_windDir'] for row in wind_dir_years.collect()}

In [None]:
%memit
mapping = map_values(dir_vals)
#adding new column in crop_worstacase
crop_worstcase = crop_worstcase.withColumn("wind_dir", mapping(F.col("year"), F.col("season")))
crop_worstcase.show(10)
crop_worstcase.count()

+---------+------+----+--------+-----+---------+-------------+--------------------+----------+--------+
|   Region|Season|Year|Tomatoes| Okra|Onion dry|harmful_winds|extreme_temperatures|too_cloudy|wind_dir|
+---------+------+----+--------+-----+---------+-------------+--------------------+----------+--------+
|Ghor Safi|Summer|2017| 58939.6|360.2|    767.5|            4|                 156|        34|      98|
|Ghor Safi|Summer|2018| 45367.5|403.8|   9620.6|           26|                 152|        20|     156|
|Ghor Safi|Summer|2019| 17317.6|146.3|    565.7|           16|                 134|         0|     138|
|Ghor Safi|Summer|2020| 25336.8|184.8|   1308.7|           19|                 115|         0|     132|
|Ghor Safi|Summer|2021| 28635.8| 78.7|   1515.5|           44|                 152|         0|     164|
|Ghor Safi|Winter|2017| 75470.9|  0.0|  12819.2|            4|                  42|        89|      59|
|Ghor Safi|Winter|2018|120710.8|  0.0|  18922.2|           21|  

10

In [None]:
temp4.show()

+------------+-------------------+--------------+------------+----------------------+---------+--------+----+-----+---+------+--------+-----------+------+----------------------+
|      Region|          Date/Time|AirTemperature|CloudsCover%|WindDirection(Degrees)|WindSpeed|WindType|Year|Month|Day|Season|bad_wind|extrm_temps|clouds|harmful_wind_direction|
+------------+-------------------+--------------+------------+----------------------+---------+--------+----+-----+---+------+--------+-----------+------+----------------------+
|Ghor El Safi|2017-01-02 06:00:00|            10|          38|                     0|        0|    calm|2017|    1|  2|Winter|       0|          1|     1|                     0|
|Ghor El Safi|2017-01-02 12:00:00|            20|          13|                     0|        0|    calm|2017|    1|  2|Winter|       0|          0|     0|                     0|
|Ghor El Safi|2017-01-03 06:00:00|            11|           0|                     0|        0|    calm|2017| 

In [None]:
for_pred_day=temp4.groupBy('day',"month","year",'season').agg(F.max('bad_wind'), F.max('extrm_temps'), F.max('clouds'), F.max('harmful_wind_direction')).orderBy('Year','day','month', ascending=True)
for_pred_day.show(3)
for_pred_day.count()

+---+-----+----+------+-------------+----------------+-----------+---------------------------+
|day|month|year|season|max(bad_wind)|max(extrm_temps)|max(clouds)|max(harmful_wind_direction)|
+---+-----+----+------+-------------+----------------+-----------+---------------------------+
|  1|    2|2017|Winter|            0|               0|          1|                          0|
|  1|    3|2017|Winter|            0|               0|          1|                          0|
|  1|    4|2017|Summer|            0|               0|          1|                          1|
+---+-----+----+------+-------------+----------------+-----------+---------------------------+
only showing top 3 rows



2135

In [None]:
for_pred=for_pred_day.groupBy("year",'season').agg(F.sum('max(bad_wind)').alias('harmful_winds'), F.sum('max(extrm_temps)').alias('extreme_temperatures'), F.sum('max(clouds)').alias('too_cloudy'), F.sum('max(harmful_wind_direction)').alias('wind_dir')).orderBy('year','season', ascending=True)
for_pred.show(3)
for_pred.count()

+----+------+-------------+--------------------+----------+--------+
|year|season|harmful_winds|extreme_temperatures|too_cloudy|wind_dir|
+----+------+-------------+--------------------+----------+--------+
|2017|Summer|            4|                 156|        34|      98|
|2017|Winter|            4|                  42|        89|      59|
|2018|Summer|           26|                 152|        20|     156|
+----+------+-------------+--------------------+----------+--------+
only showing top 3 rows



13

In [None]:
pred=for_pred.filter(for_pred['year']==2022)
pred.show()

+----+------+-------------+--------------------+----------+--------+
|year|season|harmful_winds|extreme_temperatures|too_cloudy|wind_dir|
+----+------+-------------+--------------------+----------+--------+
|2022|Summer|           18|                 137|         0|     136|
|2022|Winter|           19|                  42|         1|     133|
+----+------+-------------+--------------------+----------+--------+



In [None]:
pred=pred.withColumn('harmful_winds', pred['harmful_winds'].cast('integer'))

In [None]:
pred=pred.withColumn('extreme_temperatures', pred['extreme_temperatures'].cast('integer'))

In [None]:
pred=pred.withColumn('too_cloudy', pred['too_cloudy'].cast('integer'))

In [None]:
pred=pred.withColumn('wind_dir', pred['wind_dir'].cast('integer'))

In [None]:
# pred index and encode
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

season_indexer = StringIndexer(inputCol="season", outputCol="seasonIndex")
indexed_df = season_indexer.fit(pred).transform(pred)
season_encoder = OneHotEncoder(inputCol="seasonIndex", outputCol="seasonVec")
encoded_df = season_encoder.fit(indexed_df).transform(indexed_df)

assembler = VectorAssembler(inputCols=["year", "seasonVec", "extreme_temperatures", "harmful_winds", "too_cloudy", "wind_dir"],
                            outputCol="features")
assembled_df = assembler.transform(encoded_df)
assembled_df.show(truncate=False)

+----+------+-------------+--------------------+----------+--------+-----------+-------------+---------------------------------+
|year|season|harmful_winds|extreme_temperatures|too_cloudy|wind_dir|seasonIndex|seasonVec    |features                         |
+----+------+-------------+--------------------+----------+--------+-----------+-------------+---------------------------------+
|2022|Summer|18           |137                 |0         |136     |0.0        |(1,[0],[1.0])|[2022.0,1.0,137.0,18.0,0.0,136.0]|
|2022|Winter|19           |42                  |1         |133     |1.0        |(1,[],[])    |[2022.0,0.0,42.0,19.0,1.0,133.0] |
+----+------+-------------+--------------------+----------+--------+-----------+-------------+---------------------------------+



In [None]:
from pyspark.ml.feature import StandardScaler
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
scaler_model = scaler.fit(assembled_df)
scaled_pred = scaler_model.transform(assembled_df)
scaled_pred.show()

+----+------+-------------+--------------------+----------+--------+-----------+-------------+--------------------+--------------------+
|year|season|harmful_winds|extreme_temperatures|too_cloudy|wind_dir|seasonIndex|    seasonVec|            features|      scaledFeatures|
+----+------+-------------+--------------------+----------+--------+-----------+-------------+--------------------+--------------------+
|2022|Summer|           18|                 137|         0|     136|        0.0|(1,[0],[1.0])|[2022.0,1.0,137.0...|[0.0,1.4142135623...|
|2022|Winter|           19|                  42|         1|     133|        1.0|    (1,[],[])|[2022.0,0.0,42.0,...|[0.0,0.0,0.625231...|
+----+------+-------------+--------------------+----------+--------+-----------+-------------+--------------------+--------------------+



In [None]:
df = crop_worstcase.withColumn('harmful_winds', crop_worstcase['harmful_winds'].cast('integer'))


In [None]:
df =df.withColumn('wind_dir', df['wind_dir'].cast('integer'))


In [None]:
df = df.withColumn('extreme_temperatures', df['extreme_temperatures'].cast('integer'))


In [None]:
df = df.withColumn('too_cloudy', df['too_cloudy'].cast('integer'))

In [None]:
df.printSchema()

root
 |-- Region: string (nullable = true)
 |-- Season: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Tomatoes: double (nullable = true)
 |-- Okra: double (nullable = true)
 |-- Onion dry: double (nullable = true)
 |-- harmful_winds: integer (nullable = true)
 |-- extreme_temperatures: integer (nullable = true)
 |-- too_cloudy: integer (nullable = true)
 |-- wind_dir: integer (nullable = true)



# Prep Data for modelling

In [None]:
from pyspark.ml.feature import OneHotEncoder,StringIndexer

stringIndexer = StringIndexer(inputCol="Season", outputCol="SeasonIndex")
indexed = stringIndexer.fit(df).transform(df)

indexed.show()

+---------+------+----+--------+-----+---------+-------------+--------------------+----------+--------+-----------+
|   Region|Season|Year|Tomatoes| Okra|Onion dry|harmful_winds|extreme_temperatures|too_cloudy|wind_dir|SeasonIndex|
+---------+------+----+--------+-----+---------+-------------+--------------------+----------+--------+-----------+
|Ghor Safi|Summer|2017| 58939.6|360.2|    767.5|            4|                 156|        34|      98|        0.0|
|Ghor Safi|Summer|2018| 45367.5|403.8|   9620.6|           26|                 152|        20|     156|        0.0|
|Ghor Safi|Summer|2019| 17317.6|146.3|    565.7|           16|                 134|         0|     138|        0.0|
|Ghor Safi|Summer|2020| 25336.8|184.8|   1308.7|           19|                 115|         0|     132|        0.0|
|Ghor Safi|Summer|2021| 28635.8| 78.7|   1515.5|           44|                 152|         0|     164|        0.0|
|Ghor Safi|Winter|2017| 75470.9|  0.0|  12819.2|            4|          

In [None]:
oneHotEncoder = OneHotEncoder(inputCol="SeasonIndex", outputCol="season_numeric")
encoded = oneHotEncoder.fit(indexed).transform(indexed)

encoded.show()

+---------+------+----+--------+-----+---------+-------------+--------------------+----------+--------+-----------+--------------+
|   Region|Season|Year|Tomatoes| Okra|Onion dry|harmful_winds|extreme_temperatures|too_cloudy|wind_dir|SeasonIndex|season_numeric|
+---------+------+----+--------+-----+---------+-------------+--------------------+----------+--------+-----------+--------------+
|Ghor Safi|Summer|2017| 58939.6|360.2|    767.5|            4|                 156|        34|      98|        0.0| (1,[0],[1.0])|
|Ghor Safi|Summer|2018| 45367.5|403.8|   9620.6|           26|                 152|        20|     156|        0.0| (1,[0],[1.0])|
|Ghor Safi|Summer|2019| 17317.6|146.3|    565.7|           16|                 134|         0|     138|        0.0| (1,[0],[1.0])|
|Ghor Safi|Summer|2020| 25336.8|184.8|   1308.7|           19|                 115|         0|     132|        0.0| (1,[0],[1.0])|
|Ghor Safi|Summer|2021| 28635.8| 78.7|   1515.5|           44|                 152|

In [None]:
okra_df=encoded.drop('Tomatoes','Onion dry')


In [None]:
onion_df=encoded.drop('Tomatoes','Okra')


In [None]:
tomato_df=encoded.drop('Okra','Onion dry')
tomato_df.show()

+---------+------+----+--------+-------------+--------------------+----------+--------+-----------+--------------+
|   Region|Season|Year|Tomatoes|harmful_winds|extreme_temperatures|too_cloudy|wind_dir|SeasonIndex|season_numeric|
+---------+------+----+--------+-------------+--------------------+----------+--------+-----------+--------------+
|Ghor Safi|Summer|2017| 58939.6|            4|                 156|        34|      98|        0.0| (1,[0],[1.0])|
|Ghor Safi|Summer|2018| 45367.5|           26|                 152|        20|     156|        0.0| (1,[0],[1.0])|
|Ghor Safi|Summer|2019| 17317.6|           16|                 134|         0|     138|        0.0| (1,[0],[1.0])|
|Ghor Safi|Summer|2020| 25336.8|           19|                 115|         0|     132|        0.0| (1,[0],[1.0])|
|Ghor Safi|Summer|2021| 28635.8|           44|                 152|         0|     164|        0.0| (1,[0],[1.0])|
|Ghor Safi|Winter|2017| 75470.9|            4|                  42|        89|  

# Linear Reg Modelling

### Tomatoes

In [None]:
tomato_features=['Year','season_numeric','extreme_temperatures', 'wind_dir','harmful_winds' , 'too_cloudy']

In [None]:
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler


assembler = VectorAssembler(inputCols=tomato_features, outputCol='features')
vector_df = assembler.transform(tomato_df)

vector_df.show()

+---------+------+----+--------+-------------+--------------------+----------+--------+-----------+--------------+--------------------+
|   Region|Season|Year|Tomatoes|harmful_winds|extreme_temperatures|too_cloudy|wind_dir|SeasonIndex|season_numeric|            features|
+---------+------+----+--------+-------------+--------------------+----------+--------+-----------+--------------+--------------------+
|Ghor Safi|Summer|2017| 58939.6|            4|                 156|        34|      98|        0.0| (1,[0],[1.0])|[2017.0,1.0,156.0...|
|Ghor Safi|Summer|2018| 45367.5|           26|                 152|        20|     156|        0.0| (1,[0],[1.0])|[2018.0,1.0,152.0...|
|Ghor Safi|Summer|2019| 17317.6|           16|                 134|         0|     138|        0.0| (1,[0],[1.0])|[2019.0,1.0,134.0...|
|Ghor Safi|Summer|2020| 25336.8|           19|                 115|         0|     132|        0.0| (1,[0],[1.0])|[2020.0,1.0,115.0...|
|Ghor Safi|Summer|2021| 28635.8|           44|  

In [None]:
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')
scaler_model = scaler.fit(vector_df)
scaled_tomato = scaler_model.transform(vector_df)

In [None]:
scaled_tomato.show()

+---------+------+----+--------+-------------+--------------------+----------+--------+-----------+--------------+--------------------+--------------------+
|   Region|Season|Year|Tomatoes|harmful_winds|extreme_temperatures|too_cloudy|wind_dir|SeasonIndex|season_numeric|            features|      scaledFeatures|
+---------+------+----+--------+-------------+--------------------+----------+--------+-----------+--------------+--------------------+--------------------+
|Ghor Safi|Summer|2017| 58939.6|            4|                 156|        34|      98|        0.0| (1,[0],[1.0])|[2017.0,1.0,156.0...|[1353.04473318512...|
|Ghor Safi|Summer|2018| 45367.5|           26|                 152|        20|     156|        0.0| (1,[0],[1.0])|[2018.0,1.0,152.0...|[1353.71555357837...|
|Ghor Safi|Summer|2019| 17317.6|           16|                 134|         0|     138|        0.0| (1,[0],[1.0])|[2019.0,1.0,134.0...|[1354.38637397162...|
|Ghor Safi|Summer|2020| 25336.8|           19|            

In [None]:
final_data = scaled_tomato.select('scaledFeatures','Tomatoes')
train_ratio=0.7
test_ratio=0.3
trainset, testset = final_data.randomSplit([train_ratio, test_ratio], seed=42)
trainset.show()

+--------------------+--------+
|      scaledFeatures|Tomatoes|
+--------------------+--------+
|[1353.04473318512...| 75470.9|
|[1353.04473318512...| 58939.6|
|[1353.71555357837...| 45367.5|
|[1354.38637397162...| 98611.6|
|[1354.38637397162...| 17317.6|
|[1355.05719436487...| 25336.8|
+--------------------+--------+



In [None]:
testset.show()

+--------------------+--------+
|      scaledFeatures|Tomatoes|
+--------------------+--------+
|[1353.71555357837...|120710.8|
|[1355.05719436487...| 81196.5|
|[1355.72801475812...|119528.9|
|[1355.72801475812...| 28635.8|
+--------------------+--------+



In [None]:
from pyspark.ml.regression import LinearRegression

In [None]:
%%time
%memit
model_tomato=LinearRegression(featuresCol='scaledFeatures', labelCol='Tomatoes', regParam=0.1, elasticNetParam=0.5 )
model_tomato=model_tomato.fit(trainset)

CPU times: user 28.7 ms, sys: 2.75 ms, total: 31.4 ms
Wall time: 1.08 s


In [None]:
print(model_tomato.coefficients)
print(model_tomato.intercept)

[-18331.55096068053,-49635.21794961702,21251.8430700541,-5403.468296820341,4767.440712991269,-14598.390105477047]
24918243.93160951


In [None]:
trainingSummary = model_tomato.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 8459.768543
r2: 0.908944


In [None]:
result=model_tomato.evaluate(testset)

In [None]:
result.r2

0.19625092615639506

In [None]:
result.rootMeanSquaredError

33645.01508672753

In [None]:
pred_2022=scaled_pred.select('scaledFeatures')
pred_2022.show()

+--------------------+
|      scaledFeatures|
+--------------------+
|[0.0,1.4142135623...|
|[0.0,0.0,0.625231...|
+--------------------+



In [None]:
predictions=model_tomato.transform(pred_2022)

In [None]:
predictions.show()

+--------------------+--------------------+
|      scaledFeatures|          prediction|
+--------------------+--------------------+
|[0.0,1.4142135623...| 2.381792364331402E7|
|[0.0,0.0,0.625231...|2.3877809559455503E7|
+--------------------+--------------------+



### Okra

In [None]:
okra_df.show()

+---------+------+----+-----+-------------+--------------------+----------+--------+-----------+--------------+
|   Region|Season|Year| Okra|harmful_winds|extreme_temperatures|too_cloudy|wind_dir|SeasonIndex|season_numeric|
+---------+------+----+-----+-------------+--------------------+----------+--------+-----------+--------------+
|Ghor Safi|Summer|2017|360.2|            4|                 156|        34|      98|        0.0| (1,[0],[1.0])|
|Ghor Safi|Summer|2018|403.8|           26|                 152|        20|     156|        0.0| (1,[0],[1.0])|
|Ghor Safi|Summer|2019|146.3|           16|                 134|         0|     138|        0.0| (1,[0],[1.0])|
|Ghor Safi|Summer|2020|184.8|           19|                 115|         0|     132|        0.0| (1,[0],[1.0])|
|Ghor Safi|Summer|2021| 78.7|           44|                 152|         0|     164|        0.0| (1,[0],[1.0])|
|Ghor Safi|Winter|2017|  0.0|            4|                  42|        89|      59|        1.0|     (1,

In [None]:
okra_features=['Year','season_numeric','extreme_temperatures', 'wind_dir','harmful_winds' , 'too_cloudy']
assembler = VectorAssembler(inputCols=okra_features, outputCol='features')
vector_df = assembler.transform(okra_df)
vector_df.show()

+---------+------+----+-----+-------------+--------------------+----------+--------+-----------+--------------+--------------------+
|   Region|Season|Year| Okra|harmful_winds|extreme_temperatures|too_cloudy|wind_dir|SeasonIndex|season_numeric|            features|
+---------+------+----+-----+-------------+--------------------+----------+--------+-----------+--------------+--------------------+
|Ghor Safi|Summer|2017|360.2|            4|                 156|        34|      98|        0.0| (1,[0],[1.0])|[2017.0,1.0,156.0...|
|Ghor Safi|Summer|2018|403.8|           26|                 152|        20|     156|        0.0| (1,[0],[1.0])|[2018.0,1.0,152.0...|
|Ghor Safi|Summer|2019|146.3|           16|                 134|         0|     138|        0.0| (1,[0],[1.0])|[2019.0,1.0,134.0...|
|Ghor Safi|Summer|2020|184.8|           19|                 115|         0|     132|        0.0| (1,[0],[1.0])|[2020.0,1.0,115.0...|
|Ghor Safi|Summer|2021| 78.7|           44|                 152|     

In [None]:
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')
scaler_model = scaler.fit(vector_df)
scaled_okra= scaler_model.transform(vector_df)

In [None]:
scaled_okra.show()

+---------+------+----+-----+-------------+--------------------+----------+--------+-----------+--------------+--------------------+--------------------+
|   Region|Season|Year| Okra|harmful_winds|extreme_temperatures|too_cloudy|wind_dir|SeasonIndex|season_numeric|            features|      scaledFeatures|
+---------+------+----+-----+-------------+--------------------+----------+--------+-----------+--------------+--------------------+--------------------+
|Ghor Safi|Summer|2017|360.2|            4|                 156|        34|      98|        0.0| (1,[0],[1.0])|[2017.0,1.0,156.0...|[1353.04473318512...|
|Ghor Safi|Summer|2018|403.8|           26|                 152|        20|     156|        0.0| (1,[0],[1.0])|[2018.0,1.0,152.0...|[1353.71555357837...|
|Ghor Safi|Summer|2019|146.3|           16|                 134|         0|     138|        0.0| (1,[0],[1.0])|[2019.0,1.0,134.0...|[1354.38637397162...|
|Ghor Safi|Summer|2020|184.8|           19|                 115|         0| 

In [None]:
final_data = scaled_okra.select( 'scaledFeatures','Okra')
trainset, testset = final_data.randomSplit([0.7,0.3])
final_data.show()


+--------------------+-----+
|      scaledFeatures| Okra|
+--------------------+-----+
|[1353.04473318512...|360.2|
|[1353.71555357837...|403.8|
|[1354.38637397162...|146.3|
|[1355.05719436487...|184.8|
|[1355.72801475812...| 78.7|
|[1353.04473318512...|  0.0|
|[1353.71555357837...|  0.0|
|[1354.38637397162...|488.0|
|[1355.05719436487...|  0.0|
|[1355.72801475812...|  0.0|
+--------------------+-----+



In [None]:
%%time
%memit
model_okra = LinearRegression(featuresCol='scaledFeatures', labelCol='Okra', regParam=1.0)
model_okra = model_okra.fit(trainset)




CPU times: user 18.7 ms, sys: 0 ns, total: 18.7 ms
Wall time: 777 ms


In [None]:
print("Coefficients: " + str(model_okra.coefficients))
print("Intercept: " + str(model_okra.intercept))

Coefficients: [-366.4457737930104,-471.58753472771417,332.1222927937845,-380.3988279742436,234.17635252667722,-613.8375359840036]
Intercept: 497835.4801719575


In [None]:
trainingSummary = model_okra.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 32.757991
r2: 0.968365


In [None]:
result=model_okra.evaluate(testset)

In [None]:
result.r2

-0.8815386068369038

In [None]:
result.rootMeanSquaredError

228.0816631880606

In [None]:
predictions=model_okra.transform(pred_2022)

In [None]:
predictions.show()

+--------------------+------------------+
|      scaledFeatures|        prediction|
+--------------------+------------------+
|[0.0,1.4142135623...| 448808.7791340797|
|[0.0,0.0,0.625231...|449667.32033128286|
+--------------------+------------------+



# Onions

In [None]:
onion_df.show()

+---------+------+----+---------+-------------+--------------------+----------+--------+-----------+--------------+
|   Region|Season|Year|Onion dry|harmful_winds|extreme_temperatures|too_cloudy|wind_dir|SeasonIndex|season_numeric|
+---------+------+----+---------+-------------+--------------------+----------+--------+-----------+--------------+
|Ghor Safi|Summer|2017|    767.5|            4|                 156|        34|      98|        0.0| (1,[0],[1.0])|
|Ghor Safi|Summer|2018|   9620.6|           26|                 152|        20|     156|        0.0| (1,[0],[1.0])|
|Ghor Safi|Summer|2019|    565.7|           16|                 134|         0|     138|        0.0| (1,[0],[1.0])|
|Ghor Safi|Summer|2020|   1308.7|           19|                 115|         0|     132|        0.0| (1,[0],[1.0])|
|Ghor Safi|Summer|2021|   1515.5|           44|                 152|         0|     164|        0.0| (1,[0],[1.0])|
|Ghor Safi|Winter|2017|  12819.2|            4|                  42|    

In [None]:
onion_features=['Year','season_numeric','extreme_temperatures', 'wind_dir','harmful_winds' , 'too_cloudy']
assembler = VectorAssembler(inputCols=okra_features, outputCol='features')
vector_df = assembler.transform(onion_df)
vector_df.show()

+---------+------+----+---------+-------------+--------------------+----------+--------+-----------+--------------+--------------------+
|   Region|Season|Year|Onion dry|harmful_winds|extreme_temperatures|too_cloudy|wind_dir|SeasonIndex|season_numeric|            features|
+---------+------+----+---------+-------------+--------------------+----------+--------+-----------+--------------+--------------------+
|Ghor Safi|Summer|2017|    767.5|            4|                 156|        34|      98|        0.0| (1,[0],[1.0])|[2017.0,1.0,156.0...|
|Ghor Safi|Summer|2018|   9620.6|           26|                 152|        20|     156|        0.0| (1,[0],[1.0])|[2018.0,1.0,152.0...|
|Ghor Safi|Summer|2019|    565.7|           16|                 134|         0|     138|        0.0| (1,[0],[1.0])|[2019.0,1.0,134.0...|
|Ghor Safi|Summer|2020|   1308.7|           19|                 115|         0|     132|        0.0| (1,[0],[1.0])|[2020.0,1.0,115.0...|
|Ghor Safi|Summer|2021|   1515.5|        

In [None]:
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')
scaler_model = scaler.fit(vector_df)
scaled_onion= scaler_model.transform(vector_df)
scaled_onion.show()

+---------+------+----+---------+-------------+--------------------+----------+--------+-----------+--------------+--------------------+--------------------+
|   Region|Season|Year|Onion dry|harmful_winds|extreme_temperatures|too_cloudy|wind_dir|SeasonIndex|season_numeric|            features|      scaledFeatures|
+---------+------+----+---------+-------------+--------------------+----------+--------+-----------+--------------+--------------------+--------------------+
|Ghor Safi|Summer|2017|    767.5|            4|                 156|        34|      98|        0.0| (1,[0],[1.0])|[2017.0,1.0,156.0...|[1353.04473318512...|
|Ghor Safi|Summer|2018|   9620.6|           26|                 152|        20|     156|        0.0| (1,[0],[1.0])|[2018.0,1.0,152.0...|[1353.71555357837...|
|Ghor Safi|Summer|2019|    565.7|           16|                 134|         0|     138|        0.0| (1,[0],[1.0])|[2019.0,1.0,134.0...|[1354.38637397162...|
|Ghor Safi|Summer|2020|   1308.7|           19|     

In [None]:
final_data = scaled_onion.select( 'scaledFeatures','Onion dry')
trainset, testset = final_data.randomSplit([0.7,0.3])
final_data.show()

+--------------------+---------+
|      scaledFeatures|Onion dry|
+--------------------+---------+
|[1353.04473318512...|    767.5|
|[1353.71555357837...|   9620.6|
|[1354.38637397162...|    565.7|
|[1355.05719436487...|   1308.7|
|[1355.72801475812...|   1515.5|
|[1353.04473318512...|  12819.2|
|[1353.71555357837...|  18922.2|
|[1354.38637397162...|  13983.8|
|[1355.05719436487...|  26088.8|
|[1355.72801475812...|  15484.4|
+--------------------+---------+



In [None]:
%%time
%memit
model_onion = LinearRegression(featuresCol='scaledFeatures', labelCol='Onion dry')
model_onion = model_onion.fit(trainset)


peak memory: 204.28 MiB, increment: 0.03 MiB
CPU times: user 171 ms, sys: 15.3 ms, total: 187 ms
Wall time: 1.58 s


In [None]:
print(model_onion.coefficients)
print(model_onion.intercept)

[9469.36480344047,-14634.31348206971,15655.858117899686,14724.211796816364,-13876.339074706038,13857.951009422703]
-12876250.850399917


In [None]:
trainingSummary = model_onion.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 2268.416023
r2: 0.925042


In [None]:
result=model_onion.evaluate(testset)

In [None]:
result.r2

0.43933205730414826

In [None]:
result.rootMeanSquaredError

6796.916640940626

In [None]:
predictions=model_onion.transform(pred_2022)

In [None]:
%%time
%memit
predictions.show()

+--------------------+--------------------+
|      scaledFeatures|          prediction|
+--------------------+--------------------+
|[0.0,1.4142135623...|-1.160175309326257E7|
|[0.0,0.0,0.625231...|-1.16215968047025...|
+--------------------+--------------------+

CPU times: user 8.65 ms, sys: 971 µs, total: 9.62 ms
Wall time: 864 ms


In [None]:
spark.stop()