In [0]:
number_of_epochs=10   # Set here number of epochs you want for RNN and LSTM

feature_to_predict='T2M' # Set here abbreviation of feature you want to predict
# Details of features are described below:
	# Temperature at 2 Meters (C) (T2M)
	# Dew/Frost Point at 2 Meters (C) (T2MDEW)
	# Wet Bulb Temperature at 2 Meters (C) (T2MWET)
	# Specific Humidity at 2 Meters (g/kg) (QV2M)
	# Precipitation Corrected (mm/day) (PRECTOTCORR)
	# Surface Pressure (kPa) (PS)
	# Wind Speed at 10 Meters (m/s) (WS10M)
	# Wind Direction at 10 Meters (Degrees) (WD10M)
	# Wind Speed at 50 Meters (m/s) (WS50M)
	# Wind Direction at 50 Meters (Degrees) (WD50M)
	# Relative Humidity at 2 Meters (%) (RH2M)
	# Earth Skin Temperature (C) (TS)

batchSize=50 # Set here batch size neural networks need to process
learningRate=0.01 # Set here learning rate
hiddenLayerDimension=10 # Set here dimension of hidden layer of neural networks

In [0]:
!pip install sklearn

In [0]:
from pyspark.ml.feature import MinMaxScaler,VectorAssembler
from pyspark.sql.functions import udf,col
from pyspark.sql.types import DoubleType
import numpy as np
#from sklearn.metrics import r2_score
#from sklearn.preprocessing import MinMaxScaler
#import matplotlib.plot as plt

In [0]:
# Driver Code

# Loading the data
df=spark.read.option("header","true").option('inferSchema','true').csv('dbfs:/FileStore/tables/dataset.csv')

# Select feature to predict
df1=df.select(feature_to_predict)

# Normalizing the data
assembler=VectorAssembler().setInputCols([feature_to_predict]).setOutputCol('featurevector')
assembled=assembler.transform(df1)

scaler=MinMaxScaler().setInputCol('featurevector')
scalerModel=scaler.fit(assembled)
scalerModel.setOutputCol('featurescaled')
scaled=scalerModel.transform(assembled)

# Extracting values from dense vectors
extractor_udf=udf(lambda x:float(sum(x)),DoubleType())
transformed=scaled.withColumn('feature',extractor_udf(col('featurescaled')))

# Splitting the dataset into training and testing dataset
train, test = transformed.randomSplit([0.8,0.2],seed=500)

# Splitting training and testing data into features and label to make it comfortable for supervised learning
batchSize=50
X_train_temp=[]
y_train_temp=[]
X_test_temp=[]
y_test_temp=[]
train_collect=train.rdd.map(lambda x:x['feature']).collect()
test_collect=test.rdd.map(lambda x:x['feature']).collect()

for i,j in zip(range(batchSize,len(train_collect)),range(batchSize,len(test_collect))):
    X_train_temp.append(train_collect[i-batchSize:i])
    y_train_temp.append(train_collect[i])
    X_test_temp.append(test_collect[j-batchSize:j])
    y_test_temp.append(test_collect[j])

X_train=np.array(X_train_temp)
y_train=np.array(y_train_temp)
X_test=np.array(X_test_temp)
y_test=np.array(y_test_temp)

In [0]:
X_train[-1]

Out[5]: array([0.46887084, 0.46887084, 0.46906419, 0.46906419, 0.46925754,
       0.46925754, 0.46925754, 0.46925754, 0.46925754, 0.46925754,
       0.46925754, 0.46925754, 0.46945089, 0.46964424, 0.46964424,
       0.46964424, 0.46983759, 0.46983759, 0.46983759, 0.46983759,
       0.46983759, 0.46983759, 0.46983759, 0.46983759, 0.46983759,
       0.47003094, 0.47003094, 0.47003094, 0.47022428, 0.47022428,
       0.47022428, 0.47022428, 0.47022428, 0.47022428, 0.47022428,
       0.47041763, 0.47061098, 0.47061098, 0.47061098, 0.47061098,
       0.47061098, 0.47080433, 0.47080433, 0.47080433, 0.47080433,
       0.47080433, 0.47099768, 0.47099768, 0.47099768, 0.47119103])

In [0]:
# Initialization of RNN
rnn = RNN(X_train, y_train, hiddenLayerDimension, learningRate, batchSize)

In [0]:
# Training of RNN
sklearn_scaler=MinMaxScaler(feature_range = (0, 1))

for epoch in range(number_of_epochs):
    first_row = 0 
    total_training_error = 0
    prediction_for_training_temp=numpy.array([])
                                    
    for last_row in range(batchSize, X_train.shape[0], batchSize):
        prediction_for_training_temp=numpy.append(prediction_for_training_temp,rnn.forward(X_train[first_row : last_row])[1:])
        total_training_error  += rnn.backward(y_train[first_row : last_row], X_train[first_row : last_row])
        first_row = last_row

    prediction_for_training = (sklearn_scaler.inverse_transform(prediction_for_training_temp))[:,0]             
    gold_label_y_train = sklearn_scaler.inverse_transform(y_train)
    r2=r2_score(gold_label_y_train,prediction_for_training)
    print("R2 score for training for epoch {}:{}".format(epoch+1,r2))

In [0]:
# Testing of RNN

prediction_for_testing_temp, testing_error = rnn.predict(X_test, y_test)
prediction_for_testing = (sklearn_scaler.inverse_transform(prediction_for_testing_temp[1:]))[:,0]                                            
gold_label_y_test = sklearn_scaler.inverse_transform(y_test)
r2=r2_score(gold_label_y_test,prediction_for_testing)
print("R2 score for testing:{}".format(r2))

In [0]:
# Analysis

x_data=df.rdd.map(lambda x:str(x['YEAR'])+"-"+str(x['MO'])+"-"+str(x['DY'])).collect()
temp_list=df.rdd.map(lambda x:x[feature_to_predict]).collect()
train_data=temp_list[:train.count()]
test_data=temp_list[:test.count()]

plt.plot(x_data,train_data,label="Training Data",color="b")
plt.plot(x_data,test_data,label="Testing Data",color="g")
plt.plot(x_data,prediction_for_testing,label="Predicted Data",color="r")

plt.xlabel("Days")
plt.ylabel(feature_to_predict)
plt.title("Analysis")

plt.legend()
plt.show()

[0;31m---------------------------------------------------------------------------[0m
[0;31mTypeError[0m                                 Traceback (most recent call last)
File [0;32m<command-518356263705371>:4[0m
[1;32m      1[0m [38;5;66;03m# Analysis[39;00m
[1;32m      3[0m analysis_df_collect[38;5;241m=[39mdf[38;5;241m.[39mselect([[38;5;124m'[39m[38;5;124mYEAR[39m[38;5;124m'[39m,[38;5;124m'[39m[38;5;124mMO[39m[38;5;124m'[39m,[38;5;124m'[39m[38;5;124mDY[39m[38;5;124m'[39m,feature_to_predict])[38;5;241m.[39mrdd[38;5;241m.[39mmap([38;5;28;01mlambda[39;00m x:([38;5;28mstr[39m(x[[38;5;124m'[39m[38;5;124mYEAR[39m[38;5;124m'[39m])[38;5;241m+[39m[38;5;124m"[39m[38;5;124m-[39m[38;5;124m"[39m[38;5;241m+[39m[38;5;28mstr[39m(x[[38;5;124m'[39m[38;5;124mMO[39m[38;5;124m'[39m])[38;5;241m+[39m[38;5;124m"[39m[38;5;124m-[39m[38;5;124m"[39m[38;5;241m+[39m[38;5;28mstr[39m(x[[38;5;124m'[39m[38;5;124mDY[39m[38;5;124m'[39m]