# Spark data mining

In [None]:
# Notebook bootstrap: run this cell first in every new notebook.
# Remember to change the application name passed to appName() below.
import warnings
import findspark

# Point findspark at the local Spark installation so that pyspark can be imported.
findspark.init('/home/ubuntu/spark-3.2.1-bin-hadoop2.7')

# Silence all Python warnings for the rest of the session.
warnings.simplefilter(action='ignore')

import pyspark
from pyspark.sql import SparkSession

# Reuse an existing session if one is already running, otherwise create one.
spark = (
    SparkSession.builder
    .appName('readin')
    .getOrCreate()
)

In [None]:
# Read the city-level daily air-quality CSV using an explicit schema.
#
# Header of the raw file, for reference:
# City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
# The schema below supplies its own column names (e.g. PM25, NOX, AQIBucket).

warnings.filterwarnings('ignore')
from pyspark.sql.types import *

# City and Date are kept as strings, every measurement is a nullable float,
# and the trailing AQI category is a string.
_text_columns = ("City", "Date")
_float_columns = ("PM25", "PM10", "NO", "NO2", "NOX", "NH3", "CO",
                  "SO2", "O3", "benzene", "toluene", "Xylene", "AQI")

Schema = StructType(
    [StructField(name, StringType(), nullable=True) for name in _text_columns]
    + [StructField(name, FloatType(), nullable=True) for name in _float_columns]
    + [StructField("AQIBucket", StringType(), nullable=True)]
)

# The first line of the file is a header row and is skipped as data.
df = (
    spark.read
    .option("header",True)
    .schema(Schema)
    .csv("Datasets/city_day.csv")
)

df.show()


## Data Exploration

In [107]:
# show() prints the first rows of the DataFrame as a text table.
df.show()

# Column names and (name, type) pairs. They are printed explicitly because a
# notebook cell only renders its final expression; as bare expressions in the
# middle of the cell they produced no output.
print(df.columns)
print(df.dtypes)

# Summary statistics (count, mean, stddev, min, max), rendered as a pandas table.
df.describe().toPandas()


+---------+----------+----+----+------+-----+------+----+------+-----+------+-------+-------+------+----+---------+
|     City|      Date|PM25|PM10|    NO|  NO2|   NOX| NH3|    CO|  SO2|    O3|benzene|toluene|Xylene| AQI|AQIBucket|
+---------+----------+----+----+------+-----+------+----+------+-----+------+-------+-------+------+----+---------+
|Ahmedabad|2015-01-01|null|null|  0.92|18.22| 17.15|null|  0.92|27.64|133.36|    0.0|   0.02|   0.0|null|     null|
|Ahmedabad|2015-01-02|null|null|  0.97|15.69| 16.46|null|  0.97|24.55| 34.06|   3.68|    5.5|  3.77|null|     null|
|Ahmedabad|2015-01-03|null|null|  17.4| 19.3|  29.7|null|  17.4|29.07|  30.7|    6.8|   16.4|  2.25|null|     null|
|Ahmedabad|2015-01-04|null|null|   1.7|18.48| 17.97|null|   1.7|18.59| 36.08|   4.43|  10.14|   1.0|null|     null|
|Ahmedabad|2015-01-05|null|null|  22.1|21.42| 37.76|null|  22.1|39.33| 39.31|   7.01|  18.89|  2.78|null|     null|
|Ahmedabad|2015-01-06|null|null| 45.41|38.48|  81.5|null| 45.41|45.76| 4

                                                                                

Unnamed: 0,summary,City,Date,PM25,PM10,NO,NO2,NOX,NH3,CO,SO2,O3,benzene,toluene,Xylene,AQI,AQIBucket
0,count,29531,29531,24933.0,18391.0,25949.0,25946.0,25346.0,19203.0,27472.0,25677.0,25509.0,23908.0,21490.0,11422.0,24850.0,24850
1,mean,,,67.45057795305125,118.12710294174458,17.574729666828148,28.560659052670943,32.3091233306691,23.483476022437166,2.248598209145695,14.531977259530647,34.49143047007627,3.280840303665094,8.700972087396305,3.0701278206492613,166.4635814889336,
2,stddev,,,64.66144940408857,90.60510976469617,22.785846340154315,24.47474577146909,31.64601095123696,25.68427498752751,6.962884271836143,18.13377486834342,21.694928182946093,15.811136401685069,19.96916366625716,6.323247403424576,140.69658509414992,
3,min,Ahmedabad,2015-01-01,0.04,0.01,0.02,0.01,0.0,0.01,0.0,0.01,0.01,0.0,0.0,0.0,13.0,Good
4,max,Visakhapatnam,2020-07-01,949.99,1000.0,390.68,362.21,467.63,352.89,175.81,193.86,257.73,455.03,454.85,170.37,2049.0,Very Poor


In [None]:
# describe() computes summary statistics for the columns of df;
# show() prints the resulting table as text.
df.describe().show()

In [None]:
# printSchema() lists every column with its data type and nullability.
# The types come from the explicit schema passed to the CSV reader.
df.printSchema()

## Data Manipulation

In [None]:
# Number of rows per AQI category; rows without a category form their own null group.
# The describe().toPandas() call that used to open this cell was removed: its result
# was never rendered (only a cell's final expression is displayed), so it only cost
# an extra pass over the data.
df.groupby('AQIBucket').count().show()


In [None]:

# Number of rows recorded for each city.
df.groupby('City').count().show()

In [None]:
# Total number of rows. Printed explicitly: a bare df.count() that is not the
# last expression of the cell is evaluated but never displayed.
print(df.count())

# Preview the data. You'll notice nulls in several measurement columns.
df.show()

In [None]:
# Register the DataFrame as a temporary view so it can be queried by name from SQL.
df.createOrReplaceTempView('pollution')

# Plain SQL can now be run against the view; this query simply returns every row.
results = spark.sql("SELECT * FROM pollution")
results.show()

In [None]:
# For each city, count the rows whose AQI is missing.
results1 = spark.sql("SELECT city, count(City) FROM pollution where AQI is null group by City")
results1.show()

In [None]:
# Per column, count the values that are effectively missing: text containing
# 'None' or 'NULL', an empty string, a SQL null, or NaN.
from pyspark.sql.functions import col,isnan,when,count

missing_counts = []
for c in df.columns:
    value = col(c)
    is_missing = (
        value.contains('None')
        | value.contains('NULL')
        | (value == '' )
        | value.isNull()
        | isnan(c)
    )
    # count() only counts the rows for which when() produced a non-null value.
    missing_counts.append(count(when(is_missing, c)).alias(c))

df2 = df.select(missing_counts)
df2.show()

In [None]:
from pyspark.sql.functions import isnan, when, count, col

# Per column, count only the NaN values (SQL nulls are not NaN and are not counted here).
nan_counts = [
    count(when(isnan(name), name)).alias(name)
    for name in df.columns
]
df.select(nan_counts).show()

# Exploring data with SQL

In [None]:
import math
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore')

# Collect the Spark DataFrame to pandas ONCE. The previous version called
# df.toPandas() again for every subplot, re-collecting the whole dataset each time.
features_pdf = df.toPandas()

# pandas describe() keeps the numeric columns by default, so this is the list of
# features that can be drawn as histograms.
numeric_columns = list(features_pdf.describe().columns)

# Size the grid from the number of features instead of a fixed range(1, 11),
# which silently dropped every feature after the tenth.
n_grid_cols = 4
n_grid_rows = max(1, math.ceil(len(numeric_columns) / n_grid_cols))

# Keep the original 25x13 proportions for a three-row grid and grow with extra rows.
fig = plt.figure(figsize=(25, 13 * n_grid_rows / 3))
st = fig.suptitle("Distribution of features",fontsize = 50, verticalalignment="center")

# The loop variable is deliberately NOT called `col`: that name is also
# pyspark.sql.functions.col, and rebinding it here would break any later cell
# that calls col(...) without importing it again.
for position, feature in enumerate(numeric_columns, start=1):
    ax = fig.add_subplot(n_grid_rows, n_grid_cols, position)
    # Nulls arrive from Spark as NaN; drop them so only real readings are binned.
    ax.hist(features_pdf[feature].dropna())
    plt.grid(False)
    plt.xticks(rotation=45, fontsize=20)
    plt.yticks(fontsize=15)
    plt.title(feature.upper(), fontsize=20)

plt.tight_layout()
# Move the super-title up and leave room for it above the subplot grid.
st.set_y(0.95)
fig.subplots_adjust(top=0.85, hspace=0.4)
plt.show()

In [None]:
# `year` is imported here for the later cell that derives a year column from Date.
from pyspark.sql.functions import year 
df.createOrReplaceTempView('pollution')

# Rows per (city, AQI category), ignoring rows without a category.
# AQIBucket is now part of the SELECT list: the query groups by it, and without the
# column in the output the several rows returned per city could not be told apart.
results1 = spark.sql("SELECT city, AQIBucket, Count(AQIBucket), count(City) FROM pollution where AQIBucket is not null group by City, AQIBucket")
results1.show()


In [None]:
# na.drop(thresh=8) keeps only the rows that have at least 8 non-null values.
# The result is displayed but not assigned, so df itself is left unchanged.
df.na.drop(thresh=8).show()

In [None]:
# Row count of df (unchanged by the na.drop preview above, which was not assigned).
df.count()

In [None]:
import sys
from pyspark.sql import Window
from pyspark.sql.functions import first,last

# Two-pass imputation of PM25 within each city, ordered by Date
# (dates are 'YYYY-MM-DD' strings, so string order is chronological order).
# Frame bounds use the Window constants recommended by PySpark instead of
# +/- sys.maxsize; the frames themselves are the same.

# Pass 1 - backward fill: each row takes the first non-null PM25 found at or
# after it, i.e. a gap is filled with the NEXT available reading.
window = Window.partitionBy('City')\
               .orderBy('Date')\
               .rowsBetween(Window.currentRow, Window.unboundedFollowing)

# define the backward-filled column
filled_column = first(df['PM25'], ignorenulls=True).over(window)

# do the fill
spark_df_filled = df.withColumn('PM25', filled_column)

# inspect the result, first ordered by city/date and then in default order
spark_df_filled.orderBy('City', 'Date').show(10)
spark_df_filled.show()

# Pass 2 - forward fill: each row takes the last non-null PM25 found at or
# before it. This covers trailing gaps that had no later reading to borrow from.
window = Window.partitionBy('City')\
               .orderBy('Date')\
               .rowsBetween(Window.unboundedPreceding, Window.currentRow)

# define the forward-filled column
filled_column = last(spark_df_filled['PM25'], ignorenulls=True).over(window)

# do the fill
spark_df_filled = spark_df_filled.withColumn('PM25', filled_column)

# inspect the result
spark_df_filled.orderBy('City', 'Date').show(10)
spark_df_filled.show()


In [None]:
# Rows whose PM25 is still null after both fill passes.
spark_df_filled.filter(spark_df_filled.PM25.isNull()).show()

In [None]:
# Re-run the missing-value tally on the PM25-filled frame: per column, count text
# containing 'None' or 'NULL', empty strings, SQL nulls and NaN.
from pyspark.sql.functions import col,isnan,when,count

df2 = spark_df_filled
missing_counts = []
for c in df2.columns:
    value = col(c)
    is_missing = (
        value.contains('None')
        | value.contains('NULL')
        | (value == '' )
        | value.isNull()
        | isnan(c)
    )
    missing_counts.append(count(when(is_missing, c)).alias(c))

# df2 is rebound from the filled frame to the one-row table of counts.
df2 = df2.select(missing_counts)
df2.show()

In [None]:
import sys
from pyspark.sql import Window
from pyspark.sql.functions import last, first

# Two-pass imputation of PM10 within each city, ordered by Date.
# Frame bounds use the Window constants recommended by PySpark instead of
# +/- sys.maxsize; the frames themselves are the same.

# Pass 1 - forward fill: each row takes the last non-null PM10 found at or
# before it, i.e. a gap is filled with the PREVIOUS available reading.
window = Window.partitionBy('City')\
               .orderBy('Date')\
               .rowsBetween(Window.unboundedPreceding, Window.currentRow)

# define the forward-filled column
filled_column = last(spark_df_filled['PM10'], ignorenulls=True).over(window)

# do the fill
spark_df_filled = spark_df_filled.withColumn('PM10', filled_column)

# inspect the result, first ordered by city/date and then in default order
spark_df_filled.orderBy('City', 'Date').show(10)
spark_df_filled.show()

# Pass 2 - backward fill: each row takes the first non-null PM10 found at or
# after it. This covers leading gaps that had no earlier reading to borrow from.
window = Window.partitionBy('City')\
               .orderBy('Date')\
               .rowsBetween(Window.currentRow, Window.unboundedFollowing)

# define the backward-filled column
filled_column= first(spark_df_filled['PM10'], ignorenulls=True).over(window)

# do the fill
spark_df_filled = spark_df_filled.withColumn('PM10', filled_column)

# inspect the result
spark_df_filled.orderBy('City', 'Date').show(10)
spark_df_filled.show()

In [None]:
# Re-register the view so that 'pollution' now points at the filled DataFrame.

dff= spark_df_filled
dff.createOrReplaceTempView('pollution')

# Rows whose PM10 is still null after the two fill passes.
results = spark.sql("SELECT * FROM pollution where pm10 is null")
results.show()

In [None]:
# Same check through the DataFrame API: rows where PM10 is null.
dff.filter(dff.PM10.isNull()).show()

In [None]:
from pyspark.sql.functions import mean

# Aggregate the whole frame down to the average PM10. collect() returns a list
# holding a single Row, so the number itself is at [0][0].
mean_pm10 = dff.agg(mean(dff['PM10'])).collect()
mean_pm10[0][0]


In [None]:


# Replace the remaining null PM10 values with the overall PM10 mean,
# then list any rows where PM10 is still null.
dff1 = dff.na.fill(mean_pm10[0][0], subset=['PM10'])
dff1.filter(dff1.PM10.isNull()).show()

In [None]:
# Preview the frame after the PM10 mean fill.
dff1.show()

In [None]:

# Per column, count the values that are effectively missing (text containing 'None'
# or 'NULL', empty strings, SQL nulls, NaN) in the frame after the PM10 mean fill.
from pyspark.sql.functions import col,isnan,when,count

missing_counts = []
# Iterate over the columns of dff1, the frame actually being queried; the
# previous version listed dff.columns while selecting from dff1.
for c in dff1.columns:
    value = col(c)
    is_missing = (
        value.contains('None')
        | value.contains('NULL')
        | (value == '' )
        | value.isNull()
        | isnan(c)
    )
    missing_counts.append(count(when(is_missing, c)).alias(c))

dffs = dff1.select(missing_counts)
dffs.show()

In [None]:
# Derive a 'year' column from the Date column.
# `year` is not imported in this cell; it must already have been imported from
# pyspark.sql.functions by an earlier cell.
spark_df_filled = dff1
spark_df_filled1 = spark_df_filled.withColumn("year",year("Date"))


In [None]:
# dff2 is the frame that carries the new 'year' column; preview it.
dff2= spark_df_filled1
dff2.show()



In [None]:
# Point the 'pollution' view at dff2 and confirm that no PM10 nulls remain.
dff2.createOrReplaceTempView('pollution')

results = spark.sql("SELECT * FROM pollution where PM10 is null")
results.show()

In [None]:
# Row count of dff2. Printed explicitly: a bare count() that is not the last
# expression of the cell is evaluated but never displayed.
print(dff2.count())
dff2.show()

In [None]:
#df1.show() dff5.show()

In [None]:
# Mean-impute NO2, SO2 and O3. Each collect() returns a list holding a single
# Row, so the average itself is read with [0][0]. All three means are computed
# on dff2, before any of the fills below is applied.
mean_no2 = dff2.agg(mean(dff2['NO2'])).collect()
mean_so2 = dff2.agg(mean(dff2['SO2'])).collect()
mean_o3 = dff2.agg(mean(dff2['O3'])).collect()

# Fill one column at a time, chaining the result of each step into the next.
dff3 = dff2.na.fill(mean_no2[0][0], subset=['NO2'])
dff4 = dff3.na.fill(mean_so2[0][0], subset=['SO2'])
dff5 = dff4.na.fill(mean_o3[0][0], subset=['O3'])



In [None]:

# Missing-value tally after the NO2 / SO2 / O3 mean fill: per column, count text
# containing 'None' or 'NULL', empty strings, SQL nulls and NaN.
from pyspark.sql.functions import col,isnan,when,count

missing_counts = []
for c in dff5.columns:
    value = col(c)
    is_missing = (
        value.contains('None')
        | value.contains('NULL')
        | (value == '' )
        | value.isNull()
        | isnan(c)
    )
    missing_counts.append(count(when(is_missing, c)).alias(c))

dff6 = dff5.select(missing_counts)
dff6.show()


In [None]:

# Preview the frame after all mean fills.
dff5.show()

In [None]:
# Let's collect the average. You'll notice that the collection returns the average in an interesting format.
#mean_pm10 = df.select(mean(df['PM10'])).collect()
#mean_pm10
#mean_pm10[0][0]
#df.filter(df.PM10.isNull()).show()
#df1 = df1.na.fill(mean_pm10[0][0], subset=['PM10'])
#df1.filter(df1.PM10.isNull()).show()

In [None]:

# Start the modelling frame by removing the NO column, then preview it.
dftr=dff5.drop('NO')
dftr.show()

In [None]:


# Remove the remaining pollutant columns that are not carried into the models.
# drop() accepts several column names at once.
dftr = dftr.drop('NOX', 'NH3', 'CO', 'benzene', 'toluene', 'Xylene')

In [None]:
# Preview the trimmed frame, then tally what is still missing in it: per column,
# count text containing 'None' or 'NULL', empty strings, SQL nulls and NaN.
dftr.show()
from pyspark.sql.functions import col,isnan,when,count

missing_counts = []
for c in dftr.columns:
    value = col(c)
    is_missing = (
        value.contains('None')
        | value.contains('NULL')
        | (value == '' )
        | value.isNull()
        | isnan(c)
    )
    missing_counts.append(count(when(is_missing, c)).alias(c))

dftr1 = dftr.select(missing_counts)
dftr1.show()

In [None]:
#df1 = df1.na.fill(mean_pm10[0][0], subset=['PM10'])
#dftr2.filter(dftr.AQI.isNull()).show()

In [None]:
#Replace 0 for null on only population column 
#df2 = df2.na.fill(value=0,subset=["AQI"]).show()


In [None]:
# Preview the trimmed modelling frame.
dftr.show()

In [None]:
import sys
from pyspark.sql.window import Window
import pyspark.sql.functions as func

# Preview of a forward fill of AQIBucket within each city; the result is only
# displayed, not stored.
# The window is ordered by Date rather than by year: all rows of one year share the
# same sort key, so ordering by year left their relative order (and therefore the
# "last known" category) undefined.
fill_window = Window.partitionBy('City')\
                    .orderBy('Date')\
                    .rowsBetween(Window.unboundedPreceding, Window.currentRow)

# last(..., True) ignores nulls, so each row gets the most recent non-null category.
dftr.withColumn("AQIBucket1", func.last('AQIBucket', True).over(fill_window)).show()


In [None]:

### backward fill of AQIBucket
from pyspark.sql import Window
from pyspark.sql.functions import first

# Frame: the current row and everything after it within the same city, by date.
# Window.currentRow / Window.unboundedFollowing replace the sys.maxsize bounds, so
# this cell no longer needs `sys`, which it used without importing.
window = Window.partitionBy('City')\
               .orderBy('Date')\
               .rowsBetween(Window.currentRow, Window.unboundedFollowing)

# Backward-filled column: the first non-null category at or after each row.
filled_column = first(dftr['AQIBucket'], ignorenulls=True).over(window)

# do the fill
spark_df_filled = dftr.withColumn('AQIBucket', filled_column)

# inspect the result
spark_df_filled.orderBy('City', 'Date').show(10) 

In [None]:
# Preview the frame after the AQIBucket backward fill.
spark_df_filled.show()

In [None]:
# Point the 'pollution' view at the backward-filled frame.
spark_df_filled.createOrReplaceTempView('pollution')

# Rows whose AQIBucket is still null after the backward fill.
results2 = spark.sql("SELECT * FROM pollution where AQIBucket is null")
results2.show()

In [None]:
### forward fill of AQIBucket (complements the backward fill done above)
from pyspark.sql import Window
from pyspark.sql.functions import last

# Frame: everything up to and including the current row within the same city, by date.
# Window.unboundedPreceding / Window.currentRow replace the sys.maxsize bounds, so
# this cell no longer needs `sys`, which it used without importing.
window = Window.partitionBy('City')\
               .orderBy('Date')\
               .rowsBetween(Window.unboundedPreceding, Window.currentRow)

# Forward-filled column: the last non-null category at or before each row.
filled_column = last(spark_df_filled['AQIBucket'], ignorenulls=True).over(window)

# do the fill
spark_df_filled2 = spark_df_filled.withColumn('AQIBucket', filled_column)

# inspect the result
spark_df_filled2.orderBy('City', 'Date').show(10)

In [None]:
# Point the 'pollution' view at the frame that has been filled in both directions.
spark_df_filled2.createOrReplaceTempView('pollution')

# Rows whose AQIBucket is still null after both fills.
results3 = spark.sql("SELECT * FROM pollution where AQIBucket is null")
results3.show()

In [None]:
# Re-register the view (same frame as in the previous cell).
spark_df_filled2.createOrReplaceTempView('pollution')

# Number of rows per (AQI category, city, year).
results4 = spark.sql("SELECT AQIBucket, count(AQIBucket),city,year FROM pollution group by AQIbucket,city,year")
results4.show()

In [None]:
### List the rows whose AQI is still null; they are imputed in the next step.

spark_df_filled2.filter(spark_df_filled2.AQI.isNull()).show()

In [None]:
### fill missing AQI from rows of the same city, walking the rows in AQIBucket order
from pyspark.sql import Window
from pyspark.sql.functions import first

# Within each city, rows are ordered by AQI category and then by Date.
# Date is a tie-breaker: ordering by AQIBucket alone left the order of rows inside
# one category undefined, so the value picked by first() could change between runs.
# Window.currentRow / Window.unboundedFollowing replace the sys.maxsize bounds, so
# this cell no longer needs `sys`, which it used without importing.
window = Window.partitionBy('City')\
               .orderBy('AQIBucket', 'Date')\
               .rowsBetween(Window.currentRow, Window.unboundedFollowing)

# For every row: the first non-null AQI at or after it in that ordering.
# Rows that already have an AQI keep their own value.
# NOTE(review): a row at the end of its category borrows the AQI of a row from the
# next category in sort order - confirm this imputation rule is what is intended.
filled_column = first(spark_df_filled2['AQI'], ignorenulls=True).over(window)

# The filled values go into a NEW column, AQII; the original AQI column is kept.
spark_df_filled3 = spark_df_filled2.withColumn('AQII', filled_column)

# inspect the result
spark_df_filled3.orderBy('City', 'AQII').show(10) 

In [None]:

# Rows whose filled AQI (AQII) is still null.
spark_df_filled3.filter(spark_df_filled3.AQII.isNull()).show()

In [None]:
# Point the 'pollution' view at the frame that carries the AQII column.
spark_df_filled3.createOrReplaceTempView('pollution')

# Number of rows per (AQI category, city, year).
results4 = spark.sql("SELECT count(AQIBucket),AQIBucket,city,year FROM pollution group by AQIbucket,city,year")
results4.show()

In [None]:
# Preview the fully imputed frame.
spark_df_filled3.show()

In [None]:
## Encode the AQI category as a numeric index with StringIndexer

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# Configure the indexer first, then fit it; `indexer` ends up holding the FITTED
# model, which is what produces the AQIB_index column.
bucket_indexer = StringIndexer(inputCol="AQIBucket", outputCol="AQIB_index")
indexer = bucket_indexer.fit(spark_df_filled3)

spark_df_ind = indexer.transform(spark_df_filled3)
spark_df_ind.show()


In [None]:
# Point the 'pollution' view at the indexed frame.
spark_df_ind.createOrReplaceTempView('pollution')

# Show which numeric index each AQI category received, with row counts per city and year.
results4 = spark.sql("SELECT AQIBucket ,AQIB_index, count(AQIB_index), city,year FROM pollution group by AQIBucket, AQIB_index, city,year")
results4.show()
# Full contents of the indexed frame.
result = spark.sql("Select * from pollution")
result.show()

In [None]:
### One-hot encode the AQI category index
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# AQIB_index (numeric category index) -> AQIB_vec (one-hot vector).
encoder = OneHotEncoder(inputCol="AQIB_index", outputCol="AQIB_vec")
ohe = encoder.fit(spark_df_ind) # spark_df_ind is the frame that already carries AQIB_index
encoded = ohe.transform(spark_df_ind)
encoded.show()

In [None]:

# (column name, type) pairs of the imputed frame.
spark_df_filled3.dtypes


In [None]:
# Column groups used by the feature-engineering cells below:
# the categorical column, the numeric input features, and the numeric target.
catCols = ['AQIBucket']
numCols = ['PM25','PM10','NO2','SO2','O3']
numcolout = ['AQII']

In [None]:
# Echo the three column groups for reference.
print(numCols)
print(catCols)
print(numcolout)

In [None]:
# Preview the imputed frame before assembling features.
spark_df_filled3.show()

In [None]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

# Pack the numeric feature columns into a single vector column, SS_features,
# which is the input format the scalers below expect.
va = VectorAssembler(inputCols=numCols, outputCol="SS_features")


In [None]:
# Add the SS_features vector column to the imputed frame.
temp_train = va.transform(spark_df_filled3)

# Show two rows without truncating the vector.
temp_train.show(2, truncate=False)


In [None]:
# Look at the assembled (unscaled) feature vectors only.
temp_train.select("SS_features").show(5,truncate=False)

In [None]:
# Fit a StandardScaler on the assembled vectors and write the scaled
# vectors to a new column, Scaled.
ss=StandardScaler(inputCol="SS_features", outputCol="Scaled")
train1 = ss.fit(temp_train).transform(temp_train)
train1.select("Scaled").show(5,truncate=False)

In [None]:
# Preview the frame with both the raw and the standard-scaled vectors.
train1.show()

In [None]:
from pyspark.ml.feature import MinMaxScaler
# Alternative scaling: fit a MinMaxScaler on the same assembled vectors and
# write the result to a new column, MMScaled.
mms = MinMaxScaler(inputCol="SS_features", outputCol="MMScaled")
train = mms.fit(temp_train).transform(temp_train)
train.select("MMScaled").show(5)

In [None]:
# Show two rows of the min-max scaled frame without truncating the vectors.
train.show(2, truncate=False)

# Linear regression

In [None]:
from pyspark.ml.feature import VectorAssembler

# Column groups, restated for this section.
catCols = ['AQIBucket']
numCols = ['PM25','PM10','NO2','SO2','O3']
numcolout = ['AQII']

# The input columns are the feature column names, and the output column is what you'd like the new column to be named. 
# NOTE(review): the assembler lists its inputs by hand and leaves out PM10 even
# though numCols contains it - confirm whether that omission is intentional.
assembler = VectorAssembler(
    inputCols=["PM25","NO2","SO2","O3"],
    outputCol="features")

# Now that we've created the assembler variable, let's actually transform the data.
output = assembler.transform(spark_df_filled3)


# Using print schema, you see that the features output column has been added. 
output.printSchema()

# First row of the assembled frame.
# NOTE(review): this is a bare expression in the middle of the cell, so its value is
# computed but not displayed; only the last expression below is rendered.
output.head()

# First rows of the features column, rendered through pandas.
output.select(['features']).toPandas().head()


In [None]:
# Keep only what the regression needs: the assembled feature vector and the
# target, AQII (the filled AQI).
final_data = output.select("features",'AQII')
final_data.show()

In [None]:
# Randomised 70/30 train/test split.
# A fixed seed (the same value used for the pipeline-based regression split later in
# this notebook) makes the split - and therefore every metric derived from it -
# repeatable across runs; without it each run produced a different split.
train_data,test_data = final_data.randomSplit([0.7,0.3], seed=196)

In [None]:
# Summary statistics of the training split.
train_data.describe().show()

# Summary statistics of the test split.
test_data.describe().show()

In [None]:
# Import the LinearRegression estimator.
from pyspark.ml.regression import LinearRegression

# The label column is AQII; the features column is left at the estimator's default
# name, which matches the 'features' vector assembled above.
lr = LinearRegression(labelCol='AQII')
# Fit the model to the training split.
lrModel = lr.fit(train_data)

In [None]:
# Print the fitted coefficients (one per assembled input column) and the intercept.
print("Coefficients: {} Intercept: {}".format(lrModel.coefficients,lrModel.intercept))

In [None]:
# Evaluate the fitted model on the held-out test split; the returned summary
# object exposes the residuals and error metrics used below.
test_results = lrModel.evaluate(test_data)

In [None]:
# Residuals: the difference between the actual label and the model's prediction
# for each row of the test split.
test_results.residuals.show()

# Root mean squared error on the test split. The printed label previously read
# "RSME", a misspelling of the metric's name.
print("RMSE: {}".format(test_results.rootMeanSquaredError))

In [None]:
# R2 (coefficient of determination) on the test split.
print("R2: {}".format(test_results.r2))
# Summary statistics of the modelling data, to put the error metrics in context.
final_data.describe().show()

In [None]:
# Preview the imputed frame again before building the classification features.
spark_df_filled3.show()

In [None]:
## Encode City with StringIndexer followed by OneHotEncoder

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder


# indexer0 is the FITTED indexer model: City -> City_index.
indexer0 = StringIndexer(inputCol="City", outputCol="City_index").fit(spark_df_filled3)
spark_df = indexer0.transform(spark_df_filled3)
spark_df.show()

### One-hot encode the city index

# encoder0 is the unfitted estimator; `ohe` is its fitted model: City_index -> City_vec.
encoder0 = OneHotEncoder(inputCol="City_index", outputCol="City_vec")
ohe = encoder0.fit(spark_df) # spark_df is the frame that already carries City_index
encoded0 = ohe.transform(spark_df)
encoded0.show()





In [None]:
## Index the AQI category on top of the city-encoded frame

# indexer1 is the FITTED indexer model: AQIBucket -> AQIB_index.
indexer1 = StringIndexer(inputCol="AQIBucket", outputCol="AQIB_index").fit(encoded0)
spark_df_ind1 = indexer1.transform(encoded0)
spark_df_ind1.show()



In [None]:
### One-hot encode the AQI category index

# encoder1 is the unfitted estimator; ohe1 is its fitted model: AQIB_index -> AQIB_vec.
encoder1 = OneHotEncoder(inputCol="AQIB_index", outputCol="AQIB_vec")
ohe1 = encoder1.fit(spark_df_ind1) # spark_df_ind1 is the frame that already carries AQIB_index
encoded1 = ohe1.transform(spark_df_ind1)
encoded1.show()

In [None]:
# Assemble the one-hot city vector and the pollutant readings into one features vector.
assembler = VectorAssembler(inputCols=['City_vec','PM25',
 'PM10',
 'NO2',
 'SO2',
 'O3'],outputCol='features')


from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
# The label is AQIB_index, the numeric index of the AQI category produced by the
# StringIndexer; the classifier reads its inputs from the 'features' vector.
log_reg = LogisticRegression(featuresCol='features',labelCol='AQIB_index')


In [None]:
# Chain every step: index City and AQIBucket, one-hot encode both indices,
# assemble the feature vector, then fit the logistic regression.
# indexer0 / indexer1 are already-fitted models; encoder0 / encoder1 are fitted
# by the pipeline itself.
pipeline = Pipeline(stages=[indexer0,indexer1,
                           encoder0,encoder1,
                           assembler,log_reg ])

# Train/test split. The seed (same value as the pipeline-based regression split
# later in this notebook) makes the split repeatable across runs.
train_data1, test_data1 = spark_df_filled3.randomSplit([0.7,.3], seed=196)

# Fit the whole pipeline on the training split.
fit_model = pipeline.fit(train_data1)

# Apply the fitted pipeline to the test split; this adds a 'prediction' column.
results = fit_model.transform(test_data1)

# Evaluate with a MULTICLASS evaluator. AQIB_index is the index of the AQI category,
# not a 0/1 label, and 'prediction' holds the predicted class rather than a raw
# score, so the binary area-under-ROC evaluator used before did not measure anything
# meaningful here. my_eval.evaluate(...) now returns the accuracy: the share of rows
# whose predicted class equals the label.
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

my_eval = MulticlassClassificationEvaluator(predictionCol='prediction',
                                            labelCol='AQIB_index',
                                            metricName='accuracy')

# Actual vs. predicted class for the first rows of the test split.
results.select('AQIB_index','prediction').show()



In [None]:
# Score the logistic-regression predictions with the evaluator configured in the
# previous cell.
# NOTE(review): the variable is called AUC, but the number is whichever metric
# my_eval is configured to compute, and the label (AQIB_index) is a category index
# rather than a 0/1 flag - confirm the metric before reporting this value as AUC.
AUC = my_eval.evaluate(results)

AUC

In [None]:
# Preview the standard-scaled frame that is plotted in the next cell.
train1.show()

In [None]:
# Import the relevant Python libraries.
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Convert sex to an array using Numpy and plot it using pyplot. 
sexArr = np.array(train1.select('AQIBucket').collect())
plt.hist(sexArr)
plt.show()


In [None]:
# Not all interpretation has to be visualisations: here we count how many
# test-set predictions of the logistic-regression pipeline match the true label.
totalResults = results.select('AQIB_index','prediction')

# Rows where the predicted class equals the actual class index.
correctResults = totalResults.filter(totalResults['AQIB_INDEX'] == totalResults['prediction'])

# countTR is the number of ALL scored rows; it was previously printed under the
# label "Correct", which contradicted the "Total Correct" line that follows.
countTR = totalResults.count()
print("Total: " + str(countTR))

countTC = correctResults.count()
print("Total Correct: " + str(countTC)) 

# Share of correct predictions; skipped when the test split is empty.
if countTR:
    print("Accuracy: %.3f" % (countTC / countTR))

In [None]:
# Preview the imputed frame before the pipeline-based regression.
spark_df_filled3.show()

In [None]:
# Prepare the data: keep the five pollutant features and rename the target
# AQII to 'label', the default label column name of the estimator used below.
# `col` must still refer to pyspark.sql.functions.col when this cell runs.
features = ["PM25","PM10","NO2","SO2","O3"]
lr_data = spark_df_filled3.select(col("AQII").alias("label"), *features)
lr_data.printSchema()

# Split the dataset into training (70%) and test (30%) with a fixed seed,
# so the split is repeatable.
(training, test) = lr_data.randomSplit([.7, .3], seed = 196)



In [None]:
# Pipeline stages: assemble the raw features into a vector, standard-scale that
# vector into 'features', then fit a regularised linear regression on it.
# VectorAssembler, StandardScaler, LinearRegression and Pipeline are not imported
# in this cell; they must already have been imported by earlier cells.
vectorAssembler = VectorAssembler(inputCols=features, outputCol="unscaled_features")
standardScaler = StandardScaler(inputCol="unscaled_features", outputCol="features")
lr = LinearRegression(maxIter=10, regParam=.01)

estimators = [vectorAssembler, standardScaler, lr]


# Build the pipeline from the stages and fit it on the training split.
# https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.Pipeline
pipeline = Pipeline(stages=estimators)

model = pipeline.fit(training)


In [None]:
# Apply the fitted pipeline to the test split; this adds a 'prediction' column.
prediction = model.transform(test)

prediction.show()



In [None]:
# Evaluate the pipeline regression on the test-set predictions.
from pyspark.ml.evaluation import RegressionEvaluator

# One evaluator is reused for all metrics: its own metric is RMSE, and the other
# metrics are requested per call by overriding metricName.
# (The variable name `eval` shadows Python's built-in eval() from here on.)
eval = RegressionEvaluator(metricName="rmse", predictionCol="prediction", labelCol="label")

rmse = eval.evaluate(prediction)                           # root mean squared error
mse = eval.evaluate(prediction, {eval.metricName: "mse"})  # mean squared error
mae = eval.evaluate(prediction, {eval.metricName: "mae"})  # mean absolute error
r2 = eval.evaluate(prediction, {eval.metricName: "r2"})    # coefficient of determination

print("RMSE: %.3f" % rmse)
print("MSE: %.3f" % mse)
print("MAE: %.3f" % mae)
print("r2: %.3f" %r2)

In [None]:
# Preview the imputed frame before the tree-based classifiers.
spark_df_filled3.show()

In [None]:
# Feature vector for the tree-based classifiers: the five pollutants plus AQII.
# NOTE(review): AQII (the filled AQI) is used as a feature while the label below is
# derived from AQIBucket. If the bucket is a direct function of the AQI value this
# leaks the target into the features - confirm this is intended.
assembler = VectorAssembler(inputCols=['PM25',
 'PM10',
 'NO2',
 'SO2',
 'O3','AQII'],outputCol="features")

# Add the features vector to the imputed frame.
output = assembler.transform(spark_df_filled3)

# Index the AQI category as the numeric label AQIB_index.
# `indexer` is rebound here to a new, unfitted StringIndexer.
indexer = StringIndexer(inputCol="AQIBucket", outputCol="AQIB_index")
output_fixed = indexer.fit(output).transform(output)

In [None]:
# Preview the frame with the features vector and the AQIB_index label.
output_fixed.show()

In [None]:
### Tree-based classification of the AQI category (label: AQIB_index)

# Keep only the two columns the classifiers need: the features vector and the label.
final_data = output_fixed.select('features','AQIB_index')

# Split into training and test sets. The seed (same value as the pipeline-based
# regression split earlier in this notebook) makes the split, and so the model
# comparison below, repeatable across runs; previously every run drew a new split.
train_data,test_data = final_data.randomSplit([0.7,0.3], seed=196)

# Import the relevant classifiers.
from pyspark.ml.classification import DecisionTreeClassifier,GBTClassifier,RandomForestClassifier
from pyspark.ml import Pipeline

# Use default hyper-parameters to keep the comparison "fair" and simple.
dtc = DecisionTreeClassifier(labelCol='AQIB_index',featuresCol='features')
rfc = RandomForestClassifier(labelCol='AQIB_index',featuresCol='features')
# The GBT classifier is configured but deliberately not trained below
# (its fit/transform lines are commented out).
gbt = GBTClassifier(labelCol='AQIB_index',featuresCol='features')

# Train the models (this might take some time).
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
#gbt_model = gbt.fit(train_data)

# Score the test split with each trained model.
dtc_predictions = dtc_model.transform(test_data)
rfc_predictions = rfc_model.transform(test_data)
#gbt_predictions = gbt_model.transform(test_data)

In [None]:
# Evaluate both tree models with a MULTICLASS evaluator.
# AQIB_index is the index of the AQI category, not a 0/1 label, and 'prediction'
# holds the predicted class rather than a raw score, so the binary area-under-ROC
# evaluator used before did not measure anything meaningful here.
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# The variable keeps its original name so later cells that reference it still work;
# it now reports accuracy (the share of rows whose predicted class equals the label).
my_binary_eval = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                   labelCol='AQIB_index',
                                                   metricName='accuracy')

# Accuracy of the decision tree on the test split.
print("DTC")
print(my_binary_eval.evaluate(dtc_predictions))

# Accuracy of the random forest on the test split.
print("RFC")
print(my_binary_eval.evaluate(rfc_predictions))


# K-means unsupervised learning

In [None]:
import pandas as pd
# NOTE(review): pyrasterframes is a third-party raster package; nothing in the
# visible part of this notebook works with raster tiles - confirm it is needed.
from pyrasterframes import TileExploder
from pyrasterframes.rasterfunctions import *

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml import Pipeline



# NOTE(review): `exploder` is created but not used in the visible part of the notebook.
exploder = TileExploder()
# Feature vector for clustering: the five pollutants plus the filled AQI (AQII).
assembler = VectorAssembler(inputCols=['PM25',
 'PM10',
 'NO2',
 'SO2',
 'O3','AQII'],outputCol="features")

# K-means estimator configured for 5 clusters, reading from the 'features' column.
# It is only configured here; no fit is performed in this cell.
kmeans = KMeans().setK(5).setFeaturesCol('features')

