# Spark DataFrame - Basics

Let's start off with the fundamentals of Spark DataFrame. 

Objective: In this exercise, you'll find out how to start a Spark session, read in data, explore the data and manipulate the data (using DataFrame syntax as well as SQL syntax). Let's get started! 

In [None]:
# Must be included at the beginning of each new notebook. Remember to change the app name.
# findspark.init() is pointed at the local Spark install and is called before pyspark is imported.
import findspark
findspark.init('/home/ubuntu/spark-3.2.1-bin-hadoop2.7')
import warnings
# Suppress Python warnings in the notebook output.
warnings.simplefilter(action='ignore')
import pyspark
from pyspark.sql import SparkSession
# Get the active session if one exists, otherwise create one with the app name 'readin'.
spark = SparkSession.builder.appName('readin').getOrCreate()

In [None]:
# Load the air-quality CSV with an explicit schema instead of inferring one.
#
# Header of the file, for reference:
# City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
# Some schema names below differ from that header (e.g. PM2.5 -> PM25,
# AQI_Bucket -> AQIBucket).

warnings.filterwarnings('ignore')
from pyspark.sql.types import *

# Columns read as text; every other column is read as a nullable float.
string_columns = ("City", "Date", "AQIBucket")
column_names = [
    "City", "Date",
    "PM25", "PM10", "NO", "NO2", "NOX", "NH3", "CO", "SO2", "O3",
    "benzene", "toluene", "Xylene",
    "AQI", "AQIBucket",
]
Schema = StructType([
    StructField(
        name,
        StringType() if name in string_columns else FloatType(),
        nullable=True,
    )
    for name in column_names
])

# The first line of the file is a header row, not data.
reader = spark.read.option("header", True).schema(Schema)
df = reader.csv("Datasets/city_day.csv")

df.show()


## Data Exploration

In [None]:
# The show method prints the first rows of the DataFrame as a table, one column per schema field.
df.show()

# Column names as a Python list.
df.columns

# (column name, type name) pairs.
df.dtypes

# Summary statistics, converted to pandas for notebook display.
df.describe().toPandas()


In [None]:
# describe() computes general statistics on the data. It returns a DataFrame,
# so remember to call show() to print it.
df.describe().show()

In [None]:
# For column types, use printSchema(): it prints each column's name, data type
# and nullability as declared in the schema used to read the CSV.
df.printSchema()

## Data Manipulation

In [None]:
# Summary statistics, converted to pandas for notebook display.
df.describe().toPandas()

# Number of rows in each AQI bucket.
df.groupby('AQIBucket').count().show()


In [None]:

# Number of rows recorded for each city.
df.groupby('City').count().show()

In [None]:
# Total number of rows in the DataFrame.
df.count()
# Let's see the data. You'll notice nulls.
df.show()

In [None]:
# First, we have to register the DataFrame as a SQL temporary view named 'pollution'.
df.createOrReplaceTempView('pollution')

# After that, we can query the view with SQL; spark.sql() returns a DataFrame.
results = spark.sql("SELECT * FROM pollution")
results.show()

In [None]:
# Per city, count the rows whose AQI value is missing.
results1 = spark.sql("SELECT city, count(City) FROM pollution where AQI is null group by City")
results1.show()

# Exploring data with SQL

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore')

# Collect the Spark DataFrame to pandas ONCE. Calling df.toPandas() inside the
# loop would pull the whole dataset to the driver again for every subplot.
pdf = df.toPandas()
# describe() on the pandas frame keeps only the numeric columns.
numeric_columns = pdf.describe().columns

fig = plt.figure(figsize=(25,13))
st = fig.suptitle("Distribution of features",fontsize = 50, verticalalignment="center")
# NOTE(review): zip() with range(1, 11) stops after the first 10 numeric
# columns, although the 3x4 grid has 12 slots — confirm the limit is intended.
for num, column in zip(range(1, 11), numeric_columns):
    ax = fig.add_subplot(3,4, num)
    ax.hist(pdf[column])
    # The pyplot calls below act on the subplot that was just added.
    plt.grid(False)
    plt.xticks(rotation=45, fontsize=20)
    plt.yticks(fontsize=15)
    plt.title(column.upper(), fontsize=20)

plt.tight_layout()
# Move the figure title up and leave room for it above the subplots.
st.set_y(0.95)
fig.subplots_adjust(top=0.85, hspace=0.4)
plt.show()

In [None]:
# year() is imported here and used by a later cell to derive a year column.
from pyspark.sql.functions import year 
df.createOrReplaceTempView('pollution')

# Per city and AQI bucket, count the rows that have a bucket assigned.
# AQIBucket is part of the GROUP BY, so it is also selected; otherwise each
# city would appear on several rows with no way to tell the buckets apart.
results1 = spark.sql("SELECT city, AQIBucket, Count(AQIBucket), count(City) FROM pollution where AQIBucket is not null group by City, AQIBucket")
results1.show()


In [None]:
# Requires a certain amount of non-null values: rows with fewer than 8 non-null
# values are dropped from the displayed result (df itself is not reassigned).
df.na.drop(thresh=8).show()

In [None]:
# Also, it's good practice to use your average to fill missing data. 
from pyspark.sql.functions import mean

# Let's collect the average. collect() returns a list of Row objects, so the
# scalar value sits at [0][0].
mean_pm25 = df.select(mean(df['PM25'])).collect()
#mean_pm25
mean_pm25[0][0]

# The fill is left commented out in this cell; fill() needs the scalar
# (mean_pm25[0][0]), not the collected list.
#df.na.fill(mean_pm25, subset=['PM25']).show()
#df.na.fill(mean_pm25[0][0], subset=['PM25']).show()

In [None]:
# First, we have to register the DataFrame as a SQL temporary view.
df.createOrReplaceTempView('pollution')

# List the rows where PM25 is missing.
results = spark.sql("SELECT * FROM pollution where pm25 is null")
results.show()

In [None]:
# Rows with missing PM25 before the fill.
df.filter(df.PM25.isNull()).show()
# Replace null PM25 values with the column mean collected earlier.
df = df.na.fill(mean_pm25[0][0], subset=['PM25'])
# Same filter after the fill, to check that no null PM25 rows remain.
df.filter(df.PM25.isNull()).show()

In [None]:
# Rows where PM25 is still null.
df.filter(df.PM25.isNull()).show()
from pyspark.sql.functions import isnan, when, count, col

# For every column, count the values for which isnan() is true.
# NOTE(review): isnan() tests for NaN, which is distinct from null — confirm
# whether null values were meant to be counted here as well.
df.select([count(when(isnan(c), c)).alias(c) for c in df.columns]).show()

In [None]:
# Add a 'year' column extracted from the Date column.
df = df.withColumn("year",year("Date"))


In [None]:
# Preview df.
df.show()

In [None]:
# Continue under a second name; the cells below reassign df1 while df is kept as is.
df1=df

In [None]:
# Preview df1.
df1.show()

In [None]:
# Drop the date column from df1.
# NOTE(review): the schema declares the column as 'Date'; this lower-case name
# relies on case-insensitive column matching — confirm.
df1=df1.drop('date')

In [None]:
# Preview df1 after the drop.
df1.show()

In [None]:
# Let's collect the average of PM10. collect() returns a list of Row objects,
# so the scalar value sits at [0][0].
mean_pm10 = df.select(mean(df['PM10'])).collect()
#mean_pm10
mean_pm10[0][0]
# Rows of df with missing PM10 (df is only displayed here, not modified).
df.filter(df.PM10.isNull()).show()
# Fill the PM10 nulls in df1 with the mean, then check that none remain.
df1 = df1.na.fill(mean_pm10[0][0], subset=['PM10'])
df1.filter(df1.PM10.isNull()).show()

In [None]:
# Rows of df1 where PM10 is null.
df1.filter(df1.PM10.isNull()).show()

In [None]:
# Preview df1.
df1.show()

In [None]:
# NOTE(review): this cell contains the same code as the earlier PM10 fill cell — likely redundant.
# Collect the average of PM10; the scalar value sits at [0][0] of the collected rows.
mean_pm10 = df.select(mean(df['PM10'])).collect()
#mean_pm10
mean_pm10[0][0]
# Rows of df with missing PM10 (df is only displayed here, not modified).
df.filter(df.PM10.isNull()).show()
# Fill the PM10 nulls in df1 with the mean, then check that none remain.
df1 = df1.na.fill(mean_pm10[0][0], subset=['PM10'])
df1.filter(df1.PM10.isNull()).show()


In [None]:

# Remove the NO column from df1 and preview the result.
df1=df1.drop('NO')
df1.show()

In [None]:


# Remove six more pollutant columns from df1 in a single drop() call.
df1 = df1.drop('NOX', 'NH3', 'CO', 'benzene', 'toluene', 'Xylene')

In [None]:
# Preview df1 with the remaining columns.
df1.show()

In [None]:
# Replace nulls with 0 in the AQI column only; the result is kept as df2.
df2 = df1.na.fill(value=0,subset=["AQI"])

In [None]:
#df1 = df1.na.fill(mean_pm10[0][0], subset=['PM10'])
# Rows of df2 where AQI is null.
df2.filter(df2.AQI.isNull()).show()

In [None]:
#Replace null with 0 in the AQI column only
#df2 = df2.na.fill(value=0,subset=["AQI"]).show()


In [None]:
# Preview df (the DataFrame that still has all columns).
df.show()

In [None]:
# sys stays imported: later cells use sys.maxsize for their window bounds.
import sys
from pyspark.sql.window import Window
import pyspark.sql.functions as func

# Preview a forward fill of AQIBucket: for each row, take the last non-null
# AQIBucket seen so far within the same city and show it as AQIBucket1.
# The frame uses Spark's named boundaries (Window.unboundedPreceding /
# Window.currentRow) rather than the magic values -sys.maxsize / 0.
# NOTE(review): rows are ordered by 'year' only, so the order of rows that
# share a city and year is not fixed — confirm this is acceptable.
forward_window = (
    Window.partitionBy('City')
    .orderBy('year')
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
df2.withColumn("AQIBucket1", func.last('AQIBucket', ignorenulls=True).over(forward_window)).show()


In [None]:

### Backward fill: replace each null AQIBucket with the next non-null value
### that follows it (by Date) within the same city.
from pyspark.sql import Window
from pyspark.sql.functions import first

# Frame: from the current row to the end of the city's partition, in Date order.
# Spark's named boundaries are used instead of 0 / sys.maxsize, so this cell
# does not depend on `sys` having been imported by another cell.
window = (
    Window.partitionBy('City')
    .orderBy('Date')
    .rowsBetween(Window.currentRow, Window.unboundedFollowing)
)

# First non-null AQIBucket at or after the current row: the backward-filled column.
filled_column = first(df['AQIBucket'], ignorenulls=True).over(window)

# Overwrite AQIBucket with the filled values.
spark_df_filled = df.withColumn('AQIBucket', filled_column)

# Show the first 10 rows in City/Date order.
spark_df_filled.orderBy('City', 'Date').show(10)

In [None]:
# Preview the backward-filled DataFrame.
spark_df_filled.show()

In [None]:
# Register the backward-filled DataFrame under the view name 'pollution' (replacing any earlier view).
spark_df_filled.createOrReplaceTempView('pollution')

# Rows where AQIBucket is still null after the fill.
results2 = spark.sql("SELECT * FROM pollution where AQIBucket is null")
results2.show()

In [111]:
### Forward fill: replace each remaining null AQIBucket with the last non-null
### value that precedes it (by Date) within the same city.
from pyspark.sql import Window
from pyspark.sql.functions import last

# Frame: from the start of the city's partition up to the current row, in Date
# order. Spark's named boundaries are used instead of -sys.maxsize / 0, so this
# cell does not depend on `sys` having been imported by another cell.
window = (
    Window.partitionBy('City')
    .orderBy('Date')
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Last non-null AQIBucket at or before the current row: the forward-filled column.
filled_column = last(spark_df_filled['AQIBucket'], ignorenulls=True).over(window)

# Overwrite AQIBucket with the filled values.
spark_df_filled2 = spark_df_filled.withColumn('AQIBucket', filled_column)

# Show the first 10 rows in City/Date order.
spark_df_filled2.orderBy('City', 'Date').show(10)

22/10/09 11:00:25 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: City, Date, PM2.5, PM10, NO, NO2, NOx, NH3, CO, SO2, O3, Benzene, Toluene, Xylene, AQI, AQI_Bucket
 Schema: City, Date, PM25, PM10, NO, NO2, NOX, NH3, CO, SO2, O3, benzene, toluene, Xylene, AQI, AQIBucket
Expected: PM25 but found: PM2.5
CSV file: file:///home/ubuntu/722/aws-722-copy/Datasets/city_day.csv
[Stage 221:>                                                        (0 + 1) / 1]

+---------+----------+--------+----+------+-----+------+----+------+-----+------+-------+-------+------+----+---------+----+
|     City|      Date|    PM25|PM10|    NO|  NO2|   NOX| NH3|    CO|  SO2|    O3|benzene|toluene|Xylene| AQI|AQIBucket|year|
+---------+----------+--------+----+------+-----+------+----+------+-----+------+-------+-------+------+----+---------+----+
|Ahmedabad|2015-01-01|67.45058|null|  0.92|18.22| 17.15|null|  0.92|27.64|133.36|    0.0|   0.02|   0.0|null|     Poor|2015|
|Ahmedabad|2015-01-02|67.45058|null|  0.97|15.69| 16.46|null|  0.97|24.55| 34.06|   3.68|    5.5|  3.77|null|     Poor|2015|
|Ahmedabad|2015-01-03|67.45058|null|  17.4| 19.3|  29.7|null|  17.4|29.07|  30.7|    6.8|   16.4|  2.25|null|     Poor|2015|
|Ahmedabad|2015-01-04|67.45058|null|   1.7|18.48| 17.97|null|   1.7|18.59| 36.08|   4.43|  10.14|   1.0|null|     Poor|2015|
|Ahmedabad|2015-01-05|67.45058|null|  22.1|21.42| 37.76|null|  22.1|39.33| 39.31|   7.01|  18.89|  2.78|null|     Poor|2015|


                                                                                

In [112]:
# Register the twice-filled DataFrame under the view name 'pollution' (replacing any earlier view).
spark_df_filled2.createOrReplaceTempView('pollution')

# Rows where AQIBucket is still null; the recorded output below is an empty table.
results3 = spark.sql("SELECT * FROM pollution where AQIBucket is null")
results3.show()

22/10/09 11:00:36 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: City, Date, PM2.5, PM10, NO, NO2, NOx, NH3, CO, SO2, O3, Benzene, Toluene, Xylene, AQI, AQI_Bucket
 Schema: City, Date, PM25, PM10, NO, NO2, NOX, NH3, CO, SO2, O3, benzene, toluene, Xylene, AQI, AQIBucket
Expected: PM25 but found: PM2.5
CSV file: file:///home/ubuntu/722/aws-722-copy/Datasets/city_day.csv
[Stage 224:>                                                        (0 + 1) / 1]

+----+----+----+----+---+---+---+---+---+---+---+-------+-------+------+---+---------+----+
|City|Date|PM25|PM10| NO|NO2|NOX|NH3| CO|SO2| O3|benzene|toluene|Xylene|AQI|AQIBucket|year|
+----+----+----+----+---+---+---+---+---+---+---+-------+-------+------+---+---------+----+
+----+----+----+----+---+---+---+---+---+---+---+-------+-------+------+---+---------+----+



                                                                                

In [None]:
# Register the filled DataFrame under the view name 'pollution' (replacing any earlier view).
spark_df_filled2.createOrReplaceTempView('pollution')

# Number of rows per AQI bucket, city and year.
results4 = spark.sql("SELECT AQIBucket, count(AQIBucket),city,year FROM pollution group by AQIbucket,city,year")
results4.show()

In [None]:
# Register the filled DataFrame under the view name 'pollution' (replacing any earlier view).
spark_df_filled2.createOrReplaceTempView('pollution')

# Number of rows per city and year whose AQIBucket is still null.
# count(*) is required here: count(AQIBucket) skips nulls, so under the
# "AQIBucket is null" filter it would always report 0.
results4 = spark.sql("SELECT AQIBucket, count(*),city,year FROM pollution where AQIBucket is null group by AQIbucket,city,year")
results4.show()

In [113]:
# Preview the DataFrame after both fills.
spark_df_filled2.show()

22/10/09 11:00:48 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: City, Date, PM2.5, PM10, NO, NO2, NOx, NH3, CO, SO2, O3, Benzene, Toluene, Xylene, AQI, AQI_Bucket
 Schema: City, Date, PM25, PM10, NO, NO2, NOX, NH3, CO, SO2, O3, benzene, toluene, Xylene, AQI, AQIBucket
Expected: PM25 but found: PM2.5
CSV file: file:///home/ubuntu/722/aws-722-copy/Datasets/city_day.csv


+---------+----------+--------+----+------+-----+------+----+------+-----+------+-------+-------+------+----+---------+----+
|     City|      Date|    PM25|PM10|    NO|  NO2|   NOX| NH3|    CO|  SO2|    O3|benzene|toluene|Xylene| AQI|AQIBucket|year|
+---------+----------+--------+----+------+-----+------+----+------+-----+------+-------+-------+------+----+---------+----+
|Ahmedabad|2015-01-01|67.45058|null|  0.92|18.22| 17.15|null|  0.92|27.64|133.36|    0.0|   0.02|   0.0|null|     Poor|2015|
|Ahmedabad|2015-01-02|67.45058|null|  0.97|15.69| 16.46|null|  0.97|24.55| 34.06|   3.68|    5.5|  3.77|null|     Poor|2015|
|Ahmedabad|2015-01-03|67.45058|null|  17.4| 19.3|  29.7|null|  17.4|29.07|  30.7|    6.8|   16.4|  2.25|null|     Poor|2015|
|Ahmedabad|2015-01-04|67.45058|null|   1.7|18.48| 17.97|null|   1.7|18.59| 36.08|   4.43|  10.14|   1.0|null|     Poor|2015|
|Ahmedabad|2015-01-05|67.45058|null|  22.1|21.42| 37.76|null|  22.1|39.33| 39.31|   7.01|  18.89|  2.78|null|     Poor|2015|


In [114]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder


# Fit a StringIndexer on AQIBucket, then add the numeric label index as the AQIB_index column.
indexer = StringIndexer(inputCol="AQIBucket", outputCol="AQIB_index").fit(spark_df_filled2)
spark_df_ind = indexer.transform(spark_df_filled2)
spark_df_ind.show()


#df3 = encoded


22/10/09 11:00:53 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: City, Date, AQI_Bucket
 Schema: City, Date, AQIBucket
Expected: AQIBucket but found: AQI_Bucket
CSV file: file:///home/ubuntu/722/aws-722-copy/Datasets/city_day.csv
22/10/09 11:00:55 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: City, Date, PM2.5, PM10, NO, NO2, NOx, NH3, CO, SO2, O3, Benzene, Toluene, Xylene, AQI, AQI_Bucket
 Schema: City, Date, PM25, PM10, NO, NO2, NOX, NH3, CO, SO2, O3, benzene, toluene, Xylene, AQI, AQIBucket
Expected: PM25 but found: PM2.5
CSV file: file:///home/ubuntu/722/aws-722-copy/Datasets/city_day.csv


+---------+----------+--------+----+------+-----+------+----+------+-----+------+-------+-------+------+----+---------+----+----------+
|     City|      Date|    PM25|PM10|    NO|  NO2|   NOX| NH3|    CO|  SO2|    O3|benzene|toluene|Xylene| AQI|AQIBucket|year|AQIB_index|
+---------+----------+--------+----+------+-----+------+----+------+-----+------+-------+-------+------+----+---------+----+----------+
|Ahmedabad|2015-01-01|67.45058|null|  0.92|18.22| 17.15|null|  0.92|27.64|133.36|    0.0|   0.02|   0.0|null|     Poor|2015|       2.0|
|Ahmedabad|2015-01-02|67.45058|null|  0.97|15.69| 16.46|null|  0.97|24.55| 34.06|   3.68|    5.5|  3.77|null|     Poor|2015|       2.0|
|Ahmedabad|2015-01-03|67.45058|null|  17.4| 19.3|  29.7|null|  17.4|29.07|  30.7|    6.8|   16.4|  2.25|null|     Poor|2015|       2.0|
|Ahmedabad|2015-01-04|67.45058|null|   1.7|18.48| 17.97|null|   1.7|18.59| 36.08|   4.43|  10.14|   1.0|null|     Poor|2015|       2.0|
|Ahmedabad|2015-01-05|67.45058|null|  22.1|21.42

In [None]:
# Register the indexed DataFrame under the view name 'pollution' (replacing any earlier view).
spark_df_ind.createOrReplaceTempView('pollution')

# Number of rows per bucket/index pair, city and year; shows which index each bucket received.
results4 = spark.sql("SELECT AQIBucket ,AQIB_index, count(AQIB_index), city,year FROM pollution group by AQIBucket, AQIB_index, city,year")
results4.show()

In [115]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# One-hot encode the AQIB_index column into the vector column AQIB_vec.
encoder = OneHotEncoder(inputCol="AQIB_index", outputCol="AQIB_vec")
ohe = encoder.fit(spark_df_ind) # fit the encoder on the indexed DataFrame
encoded = ohe.transform(spark_df_ind)
encoded.show()




22/10/09 11:01:02 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: City, Date, PM2.5, PM10, NO, NO2, NOx, NH3, CO, SO2, O3, Benzene, Toluene, Xylene, AQI, AQI_Bucket
 Schema: City, Date, PM25, PM10, NO, NO2, NOX, NH3, CO, SO2, O3, benzene, toluene, Xylene, AQI, AQIBucket
Expected: PM25 but found: PM2.5
CSV file: file:///home/ubuntu/722/aws-722-copy/Datasets/city_day.csv


+---------+----------+--------+----+------+-----+------+----+------+-----+------+-------+-------+------+----+---------+----+----------+-------------+
|     City|      Date|    PM25|PM10|    NO|  NO2|   NOX| NH3|    CO|  SO2|    O3|benzene|toluene|Xylene| AQI|AQIBucket|year|AQIB_index|     AQIB_vec|
+---------+----------+--------+----+------+-----+------+----+------+-----+------+-------+-------+------+----+---------+----+----------+-------------+
|Ahmedabad|2015-01-01|67.45058|null|  0.92|18.22| 17.15|null|  0.92|27.64|133.36|    0.0|   0.02|   0.0|null|     Poor|2015|       2.0|(5,[2],[1.0])|
|Ahmedabad|2015-01-02|67.45058|null|  0.97|15.69| 16.46|null|  0.97|24.55| 34.06|   3.68|    5.5|  3.77|null|     Poor|2015|       2.0|(5,[2],[1.0])|
|Ahmedabad|2015-01-03|67.45058|null|  17.4| 19.3|  29.7|null|  17.4|29.07|  30.7|    6.8|   16.4|  2.25|null|     Poor|2015|       2.0|(5,[2],[1.0])|
|Ahmedabad|2015-01-04|67.45058|null|   1.7|18.48| 17.97|null|   1.7|18.59| 36.08|   4.43|  10.14|   

In [116]:
# Convert the encoded DataFrame to pandas for notebook display.
encoded.toPandas()

22/10/09 11:01:08 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: City, Date, PM2.5, PM10, NO, NO2, NOx, NH3, CO, SO2, O3, Benzene, Toluene, Xylene, AQI, AQI_Bucket
 Schema: City, Date, PM25, PM10, NO, NO2, NOX, NH3, CO, SO2, O3, benzene, toluene, Xylene, AQI, AQIBucket
Expected: PM25 but found: PM2.5
CSV file: file:///home/ubuntu/722/aws-722-copy/Datasets/city_day.csv
                                                                                

Unnamed: 0,City,Date,PM25,PM10,NO,NO2,NOX,NH3,CO,SO2,O3,benzene,toluene,Xylene,AQI,AQIBucket,year,AQIB_index,AQIB_vec
0,Ahmedabad,2015-01-01,67.450577,,0.92,18.219999,17.150000,,0.92,27.639999,133.360001,0.00,0.020000,0.00,,Poor,2015,2.0,"(0.0, 0.0, 1.0, 0.0, 0.0)"
1,Ahmedabad,2015-01-02,67.450577,,0.97,15.690000,16.459999,,0.97,24.549999,34.060001,3.68,5.500000,3.77,,Poor,2015,2.0,"(0.0, 0.0, 1.0, 0.0, 0.0)"
2,Ahmedabad,2015-01-03,67.450577,,17.40,19.299999,29.700001,,17.40,29.070000,30.700001,6.80,16.400000,2.25,,Poor,2015,2.0,"(0.0, 0.0, 1.0, 0.0, 0.0)"
3,Ahmedabad,2015-01-04,67.450577,,1.70,18.480000,17.969999,,1.70,18.590000,36.080002,4.43,10.140000,1.00,,Poor,2015,2.0,"(0.0, 0.0, 1.0, 0.0, 0.0)"
4,Ahmedabad,2015-01-05,67.450577,,22.10,21.420000,37.759998,,22.10,39.330002,39.310001,7.01,18.889999,2.78,,Poor,2015,2.0,"(0.0, 0.0, 1.0, 0.0, 0.0)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29526,Visakhapatnam,2020-06-27,15.020000,50.939999,7.68,25.059999,19.540001,12.47,0.47,8.550000,23.299999,2.24,12.070000,0.73,41.0,Good,2020,4.0,"(0.0, 0.0, 0.0, 0.0, 1.0)"
29527,Visakhapatnam,2020-06-28,24.379999,74.089996,3.42,26.059999,16.530001,11.99,0.52,12.720000,30.139999,0.74,2.210000,0.38,70.0,Satisfactory,2020,1.0,"(0.0, 1.0, 0.0, 0.0, 0.0)"
29528,Visakhapatnam,2020-06-29,22.910000,65.730003,3.45,29.530001,18.330000,10.71,0.48,8.420000,30.959999,0.01,0.010000,0.00,68.0,Satisfactory,2020,1.0,"(0.0, 1.0, 0.0, 0.0, 0.0)"
29529,Visakhapatnam,2020-06-30,16.639999,49.970001,4.05,29.260000,18.799999,10.03,0.52,9.840000,28.299999,0.00,0.000000,0.00,54.0,Satisfactory,2020,1.0,"(0.0, 1.0, 0.0, 0.0, 0.0)"


In [117]:

# Column names of the encoded DataFrame.
encoded.columns

# Preview the encoded DataFrame.
encoded.show()

22/10/09 11:01:21 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: City, Date, PM2.5, PM10, NO, NO2, NOx, NH3, CO, SO2, O3, Benzene, Toluene, Xylene, AQI, AQI_Bucket
 Schema: City, Date, PM25, PM10, NO, NO2, NOX, NH3, CO, SO2, O3, benzene, toluene, Xylene, AQI, AQIBucket
Expected: PM25 but found: PM2.5
CSV file: file:///home/ubuntu/722/aws-722-copy/Datasets/city_day.csv


+---------+----------+--------+----+------+-----+------+----+------+-----+------+-------+-------+------+----+---------+----+----------+-------------+
|     City|      Date|    PM25|PM10|    NO|  NO2|   NOX| NH3|    CO|  SO2|    O3|benzene|toluene|Xylene| AQI|AQIBucket|year|AQIB_index|     AQIB_vec|
+---------+----------+--------+----+------+-----+------+----+------+-----+------+-------+-------+------+----+---------+----+----------+-------------+
|Ahmedabad|2015-01-01|67.45058|null|  0.92|18.22| 17.15|null|  0.92|27.64|133.36|    0.0|   0.02|   0.0|null|     Poor|2015|       2.0|(5,[2],[1.0])|
|Ahmedabad|2015-01-02|67.45058|null|  0.97|15.69| 16.46|null|  0.97|24.55| 34.06|   3.68|    5.5|  3.77|null|     Poor|2015|       2.0|(5,[2],[1.0])|
|Ahmedabad|2015-01-03|67.45058|null|  17.4| 19.3|  29.7|null|  17.4|29.07|  30.7|    6.8|   16.4|  2.25|null|     Poor|2015|       2.0|(5,[2],[1.0])|
|Ahmedabad|2015-01-04|67.45058|null|   1.7|18.48| 17.97|null|   1.7|18.59| 36.08|   4.43|  10.14|   

In [118]:
from pyspark.ml.feature import VectorAssembler

# Columns combined into the single feature vector.
# NOTE(review): both AQIB_index and its one-hot encoding AQIB_vec are included,
# so the bucket is represented twice — confirm this is intended.
inputCols = [
 'PM25',
 'year',
 'AQIB_index',
 'AQIB_vec']

outputCol = "features"
df_va = VectorAssembler(inputCols = inputCols, outputCol = outputCol)
# `encoded` is reassigned to the DataFrame that has the added 'features' column.
encoded = df_va.transform(encoded)
# Show the first five assembled feature vectors.
encoded.select(['features']).toPandas().head(5)



22/10/09 11:02:15 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: City, Date, PM2.5, AQI_Bucket
 Schema: City, Date, PM25, AQIBucket
Expected: PM25 but found: PM2.5
CSV file: file:///home/ubuntu/722/aws-722-copy/Datasets/city_day.csv
                                                                                

Unnamed: 0,features
0,"(67.45057678222656, 2015.0, 2.0, 0.0, 0.0, 1.0..."
1,"(67.45057678222656, 2015.0, 2.0, 0.0, 0.0, 1.0..."
2,"(67.45057678222656, 2015.0, 2.0, 0.0, 0.0, 1.0..."
3,"(67.45057678222656, 2015.0, 2.0, 0.0, 0.0, 1.0..."
4,"(67.45057678222656, 2015.0, 2.0, 0.0, 0.0, 1.0..."
