# Introduction

In [27]:
# import pyspark and create a spark session

In [28]:
import pyspark

In [29]:
from pyspark.sql import SparkSession

In [30]:
# create (or reuse) a Spark session - the entry point for the DataFrame API

In [31]:
# Build a session named 'Practise'; getOrCreate() hands back the existing
# session when one is already running instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('Practise')
    .getOrCreate()
)

In [32]:
# Display the SparkSession object as the cell output; the original note uses
# this to check which Spark version the session is running.
spark

In [33]:
# option('header', 'true') - treat the first row of the file as the column header

In [34]:
# Load the CPI dataset with the reader's default options (no header handling,
# no schema inference).
df = spark.read.csv(
    "C:\\Users\\Shree123\\MACHINE LEARNING PROJECT TYCS\\Consumer_Price_Index.csv"
)

In [35]:
# Print the frame as a text table. No header option was given to the reader,
# so the columns are auto-named _c0.._c4 and the CSV header line appears as
# the first data row.
df.show()

+----+-------------+----+----+--------------------+
| _c0|          _c1| _c2| _c3|                 _c4|
+----+-------------+----+----+--------------------+
|  id|       Entity|Code|Year|Consumer price in...|
|8236|United States| USA|1960|           13.563061|
|8237|United States| USA|1961|           13.708283|
|8238|United States| USA|1962|           13.872615|
|8239|United States| USA|1963|            14.04459|
+----+-------------+----+----+--------------------+



In [36]:
# Same file, this time telling the reader that the first line holds the
# column names.
df1 = (
    spark.read
    .option('header','true')
    .csv("C:\\Users\\Shree123\\MACHINE LEARNING PROJECT TYCS\\Consumer_Price_Index.csv")
)

In [37]:
# With the header option set, the first CSV line is used for the column names
# instead of being shown as a data row.
df1.show()

+----+-------------+----+----+---------------------------------+
|  id|       Entity|Code|Year|Consumer price index (2010 = 100)|
+----+-------------+----+----+---------------------------------+
|8236|United States| USA|1960|                        13.563061|
|8237|United States| USA|1961|                        13.708283|
|8238|United States| USA|1962|                        13.872615|
|8239|United States| USA|1963|                         14.04459|
+----+-------------+----+----+---------------------------------+



In [38]:
# Confirm that the reader returned a Spark DataFrame
# (pyspark.sql.dataframe.DataFrame).
type(df1)

pyspark.sql.dataframe.DataFrame

In [39]:
# Print the datatype of each column. Without inferSchema every column,
# including the numeric ones (id, Year, the price index), is read as string.
df1.printSchema()

root
 |-- id: string (nullable = true)
 |-- Entity: string (nullable = true)
 |-- Code: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Consumer price index (2010 = 100): string (nullable = true)



In [40]:
# inferSchema - infers each column's datatype from the data instead of reading every column as string

In [41]:
# Two equivalent ways of reading the CSV with a header row and inferred
# column types (integer / double instead of string).

# Style 1: header set through option(), inferSchema passed as a keyword.
df2 = (
    spark.read
    .option('header','true')
    .csv(
        "C:\\Users\\Shree123\\MACHINE LEARNING PROJECT TYCS\\Consumer_Price_Index.csv",
        inferSchema = True,
    )
)

# Style 2: both settings passed as keywords to csv().
df3 = spark.read.csv(
    "C:\\Users\\Shree123\\MACHINE LEARNING PROJECT TYCS\\Consumer_Price_Index.csv",
    header = True,
    inferSchema = True,
)

In [42]:
# Both read styles produce the same schema: id and Year are inferred as
# integer and the price index as double.
df2.printSchema()
df3.printSchema()

root
 |-- id: integer (nullable = true)
 |-- Entity: string (nullable = true)
 |-- Code: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Consumer price index (2010 = 100): double (nullable = true)

root
 |-- id: integer (nullable = true)
 |-- Entity: string (nullable = true)
 |-- Code: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Consumer price index (2010 = 100): double (nullable = true)



In [43]:
# head(2) returns the first 2 rows as a Python list of Row objects
# (not as a DataFrame).
df3.head(2)

[Row(id=8236, Entity='United States', Code='USA', Year=1960, Consumer price index (2010 = 100)=13.563061),
 Row(id=8237, Entity='United States', Code='USA', Year=1961, Consumer price index (2010 = 100)=13.708283)]

In [44]:
# Column selection - single column: select() yields a frame that contains
# only 'id', which is then printed.
df3.select('id').show()

+----+
|  id|
+----+
|8236|
|8237|
|8238|
|8239|
+----+



In [45]:
# Column selection - several columns at once; the names are passed as
# separate arguments, which select() treats the same as a single list.
df3.select('id', 'Code', 'Year').show()

+----+----+----+
|  id|Code|Year|
+----+----+----+
|8236| USA|1960|
|8237| USA|1961|
|8238| USA|1962|
|8239| USA|1963|
+----+----+----+



In [46]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# mean and stddev come back as NULL for the string columns Entity and Code.
df3.describe().show()

+-------+------------------+-------------+----+------------------+---------------------------------+
|summary|                id|       Entity|Code|              Year|Consumer price index (2010 = 100)|
+-------+------------------+-------------+----+------------------+---------------------------------+
|  count|                 4|            4|   4|                 4|                                4|
|   mean|            8237.5|         NULL|NULL|            1961.5|               13.797137249999999|
| stddev|1.2909944487358056|         NULL|NULL|1.2909944487358056|              0.20785934570020317|
|    min|              8236|United States| USA|              1960|                        13.563061|
|    max|              8239|United States| USA|              1963|                         14.04459|
+-------+------------------+-------------+----+------------------+---------------------------------+



In [47]:
# Adding columns in the df - withColumn()

In [48]:
import numpy as np

In [49]:
# Add a derived column computed as CPI * Year / 1000.
# NOTE(review): the column name says "CPI after 2 years", but the expression
# is not a two-year projection - it looks like an arbitrary formula used only
# to demonstrate withColumn(); confirm the intended calculation.
df3 = df3.withColumn(
    'CPI_after_2_years',
    df3['Consumer price index (2010 = 100)'] * df3['Year'] / 1000,
)

In [50]:
# Show the frame including the newly added CPI_after_2_years column.
df3.show()

+----+-------------+----+----+---------------------------------+------------------+
|  id|       Entity|Code|Year|Consumer price index (2010 = 100)| CPI_after_2_years|
+----+-------------+----+----+---------------------------------+------------------+
|8236|United States| USA|1960|                        13.563061|       26.58359956|
|8237|United States| USA|1961|                        13.708283|      26.881942963|
|8238|United States| USA|1962|                        13.872615|       27.21807063|
|8239|United States| USA|1963|                         14.04459|27.569530169999997|
+----+-------------+----+----+---------------------------------+------------------+



In [51]:
# Dropping the column: drop() returns a frame without 'CPI_after_2_years',
# which is bound back to df3 (leaving the five original columns).
df3 = df3.drop('CPI_after_2_years')
# Bare expression: shows the DataFrame repr (column names and types), no rows.
df3

DataFrame[id: int, Entity: string, Code: string, Year: int, Consumer price index (2010 = 100): double]

In [66]:
# Load the unemployment dataset with header row and inferred types.
# Note: this rebinds df1, which previously held the CPI frame.
df1 = spark.read.csv(
    "C:\\Users\\Shree123\\MACHINE LEARNING PROJECT TYCS\\Unemployment_Ratio.csv",
    header = True,
    inferSchema = True,
)

In [67]:
# Inspect the unemployment frame; it contains NULLs in Code, Year and the
# unemployment column, which the following cells use to demonstrate na.drop().
df1.show()

+----+-------------+----+----+-------------------------------------------------------------------+
| _c0|       Entity|Code|Year|Unemployment, total (% of total labor force) (modeled ILO estimate)|
+----+-------------+----+----+-------------------------------------------------------------------+
|5859|United States| USA|1991|                                                                6.8|
|5860|United States| USA|1992|                                                                7.5|
|5861|United States| USA|1993|                                                                6.9|
|5862|United States| USA|1994|                                                               6.12|
|5863|United States|NULL|NULL|                                                                5.5|
|5864|United States| USA|1996|                                                               NULL|
|5865|United States| USA|1997|                                                               NULL|
|5866|Unit

In [68]:
# Dropping null values: with no arguments, na.drop() removes every row that
# contains at least one null, so only fully populated rows remain.
df1.na.drop().show()

+----+-------------+----+----+-------------------------------------------------------------------+
| _c0|       Entity|Code|Year|Unemployment, total (% of total labor force) (modeled ILO estimate)|
+----+-------------+----+----+-------------------------------------------------------------------+
|5859|United States| USA|1991|                                                                6.8|
|5860|United States| USA|1992|                                                                7.5|
|5861|United States| USA|1993|                                                                6.9|
|5862|United States| USA|1994|                                                               6.12|
|5866|United States| USA|1998|                                                                5.9|
|5867|United States|  US|1999|                                                                7.0|
+----+-------------+----+----+-------------------------------------------------------------------+



In [57]:
# how in drop // 'any' - drops a row if it contains at least one null value , 'all' - drops a row only if all of its values are null

In [81]:
# Compare the two `how` modes on the unemployment frame (df1).
# `df` must not be used here: on a fresh kernel it is still the headerless
# CPI frame, so the cell would silently run against the wrong dataset.

# how='any': drop a row as soon as one of its values is null.
df1.na.drop(how = 'any').show()
print('\n')
# how='all': drop a row only when every value in it is null.
df1.na.drop(how = 'all').show()

+----+-------------+----+----+-------------------------------------------------------------------+
| _c0|       Entity|Code|Year|Unemployment, total (% of total labor force) (modeled ILO estimate)|
+----+-------------+----+----+-------------------------------------------------------------------+
|5859|United States| USA|1991|                                                                6.8|
|5860|United States| USA|1992|                                                                7.5|
|5861|United States| USA|1993|                                                                6.9|
|5862|United States| USA|1994|                                                               6.12|
|5866|United States| USA|1998|                                                                5.9|
|5867|United States|  US|1999|                                                                7.0|
+----+-------------+----+----+-------------------------------------------------------------------+



+----+-

In [79]:
# thresh - the minimum number of NON-null values a row must have to be kept;
# rows with fewer than `thresh` non-null values are dropped.
# When thresh is given it overrides `how`, so `how` is not passed here.
# With thresh=2 every row of this dataset survives, because each row has at
# least two non-null values.
df1.na.drop(thresh = 2).show()

+----+-------------+----+----+-------------------------------------------------------------------+
| _c0|       Entity|Code|Year|Unemployment, total (% of total labor force) (modeled ILO estimate)|
+----+-------------+----+----+-------------------------------------------------------------------+
|5859|United States| USA|1991|                                                                6.8|
|5860|United States| USA|1992|                                                                7.5|
|5861|United States| USA|1993|                                                                6.9|
|5862|United States| USA|1994|                                                               6.12|
|5863|United States|NULL|NULL|                                                                5.5|
|5864|United States| USA|1996|                                                               NULL|
|5865|United States| USA|1997|                                                               NULL|
|5866|Unit

In [83]:
# Subset - only the listed columns are checked for nulls, so this drops the
# rows whose 'Year' is null and ignores nulls in every other column.
# Runs on df1 (the unemployment frame); the headerless CPI frame `df` has no
# 'Year' column and would make this call fail on a fresh kernel.
df1.na.drop(how = 'all', subset = ['Year']).show()

+----+-------------+----+----+-------------------------------------------------------------------+
| _c0|       Entity|Code|Year|Unemployment, total (% of total labor force) (modeled ILO estimate)|
+----+-------------+----+----+-------------------------------------------------------------------+
|5859|United States| USA|1991|                                                                6.8|
|5860|United States| USA|1992|                                                                7.5|
|5861|United States| USA|1993|                                                                6.9|
|5862|United States| USA|1994|                                                               6.12|
|5864|United States| USA|1996|                                                               NULL|
|5865|United States| USA|1997|                                                               NULL|
|5866|United States| USA|1998|                                                                5.9|
|5867|Unit

In [90]:
# Rename the long unemployment column to a short, code-friendly name.
# The result is derived from df1 (the unemployment frame read earlier) rather
# than from `df` itself: on a fresh kernel `df` is still the headerless CPI
# frame, where the rename would be a silent no-op. Deriving from df1 also
# makes this cell safe to re-run.
df = df1.withColumnRenamed('Unemployment, total (% of total labor force) (modeled ILO estimate)','Unemployment_Rate')
df.show()

+----+-------------+----+----+-----------------+
| _c0|       Entity|Code|Year|Unemployment_Rate|
+----+-------------+----+----+-----------------+
|5859|United States| USA|1991|              6.8|
|5860|United States| USA|1992|              7.5|
|5861|United States| USA|1993|              6.9|
|5862|United States| USA|1994|             6.12|
|5863|United States|NULL|NULL|              5.5|
|5864|United States| USA|1996|             NULL|
|5865|United States| USA|1997|             NULL|
|5866|United States| USA|1998|              5.9|
|5867|United States|  US|1999|              7.0|
+----+-------------+----+----+-----------------+



In [139]:
# Task - Handle Missing values for categories and Numeric Columns and then fit_transform and also create unique indexes for the Code category.

In [159]:
from pyspark.sql.functions import when, col
from pyspark.ml.feature import Imputer, StringIndexer
from pyspark.ml import Pipeline

In [143]:
# Remove a 'New_Code' column from df.
# NOTE(review): no visible cell adds 'New_Code' to df (the cell below writes
# 'new_Code' to new_df instead), and drop() does nothing for a column that is
# absent - this looks like leftover cleanup from an earlier experiment;
# confirm and delete if so.
df = df.drop('New_Code')

In [163]:
# ---------------------------------------------------------------------------
# 1. Categorical column: replace missing 'Code' values with the most frequent
#    real code.
# ---------------------------------------------------------------------------
placeholder = 'placeholder'
# Turn nulls into an explicit marker so they can be matched with ==.
df = df.na.fill(placeholder , subset = ['Code'])
# Rank the real codes by frequency. The placeholder is excluded so the marker
# itself can never be picked as the "most frequent" value (which would happen
# whenever nulls outnumber every real code), and ties are broken by code so
# the result is deterministic.
mode_row = (
    df.filter(col('Code') != placeholder)
      .groupBy('Code')
      .count()
      .orderBy(col('count').desc() , col('Code').asc())
      .first()
)
# first() returns None when there is no real code at all; keep the marker then
# instead of failing on None[0].
most_frequent = mode_row['Code'] if mode_row is not None else placeholder
print('Most frequent Value:' , most_frequent , '\n')
print('-'*20)
# Replace placeholder with the most frequent.
new_df = df.withColumn(
    'new_Code',
    when(col('Code') == placeholder , most_frequent).otherwise(col('Code')),
)
print('Before and After')
new_df.select(['Code','new_Code']).show()
new_df.show()
print('-'*20)

# ---------------------------------------------------------------------------
# 2. Numeric column: fill missing unemployment rates with the column mean.
# ---------------------------------------------------------------------------
numeric_cols = ['Unemployment_Rate']
imputer = Imputer(
    inputCols = numeric_cols,
    outputCols = ['{}_imputed'.format(c) for c in numeric_cols],
    strategy = 'mean',
)

# ---------------------------------------------------------------------------
# 3. Index the cleaned code column. The indexer reads 'new_Code' (missing
#    values already replaced), not the raw 'Code' column - otherwise the
#    placeholder would receive an index of its own and the replacement above
#    would have no effect on the encoded feature.
# ---------------------------------------------------------------------------
indexer = StringIndexer(inputCols = ['new_Code'] , outputCols = ['Mapped_Code'] , handleInvalid = 'keep')

# Create a pipeline to fit both the transformations
pipeline = Pipeline(stages = [imputer , indexer])
# fit and transform
model = pipeline.fit(new_df)
transformed_df = model.transform(new_df)
print('After Indexing and Imputing')
transformed_df.show()

Most frequent Value: USA 

--------------------
Before and After
+-----------+--------+
|       Code|new_Code|
+-----------+--------+
|        USA|     USA|
|        USA|     USA|
|        USA|     USA|
|        USA|     USA|
|placeholder|     USA|
|        USA|     USA|
|        USA|     USA|
|        USA|     USA|
|         US|      US|
+-----------+--------+

+-------------+-----------+----+-----------------+-------------------------+--------+
|       Entity|       Code|Year|Unemployment_Rate|Unemployment_Rate_Imputed|new_Code|
+-------------+-----------+----+-----------------+-------------------------+--------+
|United States|        USA|1991|              6.8|                      6.8|     USA|
|United States|        USA|1992|              7.5|                      7.5|     USA|
|United States|        USA|1993|              6.9|                      6.9|     USA|
|United States|        USA|1994|             6.12|                     6.12|     USA|
|United States|placeholder|NULL| 