## Topic:
- Dropping Columns
- Treating rows with null values
- Imputation of null values

In [1]:
!pip install pyspark



In [2]:
# Import the Spark SQL entry point and create (or reuse) a SparkSession
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('Practise')
    .getOrCreate()
)

In [3]:
# Load the CSV: the first line is the header and column types are inferred.
# NOTE(review): the path is absolute and machine-specific; adjust it before
# running this notebook anywhere else.
df_pyspark_1 = spark.read.csv(
    'C:/Users/ASUS/OneDrive/Desktop/DatasetR/Spark_test_01.csv',
    header=True,
    inferSchema=True,
)
df_pyspark_1.show()

+----+----+----------+------+
|Name| Age|Experience|Salary|
+----+----+----------+------+
|  KK|  31|        10| 30000|
|  AA|  30|         8| 25000|
|  BB|  29|         4| 20000|
|  CC|  24|         3| 20000|
|  DD|  21|         1| 15000|
|  PP|  23|         2| 18000|
|  MM|null|      null| 40000|
|null|  34|        10| 38000|
|null|  36|      null|  null|
+----+----+----------+------+



In [4]:
# Inspect the column names and the types that inferSchema assigned on load
df_pyspark_1.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [5]:
# Drop a column. The result is only displayed, not assigned: df_pyspark_1
# itself is unchanged, as the later cells still showing Name confirm.
df_pyspark_1.drop('Name').show()

+----+----------+------+
| Age|Experience|Salary|
+----+----------+------+
|  31|        10| 30000|
|  30|         8| 25000|
|  29|         4| 20000|
|  24|         3| 20000|
|  21|         1| 15000|
|  23|         2| 18000|
|null|      null| 40000|
|  34|        10| 38000|
|  36|      null|  null|
+----+----------+------+



In [7]:
# Drop every row that contains at least one null value (no arguments are
# given, so all columns are checked); only the six complete rows remain.
df_pyspark_1.na.drop().show()

+----+---+----------+------+
|Name|Age|Experience|Salary|
+----+---+----------+------+
|  KK| 31|        10| 30000|
|  AA| 30|         8| 25000|
|  BB| 29|         4| 20000|
|  CC| 24|         3| 20000|
|  DD| 21|         1| 15000|
|  PP| 23|         2| 18000|
+----+---+----------+------+



In [8]:
# Drop only rows in which every column is null. No row in this dataset is
# entirely null, so the output is identical to the original DataFrame.
df_pyspark_1.na.drop(how="all").show()

+----+----+----------+------+
|Name| Age|Experience|Salary|
+----+----+----------+------+
|  KK|  31|        10| 30000|
|  AA|  30|         8| 25000|
|  BB|  29|         4| 20000|
|  CC|  24|         3| 20000|
|  DD|  21|         1| 15000|
|  PP|  23|         2| 18000|
|  MM|null|      null| 40000|
|null|  34|        10| 38000|
|null|  36|      null|  null|
+----+----+----------+------+



In [11]:
# thresh=2: keep only rows that have at least 2 non-null values.
# The last row (null, 36, null, null) has a single non-null value, so it is dropped.
# `how` is omitted because PySpark ignores it whenever `thresh` is given.
df_pyspark_1.na.drop(thresh=2).show()

+----+----+----------+------+
|Name| Age|Experience|Salary|
+----+----+----------+------+
|  KK|  31|        10| 30000|
|  AA|  30|         8| 25000|
|  BB|  29|         4| 20000|
|  CC|  24|         3| 20000|
|  DD|  21|         1| 15000|
|  PP|  23|         2| 18000|
|  MM|null|      null| 40000|
|null|  34|        10| 38000|
+----+----+----------+------+



In [12]:
# thresh=3: keep only rows that have at least 3 non-null values.
# The MM row (2 non-null values) and the last row (1 non-null value) are dropped.
# `how` is omitted because PySpark ignores it whenever `thresh` is given.
df_pyspark_1.na.drop(thresh=3).show()

+----+---+----------+------+
|Name|Age|Experience|Salary|
+----+---+----------+------+
|  KK| 31|        10| 30000|
|  AA| 30|         8| 25000|
|  BB| 29|         4| 20000|
|  CC| 24|         3| 20000|
|  DD| 21|         1| 15000|
|  PP| 23|         2| 18000|
|null| 34|        10| 38000|
+----+---+----------+------+



In [15]:
# subset: only the listed columns are checked for nulls. Any row whose
# Experience is null is dropped, even if its other columns are populated.
df_pyspark_1.na.drop(how="any", subset=['Experience']).show()

+----+---+----------+------+
|Name|Age|Experience|Salary|
+----+---+----------+------+
|  KK| 31|        10| 30000|
|  AA| 30|         8| 25000|
|  BB| 29|         4| 20000|
|  CC| 24|         3| 20000|
|  DD| 21|         1| 15000|
|  PP| 23|         2| 18000|
|null| 34|        10| 38000|
+----+---+----------+------+



In [22]:
# Replace nulls in the Name column with the placeholder string 'Missing Value'
df_pyspark_1.na.fill(value='Missing Value', subset=['Name']).show()

+-------------+----+----------+------+
|         Name| Age|Experience|Salary|
+-------------+----+----------+------+
|           KK|  31|        10| 30000|
|           AA|  30|         8| 25000|
|           BB|  29|         4| 20000|
|           CC|  24|         3| 20000|
|           DD|  21|         1| 15000|
|           PP|  23|         2| 18000|
|           MM|null|      null| 40000|
|Missing Value|  34|        10| 38000|
|Missing Value|  36|      null|  null|
+-------------+----+----------+------+



In [24]:
# Build a mean-strategy Imputer for the numeric columns; each input column
# gets a companion "<column>_imputed" output column.
from pyspark.ml.feature import Imputer

imputer = Imputer(
    strategy="mean",
    inputCols=['Age', 'Experience', 'Salary'],
    outputCols=[name + "_imputed" for name in ['Age', 'Experience', 'Salary']],
)


In [25]:
# Fit the imputer on the data, then display the DataFrame with the
# *_imputed columns appended next to the original columns.
imputer_model = imputer.fit(df_pyspark_1)
imputer_model.transform(df_pyspark_1).show()

+----+----+----------+------+-----------+------------------+--------------+
|Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+----+----+----------+------+-----------+------------------+--------------+
|  KK|  31|        10| 30000|         31|                10|         30000|
|  AA|  30|         8| 25000|         30|                 8|         25000|
|  BB|  29|         4| 20000|         29|                 4|         20000|
|  CC|  24|         3| 20000|         24|                 3|         20000|
|  DD|  21|         1| 15000|         21|                 1|         15000|
|  PP|  23|         2| 18000|         23|                 2|         18000|
|  MM|null|      null| 40000|         28|                 5|         40000|
|null|  34|        10| 38000|         34|                10|         38000|
|null|  36|      null|  null|         36|                 5|         25750|
+----+----+----------+------+-----------+------------------+--------------+

