# PySpark

## SparkSession 
To use Spark, we first need to create a SparkSession.

In [2]:
from pyspark.sql import SparkSession
# Build the session named "Dataframe", or reuse one that already exists
# (getOrCreate). Ending the cell with the bare name displays the session.
spark = (
    SparkSession.builder
    .appName("Dataframe")
    .getOrCreate()
)
spark


22/08/17 12:59:31 WARN Utils: Your hostname, FFT-ThinkPad-L490 resolves to a loopback address: 127.0.1.1; using 192.168.29.4 instead (on interface wlp5s0)
22/08/17 12:59:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/17 12:59:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Read CSV

### Way 1

In [3]:
# Load test.csv through the reader's option() API: the first line is treated
# as the header row, and column types are inferred from the data.
df_pyspark = (
    spark.read
    .option("header", "true")
    .csv("test.csv", inferSchema=True)
)


### Way 2

In [17]:
# Same load as Way 1, with the header and schema-inference settings passed
# directly as keyword arguments to csv().
df_pyspark = spark.read.csv(
    "test.csv",
    header=True,
    inferSchema=True,
)


## Basic DataFrame operations

In [4]:
# Print the schema as a tree: each column's name, inferred type and nullability.
df_pyspark.printSchema()


root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [5]:
# List the column names, in order, as a plain Python list.
df_pyspark.columns


['Name', 'Age', 'Experience', 'Salary']

In [6]:
# Fetch the first 3 rows; head(n) returns them as a list of Row objects
# rather than printing a formatted table.
df_pyspark.head(3)


[Row(Name='Rohit', Age=22, Experience=1, Salary=20000),
 Row(Name='Deep', Age=21, Experience=0, Salary=43000),
 Row(Name=None, Age=None, Experience=None, Salary=None)]

In [36]:
# Select a single column and display it.
df_pyspark.select("Name").show()


+-----+
| Name|
+-----+
|Rohit|
| Deep|
| Anik|
+-----+



In [34]:
# Project several columns at once by passing each column name as its own argument.
df_pyspark.select("Name", "Age").show()


+-----+---+
| Name|Age|
+-----+---+
|Rohit| 22|
| Deep| 21|
| Anik| 23|
+-----+---+



In [37]:
# Column data types as a list of (column name, type string) pairs.
df_pyspark.dtypes


[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [39]:
# Summary statistics (count, mean, stddev, min, max) for each column.
df_pyspark.describe().show()


+-------+-----+----+------------------+
|summary| Name| Age|        Experience|
+-------+-----+----+------------------+
|  count|    3|   3|                 3|
|   mean| null|22.0|               2.0|
| stddev| null| 1.0|2.6457513110645907|
|    min| Anik|  21|                 0|
|    max|Rohit|  23|                 5|
+-------+-----+----+------------------+



In [50]:
# Add a derived column. withColumn() returns a new DataFrame instead of
# modifying the existing one, so the result is assigned back to df_pyspark.
# The source column is referenced by its exact schema name ("Age") so the
# lookup does not depend on Spark's case-insensitive column resolution.
df_pyspark = df_pyspark.withColumn("Age after 2 Years", df_pyspark["Age"] + 2)
df_pyspark.show()


+-----+---+----------+-----------------+
| Name|Age|Experience|Age after 2 Years|
+-----+---+----------+-----------------+
|Rohit| 22|         1|               24|
| Deep| 21|         0|               23|
| Anik| 23|         5|               25|
+-----+---+----------+-----------------+



In [55]:
# Drop a column. drop() returns a new DataFrame (it is not an in-place
# operation), so the result is assigned back to df_pyspark.
df_pyspark = df_pyspark.drop("Age after 2 Years")
df_pyspark.show()


+-----+---+----------+
| Name|Age|Experience|
+-----+---+----------+
|Rohit| 22|         1|
| Deep| 21|         0|
| Anik| 23|         5|
+-----+---+----------+



In [57]:
# Rename a column. withColumnRenamed() is not an in-place operation either:
# the renamed DataFrame is only displayed here and is not assigned to anything.
df_pyspark.withColumnRenamed("Name", "NewName").show()


+-------+---+----------+
|NewName|Age|Experience|
+-------+---+----------+
|  Rohit| 22|         1|
|   Deep| 21|         0|
|   Anik| 23|         5|
+-------+---+----------+



# PySpark: handling missing values

In [71]:
# Reload test.csv (header row, inferred types) so this section starts from
# the file contents rather than from the DataFrame modified above.
df_pyspark = (
    spark.read
    .option("header", "true")
    .csv("test.csv", inferSchema=True)
)


In [72]:
# Display the reloaded rows.
df_pyspark.show()


+-------+----+----------+
|   Name| Age|Experience|
+-------+----+----------+
|  Rohit|  22|         1|
|   Deep|  21|         0|
|   null|null|      null|
|   Anik|  23|         5|
|Subhayu|  22|         8|
| Rijita|null|      null|
| Arpita|null|         3|
|   Nilu|  27|      null|
|   null|  32|      null|
|   null|  52|        27|
+-------+----+----------+



### Deleting rows with null values

In [73]:
# Drop every row that contains at least one null value (na.drop() with no arguments).
df_pyspark.na.drop().show()


+-------+---+----------+
|   Name|Age|Experience|
+-------+---+----------+
|  Rohit| 22|         1|
|   Deep| 21|         0|
|   Anik| 23|         5|
|Subhayu| 22|         8|
+-------+---+----------+



In [70]:
# how="any": drop a row if at least one of its values is null.
df_pyspark.na.drop(how="any").show()


+-------+---+----------+
|   Name|Age|Experience|
+-------+---+----------+
|  Rohit| 22|         1|
|   Deep| 21|         0|
|   Anik| 23|         5|
|Subhayu| 22|         8|
+-------+---+----------+



In [75]:
# how="all": drop a row only if every one of its values is null.
df_pyspark.na.drop(how="all").show()


+-------+----+----------+
|   Name| Age|Experience|
+-------+----+----------+
|  Rohit|  22|         1|
|   Deep|  21|         0|
|   Anik|  23|         5|
|Subhayu|  22|         8|
| Rijita|null|      null|
| Arpita|null|         3|
|   Nilu|  27|      null|
|   null|  32|      null|
|   null|  52|        27|
+-------+----+----------+



In [84]:
# thresh=N: keep only the rows that have at least N non-null values, i.e.
# drop rows with fewer than N non-null values. When thresh is given it
# overrides `how`, so `how` is omitted here to avoid implying it has an effect.
df_pyspark.na.drop(thresh=2).show()


+-------+----+----------+
|   Name| Age|Experience|
+-------+----+----------+
|  Rohit|  22|         1|
|   Deep|  21|         0|
|   Anik|  23|         5|
|Subhayu|  22|         8|
| Arpita|null|         3|
|   Nilu|  27|      null|
|   null|  52|        27|
+-------+----+----------+



In [86]:
# subset: only the listed columns are checked for nulls. Rows with a null in
# "Name" or "Age" are dropped; nulls in other columns do not cause a drop.
df_pyspark.na.drop(how="any", subset=["Name", "Age"]).show()


+-------+---+----------+
|   Name|Age|Experience|
+-------+---+----------+
|  Rohit| 22|         1|
|   Deep| 21|         0|
|   Anik| 23|         5|
|Subhayu| 22|         8|
|   Nilu| 27|      null|
+-------+---+----------+



### Filling null values

In [101]:
# Replace nulls in the "Name" column with the string "NA"; the other columns
# are left untouched because of the subset argument.
df_pyspark.na.fill(value="NA", subset=["Name"]).show()


+-------+----+----------+
|   Name| Age|Experience|
+-------+----+----------+
|  Rohit|  22|         1|
|   Deep|  21|         0|
|     NA|null|      null|
|   Anik|  23|         5|
|Subhayu|  22|         8|
| Rijita|null|      null|
| Arpita|null|         3|
|   Nilu|  27|      null|
|     NA|  32|      null|
|     NA|  52|        27|
+-------+----+----------+



In [121]:
# Fill nulls with a different replacement per column (column name -> value).
df_pyspark.na.fill(
    {"Name": "Unknown", "Age": 25, "Experience": 0, "Salary": 0}
).show()


+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|  Rohit| 22|         1| 20000|
|   Deep| 21|         0| 43000|
|Unknown| 25|         0|     0|
|   Anik| 23|         5|  8000|
|Subhayu| 22|         8| 42000|
| Rijita| 25|         0| 26000|
| Arpita| 25|         3| 23000|
|   Nilu| 27|    100000|     0|
|Unknown| 32|         0| 13000|
|Unknown| 52|        27| 18000|
+-------+---+----------+------+



In [103]:
from pyspark.ml.feature import Imputer

# Columns whose missing values should be imputed. Defined once and reused for
# both the input and output lists so the two cannot drift apart.
cols = ["Age", "Experience"]

# Configure an Imputer with the "mean" strategy. Each input column gets a
# matching "<column>_imputed" output column.
imputer = Imputer(
    inputCols=cols,
    outputCols=[f"{name}_imputed" for name in cols],
).setStrategy("mean")


In [104]:
# Fit the imputer on the data, then transform the same data and display it.
# The result has the original columns plus the *_imputed columns; it is only
# shown here and is not assigned to anything.
imputer.fit(df_pyspark).transform(df_pyspark).show()


+-------+----+----------+-----------+------------------+
|   Name| Age|Experience|Age_imputed|Experience_imputed|
+-------+----+----------+-----------+------------------+
|  Rohit|  22|         1|         22|                 1|
|   Deep|  21|         0|         21|                 0|
|   null|null|      null|         28|                 7|
|   Anik|  23|         5|         23|                 5|
|Subhayu|  22|         8|         22|                 8|
| Rijita|null|      null|         28|                 7|
| Arpita|null|         3|         28|                 3|
|   Nilu|  27|      null|         27|                 7|
|   null|  32|      null|         32|                 7|
|   null|  52|        27|         52|                27|
+-------+----+----------+-----------+------------------+



## PySpark DataFrames Filter Operations
- Filter Operation
- Combining conditions with `&`, `|`, `==`
- Negating a condition with `~`

### Filter Operation

In [106]:
# Reload test.csv (header row, inferred types) and display it, so the filter
# examples below start from the file contents.
df_pyspark = (
    spark.read
    .option("header", "true")
    .csv("test.csv", inferSchema=True)
)
df_pyspark.show()


+-------+----+----------+------+
|   Name| Age|Experience|Salary|
+-------+----+----------+------+
|  Rohit|  22|         1| 20000|
|   Deep|  21|         0| 43000|
|   null|null|      null|  null|
|   Anik|  23|         5|  8000|
|Subhayu|  22|         8| 42000|
| Rijita|null|      null| 26000|
| Arpita|null|         3| 23000|
|   Nilu|  27|    100000|  null|
|   null|  32|      null| 13000|
|   null|  52|        27| 18000|
+-------+----+----------+------+



In [118]:
# Keep rows whose Salary is at least 20000 (condition written as a SQL
# expression string), then show only the Name and Age columns.
(
    df_pyspark
    .filter("Salary>=20000")
    .select("Name", "Age")
    .show()
)


+-------+----+
|   Name| Age|
+-------+----+
|  Rohit|  22|
|   Deep|  21|
|Subhayu|  22|
| Rijita|null|
| Arpita|null|
+-------+----+



In [122]:
# Same salary filter written as a Column condition instead of a SQL string.
# Additional conditions can be combined with & (and) or | (or); each condition
# must be wrapped in its own parentheses, and an empty `()` is not a valid operand.
df_pyspark.filter(df_pyspark["Salary"] >= 20000).show()

+-------+----+----------+------+
|   Name| Age|Experience|Salary|
+-------+----+----------+------+
|  Rohit|  22|         1| 20000|
|   Deep|  21|         0| 43000|
|Subhayu|  22|         8| 42000|
| Rijita|null|      null| 26000|
| Arpita|null|         3| 23000|
+-------+----+----------+------+



22/07/09 19:49:18 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 10113220 ms exceeds timeout 120000 ms
22/07/09 19:49:18 WARN SparkContext: Killing executors is not supported by current scheduler.
