## Installing PySpark

In [1]:
# Uncomment and run once if PySpark is not already installed in the kernel's environment.
#%pip install pyspark

## Loading PySpark and Starting a Session

In [2]:
import pyspark

In [3]:
import pandas as pd
# Preview the CSV with pandas; the bare last expression is rendered as the cell output.
pd.read_csv('Dataset/test1.csv')

Unnamed: 0,Name,age,Experience
0,Krish,31,10
1,Sudhanshu,30,8
2,Sunny,29,4


In [4]:
from pyspark.sql import SparkSession

In [5]:
# Build the session in steps: name the application, then let the builder
# hand back the session object.
spark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)

24/01/11 11:42:13 WARN Utils: Your hostname, codespaces-904232 resolves to a loopback address: 127.0.0.1; using 172.16.5.4 instead (on interface eth0)
24/01/11 11:42:13 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/11 11:42:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/01/11 11:42:18 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/01/11 11:42:18 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/01/11 11:42:18 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [6]:
# Display the session object as the cell output.
spark

### Reading Dataset

In [7]:
# Plain read with default options:
#df_spark = spark.read.csv('Dataset/test1.csv')

# Without inferSchema, every column is read as a string.

# Option-style spelling of the read used below:
#df_spark = spark.read.option('header','true').csv('Dataset/test1.csv',inferSchema = True)
# header=True uses the first row as column names; inferSchema=True detects the column types.
df_spark = spark.read.csv('Dataset/test1.csv',inferSchema = True,header=True)


In [None]:
# Print the DataFrame contents as a text table.
df_spark.show()

+---------+---+----------+
|     Name|age|Experience|
+---------+---+----------+
|    Krish| 31|        10|
|Sudhanshu| 30|         8|
|    Sunny| 29|         4|
+---------+---+----------+



In [None]:
type(df_spark)

# A DataFrame is a data structure on which various operations can be performed.

pyspark.sql.dataframe.DataFrame

In [None]:
# Check the schema: each column's name, data type and nullability.
df_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



### Select

In [None]:
### Get the column names (returned as a plain Python list of strings)
df_spark.columns

['Name', 'age', 'Experience']

In [None]:
### Get the first 2 rows
df_spark.head(2)

# The result is a Python list of Row objects, whereas pandas' head() returns a DataFrame.

[Row(Name='Krish', age=31, Experience=10),
 Row(Name='Sudhanshu', age=30, Experience=8)]

In [None]:
### Select columns
# A single column name gives a one-column DataFrame. This result is not shown,
# because a cell only displays the value of its last expression.
df_spark.select('Name')

# A list of column names gives a DataFrame restricted to those columns.
df_spark.select(['Name','Experience'])

## Return type: DataFrame

DataFrame[Name: string, Experience: int]

In [None]:
### Indexing with a column name returns a Column object, not a DataFrame

df_spark['Name']

Column<'Name'>

### Check Datatype

In [None]:
# List of (column name, data type) pairs.
df_spark.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int')]

### Describe

In [None]:
# Summary statistics (count, mean, stddev, min, max) for every column.
df_spark.describe().show()

+-------+-----+----+-----------------+
|summary| Name| age|       Experience|
+-------+-----+----+-----------------+
|  count|    3|   3|                3|
|   mean| NULL|30.0|7.333333333333333|
| stddev| NULL| 1.0|3.055050463303893|
|    min|Krish|  29|                4|
|    max|Sunny|  31|               10|
+-------+-----+----+-----------------+



### Adding and Removing

In [None]:
### Add a derived column: current experience plus two years

df_spark = df_spark.withColumn(
    'Experience after 2 years',
    df_spark['Experience'] + 2,
)

In [None]:
# Display the DataFrame's repr: its column names and types.
df_spark

DataFrame[Name: string, age: int, Experience: int, Experience after 2 years: int]

In [None]:
### Drop a column by name; the result is assigned back to df_spark

df_spark = df_spark.drop('Experience after 2 years')

In [None]:
### Rename a column: the first argument is the existing name, the second is the new name
# After this cell df_spark no longer has a 'Name' column, so earlier cells that
# reference 'Name' will fail if re-run without reloading the data first.

df_spark = df_spark.withColumnRenamed('Name','New Name')

In [None]:
# Show the data again to confirm the renamed column.
df_spark.show()

+---------+---+----------+
| New Name|age|Experience|
+---------+---+----------+
|    Krish| 31|        10|
|Sudhanshu| 30|         8|
|    Sunny| 29|         4|
+---------+---+----------+



## Dealing with Missing Values

In [None]:
# Load the second dataset; the first row supplies the column names and the
# column types are inferred from the data.
df2_spark = spark.read.csv(
    'Dataset/test2.csv',
    inferSchema=True,
    header=True,
)

In [None]:
# Show the raw rows, including the ones with NULLs.
df2_spark.show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



### Removing rows with null values

In [None]:
# With no arguments, every row that contains at least one null is dropped.
df2_spark.na.drop().show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         2| 18000|
+---------+---+----------+------+



In [None]:
# how='all' drops a row only when every column in it is null.
# No row in this dataset is entirely null, so nothing is removed.
df2_spark.na.drop(how='all').show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



24/01/11 11:40:08 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [None]:
# how='any' drops a row as soon as one of its columns is null.
df2_spark.na.drop(how='any').show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         2| 18000|
+---------+---+----------+------+



In [None]:
df2_spark.na.drop(how="any",thresh=2).show()

## thresh=2 keeps only rows that have at least 2 non-null values (rows with fewer are dropped).
## When thresh is given it takes precedence over how, so how="any" has no effect here.

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
+---------+----+----------+------+



In [None]:
# subset restricts the null check to the listed columns: only rows whose Name is null are dropped.
df2_spark.na.drop(how="any",subset=['Name']).show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
+---------+----+----------+------+



In [None]:
# NOTE: this call changes nothing. A string fill value is only applied to string
# columns, and Experience was read as an integer (inferSchema=True), so the column
# is skipped and its nulls remain. Use a numeric fill value to replace them.
df2_spark.na.fill('Missing Values','Experience').show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [None]:
from pyspark.ml.feature import Imputer

# Mean imputation: every input column gets a companion "<name>_imputed" output column.
imputer = Imputer(
    strategy="mean",
    inputCols=['Age','Experience','Salary'],
    outputCols=[c + "_imputed" for c in ['Age','Experience','Salary']],
)

In [None]:
# Fit the imputer on the data, then transform the same data; the result carries the extra *_imputed columns.
imputer.fit(df2_spark).transform(df2_spark).show()

+---------+----+----------+------+-----------+------------------+--------------+
|     Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+---------+----+----------+------+-----------+------------------+--------------+
|    Krish|  31|        10| 30000|         31|                10|         30000|
|Sudhanshu|  30|         8| 25000|         30|                 8|         25000|
|    Sunny|  29|         4| 20000|         29|                 4|         20000|
|     Paul|  24|         3| 20000|         24|                 3|         20000|
|   Harsha|  21|         2| 18000|         21|                 2|         18000|
|   Mahesh|NULL|      NULL| 40000|         29|                 6|         40000|
|     NULL|  34|        10| 38000|         34|                10|         38000|
|     NULL|  36|      NULL|  NULL|         36|                 6|         27285|
+---------+----+----------+------+-----------+------------------+--------------+



In [None]:
# Median imputation: every input column gets a companion "<name>_imputed" output column.
imputer = Imputer(
    strategy="median",
    inputCols=['Age','Experience','Salary'],
    outputCols=[c + "_imputed" for c in ['Age','Experience','Salary']],
)
imputer.fit(df2_spark).transform(df2_spark).show()

+---------+----+----------+------+-----------+------------------+--------------+
|     Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+---------+----+----------+------+-----------+------------------+--------------+
|    Krish|  31|        10| 30000|         31|                10|         30000|
|Sudhanshu|  30|         8| 25000|         30|                 8|         25000|
|    Sunny|  29|         4| 20000|         29|                 4|         20000|
|     Paul|  24|         3| 20000|         24|                 3|         20000|
|   Harsha|  21|         2| 18000|         21|                 2|         18000|
|   Mahesh|NULL|      NULL| 40000|         30|                 4|         40000|
|     NULL|  34|        10| 38000|         34|                10|         38000|
|     NULL|  36|      NULL|  NULL|         36|                 4|         25000|
+---------+----+----------+------+-----------+------------------+--------------+



## Filtering

In [None]:
# inferSchema=True makes Age, Experience and Salary numeric columns instead of
# strings, so numeric comparisons on them do not depend on an implicit
# string-to-number cast. This matches how the other datasets are loaded.
df3 = spark.read.csv('Dataset/test3.csv',header=True,inferSchema=True)
df3.show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



### Filter Operations

In [None]:
# People whose salary is less than or equal to 20000 (condition written as a SQL expression string)
df3.filter("Salary <= 20000").show()

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|  Sunny| 29|         4| 20000|
|   Paul| 24|         3| 20000|
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



In [None]:
# Apply the salary filter, then keep only the Name and Age columns.
df3.filter("Salary <= 20000").select(['Name','Age']).show()

+-------+---+
|   Name|Age|
+-------+---+
|  Sunny| 29|
|   Paul| 24|
| Harsha| 21|
|Shubham| 23|
+-------+---+



In [None]:
# The salary condition written as a Column expression instead of a SQL string.
df3.filter(df3['Salary']<=20000).show()

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|  Sunny| 29|         4| 20000|
|   Paul| 24|         3| 20000|
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



In [None]:
# Both conditions must hold; & combines the two Column conditions.
# Each comparison is parenthesized because Python's & binds more tightly than <= and >.
df3.filter((df3['Salary']<=20000) &
        (df3['Experience']>2)).show()

+-----+---+----------+------+
| Name|Age|Experience|Salary|
+-----+---+----------+------+
|Sunny| 29|         4| 20000|
| Paul| 24|         3| 20000|
+-----+---+----------+------+



## Group By and Aggregate

In [None]:
# Load the dataset for this section, taking column names from the header row and inferring column types.
df4 = spark.read.csv('Dataset/test4.csv',header=True,inferSchema=True)