- PySpark is an interface for Apache Spark in Python, often used for large scale data processing and machine learning.
- Why is Apache Spark useful?
    - Suppose we have a huge amount of data, but the maximum RAM of a single machine is 32 or 64 GB.
    - What if the data is 128 GB and the RAM is only 32 GB? The data does not fit in memory, so we have to process it in a distributed manner, i.e. on a distributed system.
    - This is where Apache Spark helps us.
- It runs in cluster mode, i.e. in a distributed mode.

## Installation
- !pip install pyspark
- Before starting to work with pyspark we need to start a `Spark Session`.


In [None]:
# Create (or reuse) a Spark session
from pyspark.sql import SparkSession

# 'practice' is the application name of the session
spark = SparkSession.builder.appName('practice').getOrCreate()

## Basics of DataFrame
# Read the dataset with default options (no header handling, no type inference)
df_pyspark = spark.read.csv('test1.csv')
df_pyspark.show()

# Read again, treating the first row as the header.
# show() only prints the table and returns None, so keep the DataFrame in the
# variable and display it with a separate call.
df_pyspark = spark.read.option('header', 'true').csv('test1.csv')
df_pyspark.show()

# See more information about the columns, i.e. check the schema
# (similar to df.info() in pandas)
df_pyspark.printSchema()

# By default every value in the file is read as a string.
# To read values with proper data types, set `inferSchema=True` while reading
df_pyspark = spark.read.option('header', 'true').csv('test1.csv', inferSchema=True)
df_pyspark.printSchema()

# Another way to read: pass the options as keyword arguments
df_pyspark = spark.read.csv('test1.csv', header=True, inferSchema=True)
df_pyspark.show()

In [None]:
# List the column names
df_pyspark.columns

# First three rows
df_pyspark.head(3)

# Pick one column and look at all its elements.
# select() gives back a DataFrame; show() displays it
name_only = df_pyspark.select('Name')
name_only.show()

# Pick multiple columns
name_and_experience = df_pyspark.select(['Name', 'Experience'])
name_and_experience.show()

#### Slicing does not work in PySpark

# Indexing by name yields a Column, which has no show() function
df_pyspark['Name']

# Check the data types
df_pyspark.dtypes

# describe(), similar to pandas; show() prints it in table format
summary = df_pyspark.describe()
summary.show()

# Add a column to the DataFrame (displayed only, not saved)
experience_plus_two = df_pyspark['Experience'] + 2
df_pyspark.withColumn('Experience After 2 years', experience_plus_two).show()

# Assign the result back to keep the change
df_pyspark = df_pyspark.withColumn('Experience After 2 years', experience_plus_two)

# Drop a column from the DataFrame (takes a column name)
df_pyspark = df_pyspark.drop('Experience After 2 years')

# Rename a column
df_pyspark.withColumnRenamed('Name', 'New Name')

In [3]:
# PySpark Handling Missing values

# Imported here so that this cell can run on its own, like the other cells
from pyspark.sql import SparkSession
from pyspark.ml.feature import Imputer

spark = SparkSession.builder.appName('Practice').getOrCreate()

df_pyspark = spark.read.csv('test2.csv', header=True, inferSchema=True)
df_pyspark.show()

# Dropping columns (the result is only displayed; df_pyspark is not reassigned)
df_pyspark.drop('Name').show()

# Wherever there is a null value, the whole row is dropped
df_pyspark.na.drop().show()

## Filling missing values
# Wherever there is a null, fill it with the given value.
# NOTE: per the PySpark docs, a string fill value is only applied to string
# columns; non-string columns are ignored.
df_pyspark.na.fill('Missing value').show()
df_pyspark.na.fill('Missing value', ['Experience', 'Age']).show()  # Restricted to a subset of columns

# Dropping rows

## Parameters of na.drop()
# how: "any" (the default) drops a row that has at least one null,
#      "all" drops a row only when all of its values are null
df_pyspark.na.drop(how="all").show()
# thresh: at least this many non-null values must be present, otherwise the
# row is dropped. When thresh is given it overrides `how`, so `how` is omitted.
df_pyspark.na.drop(thresh=2).show()
# subset: only consider some specific subset of columns
df_pyspark.na.drop(how="all", subset=['Experience']).show()

# Handling missing values by mean, median, and mode,
# using an Imputer to impute the values
columns_to_impute = ['age', 'Experience', 'Salary']
imputer = Imputer(
    inputCols=columns_to_impute,
    outputCols=["{}_imputed".format(c) for c in columns_to_impute]
).setStrategy("mean")

# Fit and transform
imputer.fit(df_pyspark).transform(df_pyspark).show()

In [None]:
# PySpark Dataframes
'''
- Filter Operation
- & | ==
- ~
'''

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('dataframe').getOrCreate()
df_pyspark = spark.read.csv('test1.csv', header=True, inferSchema=True)
df_pyspark.show()

# Filter operations
## Way 1: pass the condition as a string
# People whose salary is less than or equal to 20000
low_paid = df_pyspark.filter("Salary<=20000")
low_paid.show()
low_paid.select(['Name', 'age']).show()

## Way 2: build the condition from Column expressions
salary = df_pyspark['Salary']
at_most_20k = salary <= 20000
df_pyspark.filter(at_most_20k).show()

# Combine conditions with &; a comparison written inline must be parenthesised
df_pyspark.filter(at_most_20k & (salary >= 15000)).show()

# Negate a condition with ~
df_pyspark.filter(~at_most_20k).show()

In [None]:
# PySpark GroupBy and Aggregate functions
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Agg').getOrCreate()
df_pyspark = spark.read.csv('test3.csv', header=True, inferSchema=True)
df_pyspark.show()
df_pyspark.printSchema()

## Group By
### Group by name and sum within each group
df_pyspark.groupBy('Name').sum().show()

### Group by department: sum, mean and row count of each group
by_department = df_pyspark.groupBy('Departments')
by_department.sum().show()
by_department.mean().show()
by_department.count().show()

### Direct aggregate functions: sum of Salary over the whole DataFrame
df_pyspark.agg({'Salary': 'sum'}).show()

In [None]:
# Example of PySpark ML

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

spark = SparkSession.builder.appName('ML').getOrCreate()

training = spark.read.csv('test1.csv', header=True, inferSchema=True)
training.show()
training.printSchema()
training.columns

# VectorAssembler groups the feature columns into one output column
assembler = VectorAssembler(
    inputCols=['age', 'Experience'],
    outputCol='Independent Features',
)
output = assembler.transform(training)
output.show()

# Keep only the assembled features and the label column
finalized_data = output.select('Independent Features', "Salary")
finalized_data.show()

# Train/test split with weights 0.75 and 0.25
train_data, test_data = finalized_data.randomSplit([0.75, 0.25])

# Fit a linear regression on the training split.
# `regressor` is rebound from the estimator to the result of fit()
regressor = LinearRegression(
    featuresCol='Independent Features',
    labelCol='Salary',
)
regressor = regressor.fit(train_data)

regressor.coefficients
regressor.intercept

# Prediction: evaluate on the test split and show the predictions
pred_results = regressor.evaluate(test_data)
pred_results.predictions.show()

pred_results.meanAbsoluteError
pred_results.meanSquaredError

## Handling Categorical Features
# NOTE(review): the columns used below ('sex', 'smoker', 'day', 'time') are not
# used by any other visible part of this file; confirm that `df_pyspark`
# holds a dataset containing them before running this part.
from pyspark.ml.feature import StringIndexer

# Single column
indexer = StringIndexer(inputCol='sex', outputCol='sex_indexed')

# Multiple columns (this rebinds `indexer`, so only this one is fitted below)
indexer = StringIndexer(
    inputCols=['smoker', 'day', 'time'],
    outputCols=['smoker_indexed', 'day_indexed', 'time_indexed'],
)
indexer_model = indexer.fit(df_pyspark)
df_r = indexer_model.transform(df_pyspark)
df_r.show()