In [9]:
import findspark
findspark.init()

In [10]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("PySpark").getOrCreate()
spark

In [11]:
data = [['tom', 10], ['nick', 15], ['juli', 14]]
df = spark.createDataFrame(data, ['Name', 'Age'])

In [12]:
df.show()

+----+---+
|Name|Age|
+----+---+
| tom| 10|
|nick| 15|
|juli| 14|
+----+---+



In [13]:
df.toPandas()

Unnamed: 0,Name,Age
0,tom,10
1,nick,15
2,juli,14


In [14]:
df.columns

['Name', 'Age']

In [15]:
df.count()

3

In [17]:
df = spark.read.csv("students.csv", header=True)
df.toPandas()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [18]:
# Aggregating data
df.groupBy("gender").agg({'math score': 'mean'}).show()

+------+------------------+
|gender|   avg(math score)|
+------+------------------+
|female|63.633204633204635|
|  male| 68.72821576763485|
+------+------------------+



In [19]:
from pyspark.sql import functions as F
df.groupBy("gender").agg(F.min("math score"), F.max("math score"), F.avg("math score")).show()

+------+---------------+---------------+------------------+
|gender|min(math score)|max(math score)|   avg(math score)|
+------+---------------+---------------+------------------+
|female|              0|             99|63.633204633204635|
|  male|            100|             99| 68.72821576763485|
+------+---------------+---------------+------------------+



### Spark Immutability
Spark DataFrame's are built on top of RDDs which are immutable in nature, hence Data frames are immutable in nature as well.

So if you make a change to a dataframe like adding a column or changing any of the values in the dataframe using the same naming convention, it will generate a new dataframe (with a new unique ID) instead of updating the existing data frame.

In [20]:
df.rdd.id()

73

In [21]:
df2 = df
df2.rdd.id()

73

In [22]:
# new df generated
df = df.withColumn('new_col', df['math score'] * 2)
df.rdd.id()

77

### Lazy Computation
As the name itself indicates its definition, lazy evaluation in Spark means that the execution will not start until it absolutuley HAS to. 
The benefit is saving resources and optimizing the Spark cluster overall. 

In [23]:
# This kind of commands won't be run.
df = df.withColumn('new_col', df['math score'] * 2)

# until this kind of commands
collect = df.collect()