<a href="https://colab.research.google.com/github/sasansharifipour/Spark_Class/blob/main/Spark_DataFrame_manipulation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
!tar xf spark-3.0.1-bin-hadoop2.7.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.1-bin-hadoop2.7"

import findspark
findspark.init()

In [2]:
import pyspark

In [3]:
from pyspark.sql import SparkSession
sc = pyspark.SparkContext(appName="Col_manipulation")

In [4]:
spark = SparkSession.builder.getOrCreate()

In [5]:
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [24]:
data = [('patty','spring', 'baseball', 64),
        ('patty','autumn', 'soccer', 78),
        ('matty', 'autumn', 'hockey', 90),
        ('matty', 'spring', 'soccer', 64),
        ('cathy', 'spring', 'baseball', 100),
        ('cathy', 'autumn', 'hockey', 78),
        ('sandy', 'autumn', 'soccer', 50),
        ('joey', 'summer', 'soccer', 73),
        ('tammy', 'spring', 'soccer', 86),
        ('marley', 'autumn', 'hockey', 100) ]

In [25]:
# Create an rdd
rdd = sc.parallelize(data)

In [26]:
# create a dataframe from an rdd and name the columns
df = spark.createDataFrame(rdd, ['player', 'season', 'sport', 'ranking'])

In [27]:
df.show()

+------+------+--------+-------+
|player|season|   sport|ranking|
+------+------+--------+-------+
| patty|spring|baseball|     64|
| patty|autumn|  soccer|     78|
| matty|autumn|  hockey|     90|
| matty|spring|  soccer|     64|
| cathy|spring|baseball|    100|
| cathy|autumn|  hockey|     78|
| sandy|autumn|  soccer|     50|
|  joey|summer|  soccer|     73|
| tammy|spring|  soccer|     86|
|marley|autumn|  hockey|    100|
+------+------+--------+-------+



In [28]:
# Show average (mean) ranking
df.agg({'ranking' : 'avg'}).show()

+------------+
|avg(ranking)|
+------------+
|        78.3|
+------------+



In [29]:
# Show average (mean) ranking
df.agg({'ranking' : 'sum'}).show()

+------------+
|sum(ranking)|
+------------+
|         783|
+------------+



In [30]:
# Show average (mean) ranking
df.agg({'ranking' : 'min'}).show()

+------------+
|min(ranking)|
+------------+
|          50|
+------------+



In [31]:
# Show average (mean) ranking
df.agg({'ranking' : 'max'}).show()

+------------+
|max(ranking)|
+------------+
|         100|
+------------+



In [32]:
# Show average (mean) ranking
df.agg({'ranking' : 'stddev'}).show()

+------------------+
|   stddev(ranking)|
+------------------+
|16.248418726482623|
+------------------+



In [33]:
# Show average (mean) ranking
df.agg({'ranking' : 'variance'}).show()

+-----------------+
|variance(ranking)|
+-----------------+
|264.0111111111111|
+-----------------+



In [34]:
# Show average (mean) ranking
df.agg({'ranking' : 'count'}).show()

+--------------+
|count(ranking)|
+--------------+
|            10|
+--------------+



In [35]:
# Show the mean, min, max for ranking
df.agg( F.mean(df.ranking).alias('mean'),
       F.min(df.ranking).alias('min'),
       F.max(df.ranking).alias('max')).show()

+----+---+---+
|mean|min|max|
+----+---+---+
|78.3| 50|100|
+----+---+---+



In [36]:
df.groupby('player').agg({'ranking':'mean','sport':'count'}).show()

+------+------------+------------+
|player|avg(ranking)|count(sport)|
+------+------------+------------+
|marley|       100.0|           1|
| sandy|        50.0|           1|
|  joey|        73.0|           1|
| tammy|        86.0|           1|
| cathy|        89.0|           2|
| matty|        77.0|           2|
| patty|        71.0|           2|
+------+------------+------------+



In [37]:
df.groupby('player').agg({'ranking':'mean','ranking':'count'}).show()

+------+--------------+
|player|count(ranking)|
+------+--------------+
|marley|             1|
| sandy|             1|
|  joey|             1|
| tammy|             1|
| cathy|             2|
| matty|             2|
| patty|             2|
+------+--------------+

