<a href="https://colab.research.google.com/github/sasansharifipour/Spark_Class/blob/main/Spark_DF_Columen_Operations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
!tar xf spark-3.0.1-bin-hadoop2.7.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.1-bin-hadoop2.7"

import findspark
findspark.init()

In [2]:
import pyspark

In [3]:
from pyspark.sql import SparkSession
# Use getOrCreate() instead of the bare constructor: constructing a second
# SparkContext in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once", so the original
# cell could not be re-run. getOrCreate() returns the active context if one
# exists and otherwise creates it with the given configuration.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("Col_Operations"))

In [4]:
# Entry point for the DataFrame API. getOrCreate() returns the existing
# session if there is one; otherwise it builds a new one.
spark = SparkSession.builder.getOrCreate()

In [5]:
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [6]:
# Sample records, one tuple per player: (player, season, sport, ranking)
data = [
    ("patty", "spring", "baseball", 64),
    ("matty", "autumn", "hockey", 90),
    ("cathy", "spring", "baseball", 100),
    ("sandy", "autumn", "soccer", 50),
    ("joey", "summer", "soccer", 73),
    ("tammy", "spring", "soccer", 86),
    ("marley", "autumn", "hockey", 100),
]

In [7]:
# Turn the local Python list `data` into an RDD via the SparkContext.
rdd = sc.parallelize(data)

In [8]:
# The bare repr only identifies the RDD object; none of its elements are printed.
rdd

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:262

In [9]:
# Fetch the first 4 elements as a plain Python list of tuples.
rdd.take(4)

[('patty', 'spring', 'baseball', 64),
 ('matty', 'autumn', 'hockey', 90),
 ('cathy', 'spring', 'baseball', 100),
 ('sandy', 'autumn', 'soccer', 50)]

In [10]:
# Build a DataFrame from the RDD. Only column names are supplied here, so the
# column types are inferred from the tuples.
df = spark.createDataFrame(
    data=rdd,
    schema=['player', 'season', 'sport', 'ranking'],
)

In [11]:
# Print the DataFrame as a text table (show() prints at most 20 rows by default).
df.show()

+------+------+--------+-------+
|player|season|   sport|ranking|
+------+------+--------+-------+
| patty|spring|baseball|     64|
| matty|autumn|  hockey|     90|
| cathy|spring|baseball|    100|
| sandy|autumn|  soccer|     50|
|  joey|summer|  soccer|     73|
| tammy|spring|  soccer|     86|
|marley|autumn|  hockey|    100|
+------+------+--------+-------+



In [13]:
# Limit the printed table to the first 4 rows.
df.show(n=4)

+------+------+--------+-------+
|player|season|   sport|ranking|
+------+------+--------+-------+
| patty|spring|baseball|     64|
| matty|autumn|  hockey|     90|
| cathy|spring|baseball|    100|
| sandy|autumn|  soccer|     50|
+------+------+--------+-------+
only showing top 4 rows



In [14]:
# head() returns the first row of the DataFrame as a Row object
# (it is the first record, not the column header).
df.head()

Row(player='patty', season='spring', sport='baseball', ranking=64)

In [15]:
# Number of rows in the DataFrame.
df.count()

7

In [16]:
# Summary statistics per column: count, mean, stddev, min, max.
# For the string columns mean/stddev are null and min/max are lexicographic.
df.describe().show()

+-------+------+------+--------+------------------+
|summary|player|season|   sport|           ranking|
+-------+------+------+--------+------------------+
|  count|     7|     7|       7|                 7|
|   mean|  null|  null|    null| 80.42857142857143|
| stddev|  null|  null|    null|18.884359867865463|
|    min| cathy|autumn|baseball|                50|
|    max| tammy|summer|  soccer|               100|
+-------+------+------+--------+------------------+



In [17]:
# Show the inferred schema: the Python ints in `ranking` were inferred as `long`.
df.printSchema()

root
 |-- player: string (nullable = true)
 |-- season: string (nullable = true)
 |-- sport: string (nullable = true)
 |-- ranking: long (nullable = true)



In [19]:
# select() returns a new DataFrame; without show() only its repr (column
# names and types) is displayed, not the rows.
df.select('player', 'ranking')

DataFrame[player: string, ranking: bigint]

In [20]:
# Project two columns and print the resulting rows.
df.select('player', 'ranking').show()

+------+-------+
|player|ranking|
+------+-------+
| patty|     64|
| matty|     90|
| cathy|    100|
| sandy|     50|
|  joey|     73|
| tammy|     86|
|marley|    100|
+------+-------+



In [22]:
# filter() also returns a new DataFrame; nothing is printed until show() is called.
df.filter(df['ranking'] > 80)

DataFrame[player: string, season: string, sport: string, ranking: bigint]

In [23]:
# Keep only the rows whose ranking is strictly greater than 80.
df.filter(df['ranking'] > 80).show()

+------+------+--------+-------+
|player|season|   sport|ranking|
+------+------+--------+-------+
| matty|autumn|  hockey|     90|
| cathy|spring|baseball|    100|
| tammy|spring|  soccer|     86|
|marley|autumn|  hockey|    100|
+------+------+--------+-------+



In [24]:
# Equality filter on a string column, using bracket-style column access.
df.filter(df['season'] == 'autumn').show()

+------+------+------+-------+
|player|season| sport|ranking|
+------+------+------+-------+
| matty|autumn|hockey|     90|
| sandy|autumn|soccer|     50|
|marley|autumn|hockey|    100|
+------+------+------+-------+



In [25]:
# Same filter with attribute-style column access; df.season and df['season']
# select the same rows.
df.filter(df.season == 'autumn').show()

+------+------+------+-------+
|player|season| sport|ranking|
+------+------+------+-------+
| matty|autumn|hockey|     90|
| sandy|autumn|soccer|     50|
|marley|autumn|hockey|    100|
+------+------+------+-------+



In [27]:
# Add a derived column through select(): '*' keeps every existing column and
# the aliased expression is appended as `new_ranking`.
df.select('*', (df['ranking'] - 15).alias('new_ranking')).show()

+------+------+--------+-------+-----------+
|player|season|   sport|ranking|new_ranking|
+------+------+--------+-------+-----------+
| patty|spring|baseball|     64|         49|
| matty|autumn|  hockey|     90|         75|
| cathy|spring|baseball|    100|         85|
| sandy|autumn|  soccer|     50|         35|
|  joey|summer|  soccer|     73|         58|
| tammy|spring|  soccer|     86|         71|
|marley|autumn|  hockey|    100|         85|
+------+------+--------+-------+-----------+



In [28]:
# withColumn() appends a computed column. Multiplying the integer ranking by
# 0.33 gives floating-point results, hence values such as 29.700000000000003.
df.withColumn('lowered_ranking', df['ranking'] * 0.33).show()

+------+------+--------+-------+------------------+
|player|season|   sport|ranking|   lowered_ranking|
+------+------+--------+-------+------------------+
| patty|spring|baseball|     64|             21.12|
| matty|autumn|  hockey|     90|29.700000000000003|
| cathy|spring|baseball|    100|              33.0|
| sandy|autumn|  soccer|     50|              16.5|
|  joey|summer|  soccer|     73|             24.09|
| tammy|spring|  soccer|     86|28.380000000000003|
|marley|autumn|  hockey|    100|              33.0|
+------+------+--------+-------+------------------+



In [30]:
# This works because `ranking` is still one of the selected columns when
# withColumn() references it.
df.select(df['player'], df['ranking']).withColumn('lowered_ranking', df['ranking'] * 0.33).show()

+------+-------+------------------+
|player|ranking|   lowered_ranking|
+------+-------+------------------+
| patty|     64|             21.12|
| matty|     90|29.700000000000003|
| cathy|    100|              33.0|
| sandy|     50|              16.5|
|  joey|     73|             24.09|
| tammy|     86|28.380000000000003|
|marley|    100|              33.0|
+------+-------+------------------+



In [31]:
# Demonstrates a failure: after select() only `player` and `season` remain, so
# withColumn() cannot resolve df['ranking'] and Spark raises AnalysisException.
# The exception is caught and printed so that "Run all" continues past this
# cell instead of stopping here.
from pyspark.sql.utils import AnalysisException

try:
    df.select(df['player'], df['season']).withColumn('lowered_ranking', df['ranking'] * 0.33).show()
except AnalysisException as err:
    print(f"{type(err).__name__}: {err}")

AnalysisException: ignored

In [33]:
# Computing the derived column inside the same select() works: the expression
# is resolved against `df` itself, which still contains `ranking`.
df.select('player', 'season', (df['ranking'] * 0.33).alias('new_ranking')).show()

+------+------+------------------+
|player|season|       new_ranking|
+------+------+------------------+
| patty|spring|             21.12|
| matty|autumn|29.700000000000003|
| cathy|spring|              33.0|
| sandy|autumn|              16.5|
|  joey|summer|             24.09|
| tammy|spring|28.380000000000003|
|marley|autumn|              33.0|
+------+------+------------------+



In [34]:
# sort() orders ascending by default (lowest ranking first).
df.sort(df['ranking']).show()

+------+------+--------+-------+
|player|season|   sport|ranking|
+------+------+--------+-------+
| sandy|autumn|  soccer|     50|
| patty|spring|baseball|     64|
|  joey|summer|  soccer|     73|
| tammy|spring|  soccer|     86|
| matty|autumn|  hockey|     90|
| cathy|spring|baseball|    100|
|marley|autumn|  hockey|    100|
+------+------+--------+-------+



In [36]:
# .desc() sorts highest ranking first. Rows tied on ranking (cathy and marley,
# both 100) may come out in either order: the two descending sorts in this
# notebook list them differently.
df.sort(df['ranking'].desc()).show()

+------+------+--------+-------+
|player|season|   sport|ranking|
+------+------+--------+-------+
|marley|autumn|  hockey|    100|
| cathy|spring|baseball|    100|
| matty|autumn|  hockey|     90|
| tammy|spring|  soccer|     86|
|  joey|summer|  soccer|     73|
| patty|spring|baseball|     64|
| sandy|autumn|  soccer|     50|
+------+------+--------+-------+



In [38]:
# Ascending sort again, this time with attribute-style column access.
df.sort(df.ranking).show()

+------+------+--------+-------+
|player|season|   sport|ranking|
+------+------+--------+-------+
| sandy|autumn|  soccer|     50|
| patty|spring|baseball|     64|
|  joey|summer|  soccer|     73|
| tammy|spring|  soccer|     86|
| matty|autumn|  hockey|     90|
| cathy|spring|baseball|    100|
|marley|autumn|  hockey|    100|
+------+------+--------+-------+



In [39]:
# Descending sort with attribute-style column access. The order of rows with
# equal ranking is not fixed; add a second sort key if a stable order is needed.
df.sort(df.ranking.desc()).show()

+------+------+--------+-------+
|player|season|   sport|ranking|
+------+------+--------+-------+
| cathy|spring|baseball|    100|
|marley|autumn|  hockey|    100|
| matty|autumn|  hockey|     90|
| tammy|spring|  soccer|     86|
|  joey|summer|  soccer|     73|
| patty|spring|baseball|     64|
| sandy|autumn|  soccer|     50|
+------+------+--------+-------+

