In [1]:
import pyspark
# Reuse the already-running SparkContext / SparkSession when this cell is
# executed again in the same kernel. Constructing a second SparkContext
# directly raises "ValueError: Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession.builder.getOrCreate()

# DataFrame from data source

In [2]:
# Remote location of the dataset and the file name it is retrieved under
# after being registered with the SparkContext.
n = 'https://raw.githubusercontent.com/sehgalayush1/movie_revenue/master/ml_modules/data/BollywoodMovieDetail.csv'
f = 'BollywoodMovieDetail.csv'

# Register the remote file with Spark, then read the local copy as CSV:
# the first row supplies the column names and column types are inferred.
sc.addFile(n)
movies = spark.read.csv(
    pyspark.SparkFiles.get(f),
    header=True,
    inferSchema=True,
)

# List the column names only

In [4]:
# Display the DataFrame's column names as a plain Python list of strings.
movies.columns

['imdbId',
 'title',
 'releaseYear',
 'releaseDate',
 'genre',
 'writers',
 'actors',
 'directors',
 'sequel',
 'hitFlop']

# select single column

In [9]:
# Indexing the DataFrame by name gives a Column object; keep it in a variable
# so other cells can reuse it (e.g. for ordering).
ratingCol = movies["hitFlop"]

# Project just that column and print the first five rows.
movies.select(ratingCol).show(n=5)

+-------+
|hitFlop|
+-------+
|      2|
|      6|
|      1|
|      4|
|      1|
+-------+
only showing top 5 rows



In [22]:
# Derive a descending sort expression from the column. The cell displays the
# resulting Column's repr rather than any rows of data.
ratingCol.desc()

Column<'hitFlop DESC NULLS LAST'>

# select single column by name string or with expr()

In [10]:
# A column can also be selected by passing its name as a plain string.
(
    movies
    .select("hitFlop")
    .show(n=5)
)

+-------+
|hitFlop|
+-------+
|      2|
|      6|
|      1|
|      4|
|      1|
+-------+
only showing top 5 rows



In [12]:
from pyspark.sql.functions import expr

# expr() accepts a SQL-style expression string; a bare column name is the
# simplest case and selects that column unchanged.
(
    movies
    .select(expr("hitFlop"))
    .show(n=5)
)

+-------+
|hitFlop|
+-------+
|      2|
|      6|
|      1|
|      4|
|      1|
+-------+
only showing top 5 rows



# Apply condition

In [13]:
 movies.select(expr("hitFlop > 5")).show(3)

+-------------+
|(hitFlop > 5)|
+-------------+
|        false|
|         true|
|        false|
+-------------+
only showing top 3 rows



In [15]:
 movies.select(movies["hitFlop"] > 5).show(3)

+-------------+
|(hitFlop > 5)|
+-------------+
|        false|
|         true|
|        false|
+-------------+
only showing top 3 rows



In [16]:
# Add column

In [18]:
# Append a boolean "Good Movies to Watch" column that is true when
# hitFlop > 5, and preview the first three rows. The result is only shown,
# not assigned back to `movies`.
(
    movies
    .withColumn("Good Movies to Watch", expr("hitFlop > 5"))
    .show(3)
)

+---------+--------------------+-----------+-----------+--------------------+--------------------+--------------------+--------------------+------+-------+--------------------+
|   imdbId|               title|releaseYear|releaseDate|               genre|             writers|              actors|           directors|sequel|hitFlop|Good Movies to Watch|
+---------+--------------------+-----------+-----------+--------------------+--------------------+--------------------+--------------------+------+-------+--------------------+
|tt0118578|              Albela|       2001|20 Apr 2001|             Romance|Honey Irani (scre...|Govinda | Aishwar...|       Deepak Sareen|     0|      2|               false|
|tt0169102|Lagaan: Once Upon...|       2001|08 May 2002|Adventure | Drama...|Ashutosh Gowarike...|Aamir Khan | Grac...|  Ashutosh Gowariker|     0|      6|                true|
|tt0187279|Meri Biwi Ka Jawa...|       2004|02 Jul 2004|     Action | Comedy|                 N/A|Akshay Kumar | Sr

# Sorting

In [23]:
 movies.sort(ratingCol.desc()).show(3)

+---------+--------------------+-----------+-----------+--------------------+--------------------+--------------------+--------------------+------+-------+
|   imdbId|               title|releaseYear|releaseDate|               genre|             writers|              actors|           directors|sequel|hitFlop|
+---------+--------------------+-----------+-----------+--------------------+--------------------+--------------------+--------------------+------+-------+
|tt0284137|Gadar: Ek Prem Katha|       2001|15 Jun 2001|Action | Drama | ...|   Shaktimaan Talwar|Sunny Deol | Amee...|         Anil Sharma|     0|      9|
|tt1187043|            3 Idiots|       2009|25 Dec 2009|      Comedy | Drama|Rajkumar Hirani |...|Aamir Khan | Kare...|     Rajkumar Hirani|     0|      9|
|tt1833673|            Dhoom: 3|       2013|20 Dec 2013|Action | Crime | ...|Vijay Krishna Ach...|Aamir Khan | Katr...|Vijay Krishna Ach...|     1|      9|
+---------+--------------------+-----------+-----------+--------