
In the notebook, we will work with a cricket commentary data.


In [1]:
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.types as tp
from pyspark.sql import functions as F

In [2]:
sc = SparkContext()

In [3]:
spark = SparkSession.builder.getOrCreate()

---

#### `Read the CSV File`

---

In [4]:
data = spark.read.csv("dataset/ind-ban-comment.csv",inferSchema=True, header=True)

In [5]:
## check the schema of the dataframe
data.printSchema()


root
 |-- _c0: integer (nullable = true)
 |-- Batsman: integer (nullable = true)
 |-- Batsman_Name: string (nullable = true)
 |-- Bowler: integer (nullable = true)
 |-- Bowler_Name: string (nullable = true)
 |-- Commentary: string (nullable = true)
 |-- Detail: string (nullable = true)
 |-- Dismissed: double (nullable = true)
 |-- Id: integer (nullable = true)
 |-- Isball: boolean (nullable = true)
 |-- Isboundary: double (nullable = true)
 |-- Iswicket: double (nullable = true)
 |-- Over: double (nullable = true)
 |-- Runs: integer (nullable = true)
 |-- Summary: string (nullable = true)
 |-- Timestamp: string (nullable = true)
 |-- ZAD: string (nullable = true)



In [6]:
# VIEW THE TOP 4 ROWS OF THE DATA USING THE SHOW FUNCTION

data.show(n=4)


+---+-------+-----------------+------+-----------------+--------------------+------+---------+---+------+----------+--------+----+----+-------+-------------------+-------+
|_c0|Batsman|     Batsman_Name|Bowler|      Bowler_Name|          Commentary|Detail|Dismissed| Id|Isball|Isboundary|Iswicket|Over|Runs|Summary|          Timestamp|    ZAD|
+---+-------+-----------------+------+-----------------+--------------------+------+---------+---+------+----------+--------+----+----+-------+-------------------+-------+
|  0|  28994|   Mohammed Shami| 63881|Mustafizur Rahman|OUT! Bowled! 5-fe...|     W|  28994.0|346|  true|      null|     1.0|49.6|   0|   null|2019-07-02 13:18:47|   null|
|  1|   5132|Bhuvneshwar Kumar| 63881|Mustafizur Rahman|WIDE AND RUN OUT!...|  W+wd|   5132.0|344|  true|      null|     1.0|49.6|   1|   null|2019-07-02 13:17:28|   null|
|  2|  28994|   Mohammed Shami| 63881|Mustafizur Rahman|Back of a length ...|  null|     null|343|  true|      null|    null|49.5|   1|   nu

---

#### `View only the following columns of the dataframe`

    - Batsman_Name
    - Bowler_Name
    - Dismissed
    - Isboundary
    - Runs

---

In [7]:

select_columns = ["Batsman_Name","Bowler_Name","Dismissed","Isboundary","Runs"]
data.select(*select_columns).show()

+-----------------+------------------+---------+----------+----+
|     Batsman_Name|       Bowler_Name|Dismissed|Isboundary|Runs|
+-----------------+------------------+---------+----------+----+
|   Mohammed Shami| Mustafizur Rahman|  28994.0|      null|   0|
|Bhuvneshwar Kumar| Mustafizur Rahman|   5132.0|      null|   1|
|   Mohammed Shami| Mustafizur Rahman|     null|      null|   1|
|Bhuvneshwar Kumar| Mustafizur Rahman|     null|      null|   1|
|         MS Dhoni| Mustafizur Rahman|   3676.0|      null|   0|
|         MS Dhoni| Mustafizur Rahman|     null|      null|   0|
|         MS Dhoni| Mustafizur Rahman|     null|      null|   0|
|         MS Dhoni|Mohammad Saifuddin|     null|      null|   1|
|         MS Dhoni|Mohammad Saifuddin|     null|       1.0|   4|
|         MS Dhoni|Mohammad Saifuddin|     null|      null|   0|
|         MS Dhoni|Mohammad Saifuddin|     null|      null|   0|
|         MS Dhoni|Mohammad Saifuddin|     null|       1.0|   4|
|         MS Dhoni|Mohamm

---

#### Find out the number of runs scored by each batsman

---

In [8]:

batsman_group = data.groupBy("Batsman_Name")
batsman_runs = batsman_group.agg(F.count("Runs").alias("Runs_Scored"))

In [9]:
# SHOW THE BATSMAN SCORED BY EACH RUN IN DESCENDING ORDER
order_batsman_runs = batsman_runs.orderBy("Runs_Scored",ascending = False)
order_batsman_runs.show()

+------------------+-----------+
|      Batsman_Name|Runs_Scored|
+------------------+-----------+
|      Rohit Sharma|         94|
|          KL Rahul|         93|
|   Shakib Al Hasan|         75|
|      Rishabh Pant|         43|
|Mohammad Saifuddin|         42|
|     Sabbir Rahman|         40|
|     Soumya Sarkar|         39|
|          MS Dhoni|         33|
|       Tamim Iqbal|         31|
|       Virat Kohli|         27|
|         Liton Das|         24|
|   Mushfiqur Rahim|         23|
|     Rubel Hossain|         11|
|    Dinesh Karthik|          9|
|  Mosaddek Hossain|          7|
|  Mashrafe Mortaza|          5|
| Bhuvneshwar Kumar|          4|
|    Mohammed Shami|          2|
|     Hardik Pandya|          2|
| Mustafizur Rahman|          1|
+------------------+-----------+



---

#### Which batsman scored the highest number of boundaries

---

In [10]:

def filter_nulls(x):
    if x is not None:
        return int(x)
    else:
        return 0
    
udf_filter = F.udf(f= filter_nulls, returnType = tp.IntegerType())
df_bound = data.withColumn("Boundary",udf_filter(data["Isboundary"]))

In [11]:
df_bound.groupBy("Batsman_Name").agg(F.sum("Boundary").alias("Boundaries")).orderBy("Boundaries",ascending = False).limit(1)

Batsman_Name,Boundaries
Rohit Sharma,12


---

**Define a `udf` function that will create a new column on the basis of following condition**

If the value of `Runs` is less than 2, then assign `A`, if value is between `3 to 5` then assign `B` else assign `C`


---

In [12]:

def runs_condition(x):
    if x < 2:
        return 'A'
    elif x >= 3 and x<= 5:
        return 'B'
    else:
        return 'C'

In [13]:
udf_runs = F.udf(f = runs_condition, returnType = tp.StringType())

df_runs = data.withColumn("categoryOfRuns", udf_runs(data['Runs']))


In [14]:
select_columns.append("categoryOfRuns")


NameError: name 'final' is not defined

In [15]:
df_runs.select(*select_columns).show()

+-----------------+------------------+---------+----------+----+--------------+
|     Batsman_Name|       Bowler_Name|Dismissed|Isboundary|Runs|categoryOfRuns|
+-----------------+------------------+---------+----------+----+--------------+
|   Mohammed Shami| Mustafizur Rahman|  28994.0|      null|   0|             A|
|Bhuvneshwar Kumar| Mustafizur Rahman|   5132.0|      null|   1|             A|
|   Mohammed Shami| Mustafizur Rahman|     null|      null|   1|             A|
|Bhuvneshwar Kumar| Mustafizur Rahman|     null|      null|   1|             A|
|         MS Dhoni| Mustafizur Rahman|   3676.0|      null|   0|             A|
|         MS Dhoni| Mustafizur Rahman|     null|      null|   0|             A|
|         MS Dhoni| Mustafizur Rahman|     null|      null|   0|             A|
|         MS Dhoni|Mohammad Saifuddin|     null|      null|   1|             A|
|         MS Dhoni|Mohammad Saifuddin|     null|       1.0|   4|             B|
|         MS Dhoni|Mohammad Saifuddin|  