## Importing necessary libraries
----

In [140]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

## Initializing Spark session :
-------

In [141]:
#   Initialize the Spark session
spark = SparkSession.builder \
    .master("local") \
    .appName("Exercise1") \
    .getOrCreate()

## Loading the dataset :
---

In [142]:
df = spark.read.format('org.apache.spark.sql.json') \
        .load("icc_results.json")

## Selecting team that face each other : teamA v teamB
----

In [145]:
teamA= df.select("scheduleEntry.team1.team.fullName")
teamA= teamA.dropna()
teamB= df.select("scheduleEntry.team2.team.fullName")
teamB= teamB.dropna()

teamA.show()
teamB.show()

+------------+
|    fullName|
+------------+
|South Africa|
|   Australia|
| New Zealand|
| New Zealand|
| New Zealand|
| New Zealand|
|   Australia|
| New Zealand|
|South Africa|
|   Australia|
|South Africa|
|   Australia|
|South Africa|
|   Australia|
| New Zealand|
|   Australia|
|South Africa|
| New Zealand|
|South Africa|
|   Australia|
+------------+
only showing top 20 rows

+-----------+
|   fullName|
+-----------+
|  Sri Lanka|
|      India|
|   Pakistan|
|   Pakistan|
|   Pakistan|
|   Pakistan|
|      India|
|West Indies|
|    England|
|      India|
|    England|
|      India|
|    England|
|      India|
|West Indies|
|      India|
|    England|
|West Indies|
|    England|
|      India|
+-----------+
only showing top 20 rows



## Creating a grouped dataframe of the number of occurences in the list above :
-----

In [146]:
gdfa = teamA.groupBy('fullName').count()
gdfa=gdfa.withColumnRenamed("count","countA")

gdfb = teamB.groupBy('fullName').count()
gdfb=gdfb.withColumnRenamed("count","countB")

gdfa.show()
gdfb.show()

+--------------------+------+
|            fullName|countA|
+--------------------+------+
|            Uva Next|     1|
|  Pakistan Under 19s|    21|
|       BCB Select XI|     3|
|              Sweden|     9|
|   Nigeria Under 19s|     7|
| Singapore Under 19s|     4|
|  Maldives Under 19s|     1|
|           Hampshire|     1|
|              Jersey|    25|
|         Philippines|     9|
|    Norway Under 19s|     2|
| Sunrisers Hyderabad|     3|
|           Singapore|    33|
|            Malaysia|    46|
|                Fiji|     8|
|Afghanistan Under...|    17|
|              Malawi|    10|
|  Scotland Under 19s|     9|
|Bangladesh Under 19s|    20|
|             Germany|    11|
+--------------------+------+
only showing top 20 rows

+--------------------+------+
|            fullName|countB|
+--------------------+------+
|            Uva Next|     1|
|  Pakistan Under 19s|    11|
|              Sweden|     6|
|   Nigeria Under 19s|    11|
| Singapore Under 19s|    13|
|  Maldives Un

## Merging count column from the grouped dataframes to get the total count of matches played by each country :
----

In [147]:
finaldf = gdfb.join(gdfa,"fullName",'outer')

finaldf = finaldf.withColumn('Number of Matches played',finaldf.countB + finaldf.countA)

finaldf = finaldf.drop('countA')

finaldf = finaldf.drop('countB')

finaldf.show()


+--------------------+------------------------+
|            fullName|Number of Matches played|
+--------------------+------------------------+
|         Afghanistan|                     207|
|Afghanistan Under...|                      38|
|           Argentina|                      15|
| Argentina Under 19s|                       4|
|       Auckland Aces|                    null|
|           Australia|                     439|
| Australia Under 19s|                      24|
|             Austria|                      15|
|       BCB Select XI|                    null|
|             Bahrain|                      25|
|   Bahrain Under 19s|                       9|
|          Bangladesh|                     275|
|Bangladesh Under 19s|                      32|
|             Belgium|                      17|
|   Belgium Under 19s|                       5|
|              Belize|                       9|
|             Bermuda|                      45|
|   Bermuda Under 19s|                  

## Finding out the country which played most number of matches:
---

In [148]:
result = finaldf.orderBy('Number of Matches played', ascending=False)

result.show()


+--------------------+------------------------+
|            fullName|Number of Matches played|
+--------------------+------------------------+
|               India|                     463|
|             England|                     443|
|           Australia|                     439|
|           Sri Lanka|                     432|
|            Pakistan|                     425|
|         New Zealand|                     399|
|         West Indies|                     381|
|        South Africa|                     376|
|            Zimbabwe|                     276|
|          Bangladesh|                     275|
|             Ireland|                     240|
|         Afghanistan|                     207|
|United Arab Emirates|                     173|
|            Scotland|                     163|
|         Netherlands|                     158|
|           Hong Kong|                     133|
|             Namibia|                     120|
|               Kenya|                  

# Result :
---

In [139]:
print('The most number of matches were played by {}'.format(result.collect()[0][0]),':',result.collect()[0][1])

The most number of matches were played by India : 463
