In [1]:
from pyspark.sql import SparkSession

# Spark session & context
spark = SparkSession.builder.master('local').getOrCreate()
sc = spark.sparkContext

In [2]:
players = spark.read.format('csv').option('header', True).load('./datasets/player.csv')

In [3]:
players.printSchema()

root
 |-- id: string (nullable = true)
 |-- player_api_id: string (nullable = true)
 |-- player_name: string (nullable = true)
 |-- player_fifa_api_id: string (nullable = true)
 |-- birthday: string (nullable = true)
 |-- height: string (nullable = true)
 |-- weight: string (nullable = true)



In [4]:
players.sample(0.1).show(5)

+---+-------------+--------------------+------------------+-------------------+------+------+
| id|player_api_id|         player_name|player_fifa_api_id|           birthday|height|weight|
+---+-------------+--------------------+------------------+-------------------+------+------+
| 13|       163222|      Aaron Muirhead|            213568|1990-08-30 00:00:00|187.96|   168|
| 15|        75489|        Aaron Ramsey|            186561|1990-12-26 00:00:00| 177.8|   154|
| 22|       245653|Abdelfettah Boukh...|            202425|1986-10-22 00:00:00|185.42|   161|
| 30|        41659|    Abderrazak Jadid|            149241|1983-06-01 00:00:00| 177.8|   157|
| 39|       191784|        Abdoulaye Ba|            204826|1991-01-01 00:00:00|198.12|   174|
+---+-------------+--------------------+------------------+-------------------+------+------+
only showing top 5 rows



In [5]:
player_attributes = spark.read.format('csv').option('header', True).load('./datasets/player_Attributes.csv')

In [6]:
player_attributes.printSchema()

root
 |-- id: string (nullable = true)
 |-- player_fifa_api_id: string (nullable = true)
 |-- player_api_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- overall_rating: string (nullable = true)
 |-- potential: string (nullable = true)
 |-- preferred_foot: string (nullable = true)
 |-- attacking_work_rate: string (nullable = true)
 |-- defensive_work_rate: string (nullable = true)
 |-- crossing: string (nullable = true)
 |-- finishing: string (nullable = true)
 |-- heading_accuracy: string (nullable = true)
 |-- short_passing: string (nullable = true)
 |-- volleys: string (nullable = true)
 |-- dribbling: string (nullable = true)
 |-- curve: string (nullable = true)
 |-- free_kick_accuracy: string (nullable = true)
 |-- long_passing: string (nullable = true)
 |-- ball_control: string (nullable = true)
 |-- acceleration: string (nullable = true)
 |-- sprint_speed: string (nullable = true)
 |-- agility: string (nullable = true)
 |-- reactions: string (nullable = true

In [7]:
player_attributes.count(), players.count()

(183978, 11060)

In [8]:
player_attributes = player_attributes.drop('id',
 'player_fifa_api_id','preferred_foot','attacking_work_rate',
 'defensive_work_rate',
 'crossing','sprint_speed','jumping','reactions',
 'balance')
player_attributes.columns

['player_api_id',
 'date',
 'overall_rating',
 'potential',
 'finishing',
 'heading_accuracy',
 'short_passing',
 'volleys',
 'dribbling',
 'curve',
 'free_kick_accuracy',
 'long_passing',
 'ball_control',
 'acceleration',
 'agility',
 'shot_power',
 'stamina',
 'strength',
 'long_shots',
 'aggression',
 'interceptions',
 'positioning',
 'vision',
 'penalties',
 'marking',
 'standing_tackle',
 'sliding_tackle',
 'gk_diving',
 'gk_handling',
 'gk_kicking',
 'gk_positioning',
 'gk_reflexes']

In [9]:
players = players.drop('id', 'player_fifa_api_id')
players.columns

['player_api_id', 'player_name', 'birthday', 'height', 'weight']

In [10]:
player_attributes = player_attributes.dropna()
players = players.dropna()

In [11]:
from pyspark.sql.functions import udf

In [12]:
year_extract_fn = udf(lambda x: x.split('-')[0])

In [13]:
player_attributes = player_attributes.withColumn('year', year_extract_fn(player_attributes.date))

In [14]:
player_attributes.select('year').groupBy('year').agg({'year': 'count'}).show(4)

+----+-----------+
|year|count(year)|
+----+-----------+
|2016|      14098|
|2012|      12625|
|2014|      32792|
|2013|      38830|
+----+-----------+
only showing top 4 rows



### 1. Find the best striker

In [40]:
pa_2016 = player_attributes.filter(player_attributes.year == 2016)

#### How many players in year 2016?

In [16]:
pa_2016.select(pa_2016.player_api_id).distinct().count()

5586

In [17]:
pa_strikers_2016 = pa_2016.groupBy(pa_2016.player_api_id) \
    .agg({
         'finishing': 'avg',
         'dribbling': 'avg',
         'curve': 'avg',
         'shot_power': 'avg',
         'long_shots': 'avg'
    })

In [18]:
pa_strikers_2016.show(5)

+-------------+-----------------+---------------+-----------------+-----------------+---------------+
|player_api_id|       avg(curve)|avg(shot_power)|   avg(finishing)|   avg(dribbling)|avg(long_shots)|
+-------------+-----------------+---------------+-----------------+-----------------+---------------+
|       309726|60.77777777777778|           76.0|75.44444444444444|70.33333333333333|           67.0|
|        26112|             76.0|           76.0|             53.0|             73.0|           61.0|
|        38433|             78.5|           74.0|            68.25|             81.5|           73.0|
|       295060|             27.0|           40.0|             25.0|             56.0|           22.0|
|       161396|             63.0|           69.0|             29.0|             70.0|           58.0|
+-------------+-----------------+---------------+-----------------+-----------------+---------------+
only showing top 5 rows



In [19]:
pa_strikers_2016 = pa_strikers_2016 \
    .withColumnRenamed('avg(curve)', 'curve') \
    .withColumnRenamed('avg(shot_power)', 'shot_power') \
    .withColumnRenamed('avg(finishing)', 'finishing') \
    .withColumnRenamed('avg(dribbling)', 'dribbling') \
    .withColumnRenamed('avg(long_shots)', 'long_shot')

In [20]:
pa_strikers_2016.show(5)

+-------------+-----------------+----------+-----------------+-----------------+---------+
|player_api_id|            curve|shot_power|        finishing|        dribbling|long_shot|
+-------------+-----------------+----------+-----------------+-----------------+---------+
|       309726|60.77777777777778|      76.0|75.44444444444444|70.33333333333333|     67.0|
|        26112|             76.0|      76.0|             53.0|             73.0|     61.0|
|        38433|             78.5|      74.0|            68.25|             81.5|     73.0|
|       295060|             27.0|      40.0|             25.0|             56.0|     22.0|
|       161396|             63.0|      69.0|             29.0|             70.0|     58.0|
+-------------+-----------------+----------+-----------------+-----------------+---------+
only showing top 5 rows



In [21]:
pa_strikers_2016 = pa_strikers_2016.withColumn(
    'striker_grade',
    pa_strikers_2016.curve * 20/100 + pa_strikers_2016.long_shot * 10/100 + pa_strikers_2016.finishing * 50/100 + pa_strikers_2016.dribbling * 10/100 + pa_strikers_2016.long_shot * 10/100
)

In [22]:
pa_strikers_2016.filter(pa_strikers_2016.striker_grade > 70).sort('striker_grade', ascending=False).show(5)

+-------------+-----+----------+---------+---------+---------+-----------------+
|player_api_id|curve|shot_power|finishing|dribbling|long_shot|    striker_grade|
+-------------+-----+----------+---------+---------+---------+-----------------+
|        37412| 82.0|      87.0|     90.0|     89.0|     84.0|87.10000000000001|
|        30834| 86.0|      86.0|     85.0|     93.0|     90.0|             87.0|
|        38817| 84.0|      88.5|     88.0|    86.25|    87.25|           86.875|
|        39122| 85.0|      82.0|     87.0|     84.0|     84.0|85.70000000000002|
|       325916| 83.5|      77.0|     87.0|     89.0|     83.0|             85.7|
+-------------+-----+----------+---------+---------+---------+-----------------+
only showing top 5 rows



In [23]:
striker_details = pa_strikers_2016 \
    .select('player_api_id', 'striker_grade') \
    .join(players, players.player_api_id == pa_strikers_2016.player_api_id) \
    .sort('striker_grade', ascending=False)

In [24]:
striker_details.show(10)

+-------------+-----------------+-------------+------------------+-------------------+------+------+
|player_api_id|    striker_grade|player_api_id|       player_name|           birthday|height|weight|
+-------------+-----------------+-------------+------------------+-------------------+------+------+
|        37412|87.10000000000001|        37412|     Sergio Aguero|1988-06-02 00:00:00|172.72|   163|
|        30834|             87.0|        30834|      Arjen Robben|1984-01-23 00:00:00|180.34|   176|
|        38817|           86.875|        38817|      Carlos Tevez|1984-02-05 00:00:00|172.72|   157|
|        39122|85.70000000000002|        39122|             Jonas|1984-04-01 00:00:00|180.34|   165|
|       325916|             85.7|       325916|      Paulo Dybala|1993-11-15 00:00:00|175.26|   161|
|       129944|85.33333333333331|       129944|        Marco Reus|1989-05-31 00:00:00|180.34|   165|
|       164684|             85.0|       164684|   James Rodriguez|1991-07-12 00:00:00|180.3

In [25]:
from pyspark.sql.functions import broadcast

In [32]:
striker_details = players \
    .select('player_api_id', 'player_name') \
    .join(broadcast(pa_strikers_2016), players.player_api_id == pa_strikers_2016.player_api_id, 'inner') \
    .sort('striker_grade', ascending=False)

In [33]:
striker_details.show(5)

+-------------+-------------+-------------+-----+----------+---------+---------+---------+-----------------+
|player_api_id|  player_name|player_api_id|curve|shot_power|finishing|dribbling|long_shot|    striker_grade|
+-------------+-------------+-------------+-----+----------+---------+---------+---------+-----------------+
|        37412|Sergio Aguero|        37412| 82.0|      87.0|     90.0|     89.0|     84.0|87.10000000000001|
|        30834| Arjen Robben|        30834| 86.0|      86.0|     85.0|     93.0|     90.0|             87.0|
|        38817| Carlos Tevez|        38817| 84.0|      88.5|     88.0|    86.25|    87.25|           86.875|
|        39122|        Jonas|        39122| 85.0|      82.0|     87.0|     84.0|     84.0|85.70000000000002|
|       325916| Paulo Dybala|       325916| 83.5|      77.0|     87.0|     89.0|     83.0|             85.7|
+-------------+-------------+-------------+-----+----------+---------+---------+---------+-----------------+
only showing top 5 

### 2.Does height of player affect their heading attributes?

In [43]:
pa_heading_acc = player_attributes.groupBy(player_attributes.player_api_id) \
    .agg({
        'heading_accuracy': 'avg'
    }) \
    .withColumnRenamed('avg(heading_accuracy)', 'heading_accuracy')
pa_heading_acc.show(5)

+-------------+------------------+
|player_api_id|  heading_accuracy|
+-------------+------------------+
|        24504|             77.32|
|       309726|            69.625|
|        74468|              50.0|
|        26112|  69.6842105263158|
|        38433|57.833333333333336|
+-------------+------------------+
only showing top 5 rows



In [48]:
players_heading_acc = pa_heading_acc \
    .join(broadcast(players), pa_heading_acc.player_api_id == players.player_api_id) \
    .select('player_name', 'height', 'heading_accuracy')

In [49]:
players_heading_acc.show(5)

+--------------------+------+------------------+
|         player_name|height|  heading_accuracy|
+--------------------+------+------------------+
|Alessandro Gamberini|185.42|             77.32|
|      Andrea Belotti|180.34|            69.625|
|        Andrea Conti| 177.8|              50.0|
|      Benoit Cheyrou|182.88|  69.6842105263158|
|        Borja Valero|175.26|57.833333333333336|
+--------------------+------+------------------+
only showing top 5 rows



In [50]:
short_count = sc.accumulator(0)
med_count = sc.accumulator(0)
tall_count = sc.accumulator(0)

In [51]:
def count_players_by_height(row):
    height = float(row.height)
    if height <= 175:
        short_count.add(1)
    elif height > 175 and height <= 185:
        med_count.add(1)
    else:
        tall_count.add(1)

In [52]:
players_heading_acc.foreach(count_players_by_height)

In [54]:
short_count.value, med_count.value, tall_count.value

(1086, 5725, 3771)

In [55]:
short_ha_count = sc.accumulator(0)
med_ha_count = sc.accumulator(0)
tall_ha_count = sc.accumulator(0)

In [56]:
def count_players_by_ha(row):
    height = float(row.height)
    ha = float(row.heading_accuracy)
    if ha < 80:
        return
    if height <= 175:
        short_ha_count.add(1)
    elif height > 175 and height <= 185:
        med_ha_count.add(1)
    else:
        tall_ha_count.add(1)

In [57]:
players_heading_acc.foreach(count_players_by_ha)

In [58]:
short_ha_count.value, med_ha_count.value, tall_ha_count.value

(0, 44, 198)