In [53]:
from pyspark.sql.functions import when
from pyspark.sql import SparkSession
from ipynb.fs.full.data_extraction import create_leagues,init_spark,create_matches
import datetime
import matplotlib.pyplot as plt
import heapq
%matplotlib qt
import numpy as np
from matplotlib.figure import Figure

In [54]:
sc = init_spark().sparkContext
leagues = create_leagues()
matches = create_matches()
matches.show(5)

+------------+-------------+--------------+---------+------------+-----------------+--------------+-------------+-------+
|home_team_id|    home_team|home_team_goal|league_id|away_team_id|        away_team|away_team_goal|       winner|country|
+------------+-------------+--------------+---------+------------+-----------------+--------------+-------------+-------+
|       10189|FC Schalke 04|             3|     7809|        9790|     Hamburger SV|             2|FC Schalke 04|Germany|
|        9875|       Napoli|             1|    10257|        8530|          Catania|             0|       Napoli|  Italy|
|        8530|      Catania|             2|    10257|        8690|  Reggio Calabria|             0|      Catania|  Italy|
|        9989|    Lierse SK|             0|        1|        9985|Standard de Liège|             0|         Draw|Belgium|
|        9880|       Cesena|             0|    10257|        9857|          Bologna|             2|      Bologna|  Italy|
+------------+----------

In [55]:
def get_all_leagues():
    countries = leagues.select("name").collect()
    countries = list(map(lambda x: x.name, countries))
    return countries

In [56]:
def get_all_teams_in_league(matches_df):
    matches_df.cache()
    all_leagues = get_all_leagues()
    all_teams_in_leagues = {}
    for league in all_leagues:
        list_of_teams = matches_df.filter(matches_df["country"] == league).select("home_team").collect()
        result = sc.parallelize(list_of_teams).map(lambda x: x.home_team).distinct().collect()
        all_teams_in_leagues[league] = result
    return all_teams_in_leagues


In [57]:
def get_mean_home_win_percentage(home_win_percentage_by_team):
    mean = 0
    for key,value in home_win_percentage_by_team.items():
        mean += value
    return mean/len(home_win_percentage_by_team.keys())

In [58]:
def get_win_percentage():
    all_teams_in_league = get_all_teams_in_league(matches)
    home_win_percentage_by_team = {}
    matches.cache()
    for league in all_teams_in_league.keys():
        matches_in_league = matches.filter(matches["country"] == league)
        temp_count = 0
        temp_sum = 0
        for team in all_teams_in_league[league]:
            home_matches_of_team = matches_in_league.filter(matches["home_team"] == team)
            total_home_matches = home_matches_of_team.count()
            home_matches_won = home_matches_of_team.filter(home_matches_of_team["winner"] == team)
            total_home_matches_won = home_matches_won.count()
            home_win_percentage_by_team[team] = (league,total_home_matches_won/total_home_matches)
    return home_win_percentage_by_team

In [59]:
print(datetime.datetime.now())
home_win_percentage_by_team = get_win_percentage()
print(datetime.datetime.now())

2019-03-27 18:33:25.658628
2019-03-27 18:36:38.467827


In [60]:
all_leagues = get_all_leagues()
home_win_percentage_by_league = sc.parallelize(home_win_percentage_by_team.items())\
.map(lambda x: (x[0],x[1][0],x[1][1])).map(lambda x: (x[1], [(x[0],x[2])]))\
.reduceByKey(lambda x,y:x+y).map(lambda x: (x[0], [element[1] for element in x[1]]))\
.map(lambda x: (x[0], sum(x[1])/len(x[1]))).collectAsMap()

home_win_percentage_by_league

{'Netherlands': 0.43591036414565837,
 'Poland': 0.4335137085137086,
 'Switzerland': 0.40960724678205424,
 'Germany': 0.37245798319327733,
 'Spain': 0.43313397129186604,
 'Portugal': 0.37659566725598076,
 'Italy': 0.40320588599397716,
 'Belgium': 0.423161547810303,
 'England': 0.4014079315936901,
 'France': 0.3870068027210884,
 'Scotland': 0.38329922843239833}

In [61]:
plt.ylim(0,1)
plt.plot(home_win_percentage_by_league.keys(),home_win_percentage_by_league.values())

[<matplotlib.lines.Line2D at 0x7f0df6c44748>]

In [62]:
top_five = sc.parallelize(home_win_percentage_by_team.items())\
.map(lambda x: (x[0],x[1][0],x[1][1])).map(lambda x: (x[1], [(x[0],x[2])]))\
.reduceByKey(lambda x,y:x+y).map(lambda x: (x[0],heapq.nlargest(5,x[1], key=lambda t:t[1]))).collectAsMap()


In [63]:
bottom_five = sc.parallelize(home_win_percentage_by_team.items())\
.map(lambda x: (x[0],x[1][0],x[1][1])).map(lambda x: (x[1], [(x[0],x[2])]))\
.reduceByKey(lambda x,y:x+y).map(lambda x: (x[0],heapq.nsmallest(5,x[1], key=lambda t:t[1]))).collectAsMap()

print(top_five["England"])
print(bottom_five["England"])

[('Manchester United', 0.7631578947368421), ('Manchester City', 0.743421052631579), ('Chelsea', 0.6644736842105263), ('Arsenal', 0.6381578947368421), ('Tottenham Hotspur', 0.5789473684210527)]
[('Reading', 0.21052631578947367), ('Queens Park Rangers', 0.2631578947368421), ('Blackpool', 0.2631578947368421), ('Cardiff City', 0.2631578947368421), ('Middlesbrough', 0.2631578947368421)]


In [64]:
# top_five_values = sc.parallelize(top_five.items())\
# .map(lambda x: (x[0],[element[1] for element in x[1]])).collectAsMap()

# bottom_five_values = sc.parallelize(bottom_five.items())\
# .map(lambda x: (x[0],[element[1] for element in x[1]])).collectAsMap()

# print(bottom_five_values["England"])
# fig = plt.figure(figsize=(20,20))
# ax1 = fig.add_subplot(221)
# ax1.plot(top_five_values["Spain"], label = "top_five")
# ax1.plot(bottom_five_values["Spain"], label = "bottom_five")
# plt.legend()
# ax1.set_title("Spain")
# # plt.show()

# ax2 = fig.add_subplot(222)
# ax2.plot(top_five_values["England"])
# ax2.plot(bottom_five_values["England"])
# ax2.set_title("England")

# ax3 = fig.add_subplot(225)
# ax3.plot(top_five_values["England"])
# ax3.plot(bottom_five_values["England"])
# ax3.set_title("England")

# plt.show()