In [261]:
from ipynb.fs.full.data_extraction import read_file,init_spark
from pyspark.sql import Row
from pyspark.sql.functions import datediff,year
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib qt

In [262]:
# Create the Spark handle via the helper imported from the data_extraction notebook
# (presumably a SparkSession -- confirm in data_extraction).
spark = init_spark()
# Enable cross (cartesian) joins. No cross join is visible in this part of the
# notebook, so this is presumably needed by a later cell -- TODO confirm or remove.
spark.conf.set("spark.sql.crossJoin.enabled", "true")

In [263]:
# Load the four source tables with the shared read_file helper, in the same
# order as before: Player, League, Player_Attributes, Match.
players, leagues, player_attributes, match = [
    read_file(csv_name)
    for csv_name in ("Player.csv", "League.csv", "Player_Attributes.csv", "Match.csv")
]

In [264]:
# Columns kept from the Player / Player_Attributes join: identity, the rating
# date, the birthday (used later to derive the age) and the rated skills.
selected_columns = [
    "player_api_id", "player_name", "date", "birthday", "overall_rating",
    "ball_control", "dribbling", "stamina", "vision", "sprint_speed",
    "shot_power", "short_passing", "long_passing",
]

# One row per (player, rating date); joined on the shared player_api_id key.
players_and_attributes = (
    players
    .join(player_attributes, on="player_api_id")
    .select(*selected_columns)
)

In [265]:
# Add "age": the player's age in completed years on the date of each rating,
# then drop "birthday", which is only needed for this derivation.
#
# The age is computed from calendar months (MONTHS_BETWEEN / 12, floored) rather
# than datediff / 365: dividing days by 365 ignores leap days and therefore
# reports a player as one year older during the last few days before a birthday.
#
# The guard makes the cell safe to re-run: once "birthday" has been dropped the
# frame already carries "age", so a second execution is a no-op instead of an
# unresolved-column error.
if "birthday" in players_and_attributes.columns:
    players_and_attributes = (
        players_and_attributes
        .selectExpr(
            "*",
            "CAST(FLOOR(MONTHS_BETWEEN(`date`, `birthday`) / 12) AS INT) AS age",
        )
        .drop("birthday")
    )

In [266]:
# Number of rating records per player age, restricted to ages 16-36 inclusive.
age_and_ratings = (
    players_and_attributes
    .filter(players_and_attributes["age"].between(16, 36))
    .groupBy("age")
    .count()
    .orderBy("age")
    .toPandas()
)

# Use a dedicated figure: bare plt.plot() draws on the current pyplot figure,
# which stays alive between cells with the qt backend, so curves from
# different cells would otherwise pile up on the same axes.
fig, ax = plt.subplots()
# Counts are scaled to thousands; the y label states the unit accordingly.
ax.plot(age_and_ratings["age"], age_and_ratings["count"] / 1000, "r--", linewidth=2)
ax.set_xlabel("age")
ax.set_ylabel("Number of ratings (thousands)")
ax.set_title("Age of player during the rating")

Text(0.5, 1.0, 'Age of player during the rating')

In [267]:
# Average stamina per player age (16-36 inclusive).
# Only the stamina column is averaged; a bare .mean() would aggregate every
# numeric column of the frame just to keep one of them.
age_and_stamina = (
    players_and_attributes
    .filter(players_and_attributes["age"].between(16, 36))
    .groupBy("age")
    .mean("stamina")
    .withColumnRenamed("avg(stamina)", "avg_stamina")
    .orderBy("age")
    .toPandas()
)

# Dedicated figure so this curve is not drawn on top of a figure left open by
# a previous cell (pyplot's current figure persists with the qt backend).
fig, ax = plt.subplots()
ax.plot(age_and_stamina["age"], age_and_stamina["avg_stamina"], "r--", linewidth=2)
ax.set_xlabel("age")
ax.set_ylabel("Average stamina")
ax.set_title("Average Stamina of player over the years")

Text(0.5, 1.0, 'Average Stamina of player over the years')

In [268]:
# Average dribbling per player age (16-36 inclusive).
# Only the dribbling column is averaged; a bare .mean() would aggregate every
# numeric column of the frame just to keep one of them.
age_and_dribbling = (
    players_and_attributes
    .filter(players_and_attributes["age"].between(16, 36))
    .groupBy("age")
    .mean("dribbling")
    .withColumnRenamed("avg(dribbling)", "avg_dribbling")
    .orderBy("age")
    .toPandas()
)

# Dedicated figure so this curve is not drawn on top of a figure left open by
# a previous cell (pyplot's current figure persists with the qt backend).
fig, ax = plt.subplots()
ax.plot(age_and_dribbling["age"], age_and_dribbling["avg_dribbling"], "r--", linewidth=2)
ax.set_xlabel("age")
ax.set_ylabel("Average dribbling")
ax.set_title("Average dribbling ability of player over the years")

Text(0.5, 1.0, 'Average dribbling ability of player over the years')

In [269]:
# Average ball control per player age (16-36 inclusive).
# Only the ball_control column is averaged; a bare .mean() would aggregate
# every numeric column of the frame just to keep one of them.
age_and_ball_control = (
    players_and_attributes
    .filter(players_and_attributes["age"].between(16, 36))
    .groupBy("age")
    .mean("ball_control")
    .withColumnRenamed("avg(ball_control)", "avg_ball_control")
    .orderBy("age")
    .toPandas()
)

# Dedicated figure so this curve is not drawn on top of a figure left open by
# a previous cell (pyplot's current figure persists with the qt backend).
fig, ax = plt.subplots()
ax.plot(age_and_ball_control["age"], age_and_ball_control["avg_ball_control"], "r--", linewidth=2)
ax.set_xlabel("age")
ax.set_ylabel("Average ball control")
ax.set_title("Average ball control ability of player over the years")

Text(0.5, 1.0, 'Average ball control ability of player over the years')

In [270]:
# Average vision per player age (16-36 inclusive).
# Only the vision column is averaged; a bare .mean() would aggregate every
# numeric column of the frame just to keep one of them.
age_and_vision = (
    players_and_attributes
    .filter(players_and_attributes["age"].between(16, 36))
    .groupBy("age")
    .mean("vision")
    .withColumnRenamed("avg(vision)", "avg_vision")
    .orderBy("age")
    .toPandas()
)

# Dedicated figure so this curve is not drawn on top of a figure left open by
# a previous cell (pyplot's current figure persists with the qt backend).
fig, ax = plt.subplots()
ax.plot(age_and_vision["age"], age_and_vision["avg_vision"], "r--", linewidth=2)
ax.set_xlabel("age")
ax.set_ylabel("Average vision")
ax.set_title("Average vision of player over the years")

Text(0.5, 1.0, 'Average vision of player over the years')

In [271]:
# Average sprint speed per player age (16-36 inclusive).
# Only the sprint_speed column is averaged; a bare .mean() would aggregate
# every numeric column of the frame just to keep one of them.
age_and_sprint_speed = (
    players_and_attributes
    .filter(players_and_attributes["age"].between(16, 36))
    .groupBy("age")
    .mean("sprint_speed")
    .withColumnRenamed("avg(sprint_speed)", "avg_sprint_speed")
    .orderBy("age")
    .toPandas()
)

# Dedicated figure so this curve is not drawn on top of a figure left open by
# a previous cell (pyplot's current figure persists with the qt backend).
fig, ax = plt.subplots()
ax.plot(age_and_sprint_speed["age"], age_and_sprint_speed["avg_sprint_speed"], "r--", linewidth=2)
ax.set_xlabel("age")
ax.set_ylabel("Average sprint speed")
ax.set_title("Average sprint speed of player over the years")

Text(0.5, 1.0, 'Average sprint speed of player over the years')

In [272]:
# Average passing per player age (16-36 inclusive), where "passing" is the
# mean of the short_passing and long_passing ratings of each record.
# Only the derived passing column is averaged; a bare .mean() would aggregate
# every numeric column of the frame just to keep one of them.
age_and_passing = (
    players_and_attributes
    .withColumn(
        "passing",
        (players_and_attributes["short_passing"] + players_and_attributes["long_passing"]) / 2,
    )
    .filter(players_and_attributes["age"].between(16, 36))
    .groupBy("age")
    .mean("passing")
    .withColumnRenamed("avg(passing)", "avg_passing")
    .orderBy("age")
    .toPandas()
)

# Dedicated figure so this curve is not drawn on top of a figure left open by
# a previous cell (pyplot's current figure persists with the qt backend).
fig, ax = plt.subplots()
ax.plot(age_and_passing["age"], age_and_passing["avg_passing"], "r--", linewidth=2)
ax.set_xlabel("age")
ax.set_ylabel("Average passing")
ax.set_title("Average passing ability of player over the years")

Text(0.5, 1.0, 'Average passing ability of player over the years')

In [276]:
# Combined view: average of every tracked attribute per player age (16-36).
attributes = ["overall_rating","ball_control","dribbling","stamina","vision","sprint_speed","passing"]

colors = ["red","blue","green","purple","black","cyan","yellow"]
# Legend/drawing order; colors[i] is the colour of plot_order[i].
plot_order = ["overall_rating","stamina","dribbling","ball_control","vision","sprint_speed","passing"]

# All averages come from one aggregation over the same filtered rows.
# The previous version plotted the *number of ratings* (count / 1000) under the
# "overall_rating" label on an "attribute average" axis; the curve now shows the
# actual average overall_rating.
averages_by_age = (
    players_and_attributes
    .withColumn(
        "passing",
        (players_and_attributes["short_passing"] + players_and_attributes["long_passing"]) / 2,
    )
    .filter(players_and_attributes["age"].between(16, 36))
    .groupBy("age")
    .mean(*attributes)  # yields one "avg(<attribute>)" column per attribute
    .orderBy("age")
    .toPandas()
)

# Dedicated figure so the curves are not drawn on top of a figure left open by
# a previous cell (pyplot's current figure persists with the qt backend).
fig, ax = plt.subplots()
for attribute, color in zip(plot_order, colors):
    ax.plot(
        averages_by_age["age"],
        averages_by_age["avg({})".format(attribute)],
        color=color,
        linewidth=1,
        label=attribute,
    )
ax.set_xlabel("age")
ax.set_ylabel("attribute average")
ax.legend()

<matplotlib.legend.Legend at 0x7f76bc311c88>