In [11]:
from pyspark.sql.functions import when
from pyspark.sql import SparkSession

In [12]:
def init_spark():
    """Return the SparkSession used by this notebook.

    The session is obtained through ``getOrCreate`` on a builder configured
    with the application name and the single config option set below.

    Returns:
        The object returned by ``SparkSession.builder...getOrCreate()``.
    """
    session_builder = SparkSession.builder
    session_builder = session_builder.appName("Python Spark SQL basic example")
    session_builder = session_builder.config("spark.some.config.option", "some-value")
    return session_builder.getOrCreate()

In [1]:
def read_file(filename, data_dir="data"):
    """Load one CSV file from the data directory into a Spark DataFrame.

    Args:
        filename: Name of the CSV file inside ``data_dir``, e.g. ``"Country.csv"``.
        data_dir: Directory that holds the CSV files. Defaults to ``"data"``,
            the location that used to be hard-coded. A trailing ``/`` is
            optional. Pass ``"."`` for the current directory; an empty string
            resolves to the filesystem root.

    Returns:
        The DataFrame returned by ``spark.read.csv`` with ``header=True`` (first
        row supplies the column names) and ``inferSchema=True``.
    """
    spark = init_spark()
    # Normalise the directory so "data" and "data/" produce the same path.
    path = data_dir.rstrip("/") + "/" + filename
    return spark.read.csv(path, header=True, inferSchema=True)

In [14]:
# Country table, loaded from Country.csv through read_file; create_leagues()
# reads this module-level name.
country = read_file("Country.csv")
# country.show()  # uncomment to preview the rows

In [15]:
# League table without its country_id column. "name" is renamed to
# "league_name" so it stays distinct from the "name" column that the joined
# result in create_leagues() carries alongside it.
league = (
    read_file("League.csv")
    .drop("country_id")
    .withColumnRenamed("name", "league_name")
)
# league.show()

In [16]:
def create_leagues():
    """Join the country and league tables on their shared ``id`` column.

    Reads the module-level ``country`` and ``league`` DataFrames, so both
    cells that define them must have run before this is called.

    Returns:
        The result of ``country.join(league, on="id")``.
    """
    join_key = "id"
    return country.join(league, on=join_key)

In [17]:
# Team table with "id" renamed to "team_id"; create_matches() joins against
# this module-level name.
teams = (
    read_file("Team.csv")
    .withColumnRenamed("id", "team_id")
)
# teams.show()

In [23]:
def create_matches():
    """Build one row per match with team names, B365 odds, winner and country.

    Reads ``Match.csv`` through ``read_file`` and relies on the module-level
    ``teams`` DataFrame and on ``create_leagues()``, so those cells must have
    run first. All joins use the default join type, so a match whose team or
    league id has no counterpart is not part of the result.

    Returns:
        A DataFrame with the columns, in this order: ``match_id``,
        ``home_team_id``, ``home_team``, ``home_team_goal``, ``league_id``,
        ``B365H``, ``B365D``, ``B365A``, ``away_team_id``, ``away_team``,
        ``away_team_goal``, ``winner``, ``country``.

        ``winner`` is the name of the team with more goals, ``"Draw"`` when
        both scores are equal, and null when either score is missing or is not
        a number.
    """
    match = read_file("Match.csv").withColumnRenamed("id", "match_id")

    # Home side. Columns are renamed by name (not by position) so that editing
    # the select list can never silently mislabel a column.
    home_team_match = (
        match.join(teams, teams["team_api_id"] == match["home_team_api_id"])
        .select(
            "match_id",
            "home_team_api_id",
            "team_long_name",
            "home_team_goal",
            "league_id",
            "B365H",
            "B365D",
            "B365A",
        )
        .withColumnRenamed("home_team_api_id", "home_team_id")
        .withColumnRenamed("team_long_name", "home_team")
    )

    # Away side, same approach.
    away_team_match = (
        match.join(teams, teams["team_api_id"] == match["away_team_api_id"])
        .select("match_id", "away_team_api_id", "team_long_name", "away_team_goal")
        .withColumnRenamed("away_team_api_id", "away_team_id")
        .withColumnRenamed("team_long_name", "away_team")
    )

    # Joining on the column name keeps a single match_id column in the result.
    matches = home_team_match.join(away_team_match, on="match_id")

    # Compare goals as integers: if the CSV columns arrive as strings the plain
    # comparison would be lexicographic ("10" < "9"). The cast is only used for
    # the comparison; the goal columns themselves keep their original type.
    home_goals = matches["home_team_goal"].cast("int")
    away_goals = matches["away_team_goal"].cast("int")
    # No final otherwise(): a row with a missing score gets a null winner
    # instead of being reported as a draw.
    winner = (
        when(home_goals > away_goals, matches["home_team"])
        .when(home_goals < away_goals, matches["away_team"])
        .when(home_goals == away_goals, "Draw")
    )
    matches = matches.withColumn("winner", winner)

    # Attach the country name; the league's own id and name are not kept.
    leagues = create_leagues()
    return (
        matches.join(leagues, leagues["id"] == matches["league_id"])
        .drop("id", "league_name")
        .withColumnRenamed("name", "country")
    )

In [22]:
# print() on a Spark DataFrame emits only its repr -- the column names and
# types -- not any rows.
print(create_matches())

['match_id', 'home_team_api_id', 'home_team_long_name', 'home_team_goal', 'league_id', 'B365H', 'B365D', 'B365A', 'away_team_api_id', 'away_team_long_name', 'away_team_goal']
DataFrame[match_id: string, home_team_id: string, home_team: string, home_team_goal: string, league_id: string, B365H: string, B365D: string, B365A: string, away_team_id: string, away_team: string, away_team_goal: string, winner: string, country: string]
