In [1]:
from ipynb.fs.full.data_extraction import init_spark, read_file
from pyspark.sql.functions import count,when,isnan,dayofmonth, month, year,col,udf,struct,lit
from pyspark.sql.types import IntegerType,LongType
from datetime import datetime
from pyspark.ml.feature import StringIndexer, VectorAssembler,IndexToString,VectorIndexer,MinMaxScaler
from pyspark.ml.classification import (LogisticRegression, RandomForestClassifier, DecisionTreeClassifier,
                                        MultilayerPerceptronClassifier,NaiveBayes)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

DataFrame[match_id: string, home_team_id: int, home_team: string, home_team_goal: int, league_id: int, B365H: double, B365D: double, B365A: double, away_team_id: int, away_team: string, away_team_goal: int, winner: string, country: string]


In [2]:
spark = init_spark()

In [3]:
def prepare_data():
    start = datetime.now()
    match = read_file("Match.csv")
    player_attributes = read_file("Player_Attributes.csv")
    players = read_file("Player.csv")
    required_columns = ["match_api_id","home_team_api_id","away_team_api_id","date","home_team_goal","away_team_goal","home_player_1", "home_player_2", "home_player_3", "home_player_4", "home_player_5",
               "home_player_6", "home_player_7", "home_player_8", "home_player_9", "home_player_10",
               "home_player_11", "away_player_1", "away_player_2", "away_player_3", "away_player_4",
               "away_player_5", "away_player_6", "away_player_7", "away_player_8", "away_player_9",
               "away_player_10", "away_player_11","B365H", "B365D", "B365A", "BWH", "BWD", "BWA", "IWH", 
                "IWD", "IWA", "LBH", "LBD", "LBA", "PSH", "PSD", "PSA", "WHH", "WHD", "WHA", "SJH", "SJD", 
                "SJA", "VCH", "VCD", "VCA", "GBH", "GBD", "GBA", "BSH", "BSD", "BSA"]
    match = match.select(required_columns)
    match = match.na.drop()
    player_attributes = player_attributes.na.drop()
    end = datetime.now()
    print("Generated matches and player attributes in "+str((end-start).total_seconds())+" secs")
    return match,player_attributes

In [4]:
def group_ratings_by_mean(player_attributes):
    start = datetime.now()
    grouped_rating = player_attributes.groupBy("player_api_id").mean().select("player_api_id","avg(overall_rating)")\
    .withColumnRenamed("avg(overall_rating)","mean_rating").orderBy("mean_rating",ascending=False)
    end = datetime.now()
    print("Grouped ratings in "+str((end-start).total_seconds())+" secs")
    return grouped_rating

In [5]:
def merge_ratings_with_matches(match,grouped_rating):
    start = datetime.now()
    all_players = ['home_player_1', 'home_player_2', 'home_player_3', "home_player_4", "home_player_5",
               "home_player_6", "home_player_7", "home_player_8", "home_player_9", "home_player_10",
               "home_player_11", "away_player_1", "away_player_2", "away_player_3", "away_player_4",
               "away_player_5", "away_player_6", "away_player_7", "away_player_8", "away_player_9",
               "away_player_10", "away_player_11"]
    match_new = match    
    match.cache()
    grouped_rating.cache()
    for player in all_players:
        player_df = match.select("match_api_id",player)
        joined_df = player_df.join(grouped_rating, player_df[player] == grouped_rating["player_api_id"]).drop("player_api_id")\
        .withColumnRenamed("mean_rating",player+"_mean_rating")
        match_new = match_new.join(joined_df, on = "match_api_id").drop(match_new[player])
#         match_new.select("match_api_id",player,player+"_mean_rating").show(1)
    match_new.cache()
    end = datetime.now()
    print("Added ratings to matches in "+str((end-start).total_seconds())+" secs")
    return match_new

In [6]:
def get_goals(match_new,home=True):
    start = datetime.now()
    if home:
        # goals scored at home
        home_goals = match_new.select("home_team_api_id","home_team_goal","away_team_goal").groupBy("home_team_api_id").sum()\
        .withColumnRenamed("sum(home_team_goal)","home_team_home_goals_scored")\
        .withColumnRenamed("sum(away_team_goal)","home_team_home_goals_conceded")\
        .withColumnRenamed("home_team_api_id","team_id")\
        .select("team_id","home_team_home_goals_scored","home_team_home_goals_conceded")
        end = datetime.now()
        print("Got home goals in "+str((end-start).total_seconds())+" secs")
        return home_goals
    else:
        # goals scored away
        away_goals = match_new.select("away_team_api_id","home_team_goal","away_team_goal").groupBy("away_team_api_id").sum()\
        .withColumnRenamed("sum(home_team_goal)","away_team_away_goals_conceded")\
        .withColumnRenamed("sum(away_team_goal)","away_team_away_goals_scored")\
        .withColumnRenamed("away_team_api_id","team_id")\
        .select("team_id","away_team_away_goals_conceded","away_team_away_goals_scored")
        end = datetime.now()
        print("Got away goals in "+str((end-start).total_seconds())+" secs")
        return away_goals


In [7]:
def get_goal_difference(home_goals,away_goals,match_new):
    start = datetime.now()
    # goal difference
    goal_difference = home_goals.join(away_goals, on="team_id")
    goal_difference = goal_difference.withColumn("goal_diff",(goal_difference["home_team_home_goals_scored"]+goal_difference["away_team_away_goals_scored"])
                                    - (goal_difference["home_team_home_goals_conceded"]+goal_difference["away_team_away_goals_conceded"]))\
                        .withColumnRenamed("home_team_api_id","team_id")
    # update home team goal difference to original df
    match_with_goals_scored = match_new.join(goal_difference, match_new["home_team_api_id"] == goal_difference["team_id"])\
                                .withColumnRenamed("goal_diff","home_team_goal_diff").drop("team_id","home_team_home_goals_scored","home_team_home_goals_conceded"
                                                          ,"away_team_away_goals_conceded","away_team_away_goals_scored")
    # update away team goal difference to original df
    match_with_goals_scored = match_with_goals_scored.join(goal_difference, match_with_goals_scored["away_team_api_id"] == goal_difference["team_id"])\
                                .withColumnRenamed("goal_diff","away_team_goal_diff").drop("team_id","home_team_home_goals_scored","home_team_home_goals_conceded"
                                                          ,"away_team_away_goals_conceded","away_team_away_goals_scored")
    match_with_goals_scored.cache()
    end = datetime.now()
    print("Got goal difference in "+str((end-start).total_seconds())+" secs")
    return match_with_goals_scored

In [8]:
def get_results(match_new):
    start = datetime.now()
    home_results = match_new.select("home_team_api_id","home_team_goal","away_team_goal").withColumnRenamed("home_team_api_id","team_id")\
        .withColumn("home_result",
                    when(match_new["home_team_goal"]>match_new["away_team_goal"],"win").otherwise(
                        when(match_new["home_team_goal"]<match_new["away_team_goal"],"lost").otherwise("draw")))
    # get away team results - win, loss or draw
    away_results = match_new.select("away_team_api_id","home_team_goal","away_team_goal").withColumnRenamed("away_team_api_id","team_id")\
    .withColumn("away_result",
                    when(match_new["home_team_goal"]>match_new["away_team_goal"],"lost").otherwise(
                        when(match_new["home_team_goal"]<match_new["away_team_goal"],"win").otherwise("draw")))
    end = datetime.now()
    print("Got match results in "+str((end-start).total_seconds())+" secs")
    return home_results,away_results

# # home_wins.groupBy("team_id","home_result").count()\
# # .filter((home_wins["home_result"] == "win") | (home_wins["home_result"] == "lost"))\
# # .filter(home_wins["team_id"] == 9987).show()
# #
# team_results = home_results.join(away_results,on="team_id")

In [9]:
def get_wins(home_results,away_results,match_with_goals_scored):
    start = datetime.now()
    # get number of home wins
    home_wins = home_results.select("team_id","home_result")
    home_wins = home_wins.filter(home_wins["home_result"] == "win").groupBy("team_id").count().withColumnRenamed("count","home_wins")
    # get number of away wins
    away_wins = away_results.select("team_id","away_result")
    away_wins = away_wins.filter(away_wins["away_result"] == "win").groupBy("team_id").count().withColumnRenamed("count","away_wins")
    # calculate total wins of team
    total_wins = home_wins.join(away_wins,on="team_id")
    total_wins = total_wins.withColumn("wins", total_wins["home_wins"]+ total_wins["away_wins"])
    # update home team wins to original match df
    match_with_wins = match_with_goals_scored.join(total_wins, match_with_goals_scored["home_team_api_id"] == total_wins["team_id"])\
    .drop("team_id","home_wins","away_wins").withColumnRenamed("wins","home_team_wins")
    # update away team wins to original match df
    match_with_wins = match_with_wins.join(total_wins, match_with_wins["away_team_api_id"] == total_wins["team_id"])\
    .drop("team_id","home_wins","away_wins").withColumnRenamed("wins","away_team_wins")
    match_with_wins.cache()
    end = datetime.now()
    print("Got wins in "+str((end-start).total_seconds())+" secs")
    return match_with_wins

In [10]:
def get_match_label(match_with_wins):
    start = datetime.now()
    match_with_label = match_with_wins.withColumn("match_label", when(match_with_wins["home_team_goal"]> match_with_wins["away_team_goal"],"win")
                                             .otherwise(when(match_with_wins["home_team_goal"]<match_with_wins["away_team_goal"],"loss").otherwise("draw"))                                         )
    match_with_label.cache()
    end = datetime.now()
    print("Got match labels in "+str((end-start).total_seconds())+" secs")
    return match_with_label

In [11]:
def betting_probabilities(match_with_label,book_keeper):
    start = datetime.now()
    match_with_probs = match_with_label
    for bk in book_keeper:
        match_with_probs = match_with_probs.withColumn(bk, 1/match_with_probs[bk])
    end = datetime.now()
    print("Got betting probabilities in "+str((end-start).total_seconds())+" secs")
    return match_with_probs


In [21]:
def get_train_test_data(match_with_probs,train_ratio,test_ratio,book_keeper):
    start = datetime.now()
    match_data = match_with_probs
    indexer = StringIndexer(inputCol="match_label",outputCol= "label")
    match_data = indexer.fit(match_data).transform(match_data)
    features = [x for x in match_data.columns if x not in ["home_team_goal","away_team_goal","date","label","match_label","match_api_id"
                                                       ,"home_team_api_id","away_team_api_id"]]
#     print(features)
    assembler = VectorAssembler(inputCols = features, outputCol = "features")
    match_data = assembler.transform(match_data)
    final_data = match_data.select("features","label")
    train,test = final_data.randomSplit([train_ratio,test_ratio],seed=1234)
    train.cache()
    test.cache()
    end = datetime.now()
    print("Generated training and testing data in "+str((end-start).total_seconds())+" secs")
    return train,test

In [13]:
match,player_attributes = prepare_data()
grouped_rating = group_ratings_by_mean(player_attributes)
match_new = merge_ratings_with_matches(match,grouped_rating)
home_goals = get_goals(match_new,True)
away_goals = get_goals(match_new,False)
match_with_goals_scored = get_goal_difference(home_goals,away_goals,match_new)
home_results,away_results = get_results(match_new)
match_with_wins = get_wins(home_results,away_results,match_with_goals_scored)
match_with_label = get_match_label(match_with_wins)
book_keeper = ["B365H", "B365D", "B365A", "BWH", "BWD", "BWA", "IWH", 
                "IWD", "IWA", "LBH", "LBD", "LBA", "PSH", "PSD", "PSA", "WHH", "WHD", "WHA", "SJH", "SJD", 
                "SJA", "VCH", "VCD", "VCA", "GBH", "GBD", "GBA", "BSH", "BSD", "BSA"]
match_with_probs = betting_probabilities(match_with_label,book_keeper)


Generated matches and player attributes in 2.439833 secs
Grouped ratings in 0.082509 secs
Added ratings to matches in 1.990269 secs
Got home goals in 0.037837 secs
Got away goals in 0.035738 secs
Got goal difference in 0.363362 secs
Got match results in 0.092652 secs
Got wins in 0.5965 secs
Got match labels in 0.08315 secs
Got betting probabilities in 1.072913 secs


In [19]:
# logistic regression
train,test = get_train_test_data(match_with_probs,0.7,0.3,book_keeper)
lr = LogisticRegression(maxIter=10, regParam=0.1, elasticNetParam=0.1,family="multinomial")
lr_model = lr.fit(train)
result = lr_model.evaluate(test)
multi_class = MulticlassClassificationEvaluator()
multi_class.evaluate(result.predictions)

Generated training and testing data in 2.585834 secs


0.4276271795453492

In [23]:
#decision tree
train,test = get_train_test_data(match_with_probs,0.7,0.3,book_keeper)
for i in range(1,10):
    dtc = DecisionTreeClassifier(maxDepth=5,minInstancesPerNode=i)
    dtc_model = dtc.fit(train)
    preds = dtc_model.transform(test)
    evaluator = MulticlassClassificationEvaluator(metricName = "accuracy")
    print(i, evaluator.evaluate(preds))

Generated training and testing data in 2.056386 secs
1 0.4822190611664296
2 0.4822190611664296
3 0.4793741109530583


KeyboardInterrupt: 

In [16]:
# random forest
train,test = get_train_test_data(match_with_probs,0.7,0.3,book_keeper)
rfc = RandomForestClassifier(numTrees=20)
rfc_model = rfc.fit(train)
preds = rfc_model.transform(test)
evaluator = MulticlassClassificationEvaluator(metricName = "accuracy")
evaluator.evaluate(preds)

Generated training and testing data in 2.328812 secs


0.5416666666666666

In [17]:
# naive bayes
train,test = get_train_test_data(match_with_probs,0.7,0.3,book_keeper)
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures",)
scalerModel = scaler.fit(train)
scaledTrain = scalerModel.transform(train)
scaledTest = scalerModel.transform(test)
nb = NaiveBayes(smoothing=0.5,featuresCol="scaledFeatures",labelCol="label")
nb_model = nb.fit(scaledTrain)
predictions = nb_model.transform(scaledTest)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
accuracy

Generated training and testing data in 2.563837 secs


0.5545212765957447

In [57]:
match_new.alias("mn").join(player_attributes.alias("pa"), (player_attributes["player_api_id"] == match_new["home_player_3"])
                          & (player_attributes["date"]<match_new["date"]))\
.select("match_api_id","home_team_api_id","overall_rating","pa.date","mn.date","home_player_3")\
.groupBy("mn.date","home_player_3").mean().filter(match_new["home_player_3"] == 19327).orderBy("mn.date").show()

# match_new.select("home_player_3").groupBy("home_player_3").count().orderBy("count",ascending=False).show()

+-------------------+-------------+-----------------+---------------------+-------------------+------------------+
|               date|home_player_3|avg(match_api_id)|avg(home_team_api_id)|avg(overall_rating)|avg(home_player_3)|
+-------------------+-------------+-----------------+---------------------+-------------------+------------------+
|2012-08-27 00:00:00|        19327|        1259972.0|               9906.0|  76.55555555555556|           19327.0|
|2012-09-16 00:00:00|        19327|        1260008.0|               9906.0|               76.7|           19327.0|
|2012-09-23 00:00:00|        19327|        1260028.0|               9906.0|               76.7|           19327.0|
|2012-10-07 00:00:00|        19327|        1260069.0|               9906.0|               76.7|           19327.0|
|2012-10-28 00:00:00|        19327|        1260108.0|               9906.0|               76.7|           19327.0|
|2012-11-11 00:00:00|        19327|        1260172.0|               9906.0|     