In [1]:
from ipynb.fs.full.data_extraction import init_spark, read_file
from pyspark.sql.functions import count,when,isnan,dayofmonth, month, year,col,udf,struct,lit,unix_timestamp
from pyspark.sql.types import IntegerType,LongType,TimestampType
from datetime import datetime
from pyspark.ml.feature import StringIndexer, VectorAssembler,IndexToString,VectorIndexer,MinMaxScaler
from pyspark.ml.classification import (LogisticRegression, RandomForestClassifier, DecisionTreeClassifier,
                                        MultilayerPerceptronClassifier,NaiveBayes)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from handyspark import *

DataFrame[match_id: string, home_team_id: int, home_team: string, home_team_goal: int, league_id: int, B365H: double, B365D: double, B365A: double, away_team_id: int, away_team: string, away_team_goal: int, winner: string, country: string]


In [2]:
# Create the shared Spark handle through the project helper from data_extraction.
# NOTE(review): presumably a SparkSession — confirm against init_spark's definition.
spark = init_spark()

In [3]:
def prepare_data():
    """Load the match and player-attribute tables and drop incomplete rows.

    Reads "Match.csv" and "Player_Attributes.csv" through ``read_file``. The
    match table is narrowed to the identifiers, date, goals, the 22 line-up
    slots and the B365/BW odds columns; afterwards every row containing a
    null is removed from both tables.

    The previously loaded "Player.csv" table was never used and is no longer
    read.

    Returns:
        tuple: ``(match, player_attributes)`` after column selection and
        null removal.

    Side effects:
        Prints the elapsed wall-clock time of the calls above.
    """
    start = datetime.now()

    # home_player_1..11 followed by away_player_1..11, same order as before.
    lineup_columns = ["{}_player_{}".format(side, slot)
                      for side in ("home", "away")
                      for slot in range(1, 12)]
    required_columns = (
        ["match_api_id", "home_team_api_id", "away_team_api_id", "date",
         "home_team_goal", "away_team_goal"]
        + lineup_columns
        + ["B365H", "B365D", "B365A", "BWH", "BWD", "BWA"]
    )

    match = read_file("Match.csv").select(required_columns).na.drop()
    player_attributes = read_file("Player_Attributes.csv").na.drop()

    end = datetime.now()
    print("Generated matches and player attributes in "+str((end-start).total_seconds())+" secs")
    return match,player_attributes

In [4]:
# Load the cleaned match / player-attribute frames, then order matches by date.
match,player_attributes = prepare_data()
match = match.orderBy("date")

Generated matches and player attributes in 2.466443 secs


In [5]:
def merge_ratings_with_matches(match,player_attributes):
    """Attach a mean prior rating for every line-up slot of every match.

    For each of the 22 ``home_player_N`` / ``away_player_N`` columns, the
    player's ``overall_rating`` is averaged over all attribute rows dated
    strictly before the match date, and the result is added to the match as
    ``<slot>_mean_rating``.

    The mean is aggregated per ``match_api_id`` directly, so no join back to
    the match rows and no ``distinct()`` is needed, and only
    ``overall_rating`` is averaged.

    Args:
        match: DataFrame with ``match_api_id``, ``date`` and the 22 line-up
            columns.
        player_attributes: DataFrame with ``player_api_id``, ``date`` and
            ``overall_rating``.

    Returns:
        DataFrame with ``match_api_id``, the 22 ``*_mean_rating`` columns
        (last processed slot first) and the remaining columns of ``match``.
        All joins are inner joins, so a match is dropped when any slot has no
        earlier rating row.

    Side effects:
        Prints the elapsed wall-clock time. NOTE(review): Spark DataFrame
        transformations are lazy, so this presumably measures plan
        construction rather than the actual computation.
    """
    start=datetime.now()
    all_players = ["{}_player_{}".format(side, slot)
                   for side in ("home", "away")
                   for slot in range(1, 12)]

    # Rename the attribute date once so it cannot clash with the match date.
    ratings = player_attributes.select(
        col("player_api_id"),
        col("date").alias("rating_date"),
        col("overall_rating"),
    )

    match_new = match
    for player in all_players:
        lineup = match.select("match_api_id", player, "date")
        rated_before_match = (
            (lineup[player] == ratings["player_api_id"])
            & (ratings["rating_date"] < lineup["date"])
        )
        mean_rating = (
            lineup.join(ratings, rated_before_match)
            .groupBy("match_api_id")
            .mean("overall_rating")
            .withColumnRenamed("avg(overall_rating)", player+"_mean_rating")
        )
        match_new = mean_rating.join(match_new, on ="match_api_id")

    end=datetime.now()
    print("Added ratings to matches in "+str((end-start).total_seconds())+" secs")
    return match_new


In [6]:
# Matches enriched with one "<slot>_mean_rating" column per line-up slot.
match_new = merge_ratings_with_matches(match,player_attributes)

Added ratings to matches in 1.608172 secs


DataFrame[match_api_id: int, away_player_11_mean_rating: double, away_player_10_mean_rating: double, away_player_9_mean_rating: double, away_player_8_mean_rating: double, away_player_7_mean_rating: double, away_player_6_mean_rating: double, away_player_5_mean_rating: double, away_player_4_mean_rating: double, away_player_3_mean_rating: double, away_player_2_mean_rating: double, away_player_1_mean_rating: double, home_player_11_mean_rating: double, home_player_10_mean_rating: double, home_player_9_mean_rating: double, home_player_8_mean_rating: double, home_player_7_mean_rating: double, home_player_6_mean_rating: double, home_player_5_mean_rating: double, home_player_4_mean_rating: double, home_player_3_mean_rating: double, home_player_2_mean_rating: double, home_player_1_mean_rating: double, home_team_api_id: int, away_team_api_id: int, date: timestamp, home_team_goal: int, away_team_goal: int, home_player_1: int, home_player_2: int, home_player_3: int, home_player_4: int, home_player_

In [7]:
def get_goals(match_new,home=True):
    """Running goal totals of each team over its home (or away) fixtures.

    Every match is paired with all fixtures the same team played on the same
    side (home or away) on or before that match's date; the goals of those
    fixtures are summed per ``(date, team_id)``.

    The historical side of the self-join is renamed up front, so the summed
    columns are unambiguous and do not depend on how Spark resolves a
    ``match_new[...]`` reference inside a self-join.

    NOTE(review): the ``<=`` comparison includes the match on ``date`` itself
    in its own totals. If these totals feed a model that predicts that
    match's result, this looks like target leakage — confirm whether ``<``
    was intended.

    Args:
        match_new: DataFrame with at least ``date``, ``home_team_api_id``,
            ``away_team_api_id``, ``home_team_goal`` and ``away_team_goal``.
        home: ``True`` to aggregate over home fixtures, ``False`` for away
            fixtures.

    Returns:
        DataFrame with columns ``date``, ``team_id`` followed by
        ``ht_goals_scored``, ``ht_goals_conceded`` when ``home`` is true, or
        ``at_goals_conceded``, ``at_goals_scored`` otherwise.
    """
    if home:
        team_col, past_date = "home_team_api_id", "home_date"
        home_goal_total, away_goal_total = "ht_goals_scored", "ht_goals_conceded"
    else:
        team_col, past_date = "away_team_api_id", "away_date"
        # For the away team, the home side's goals are the ones conceded.
        home_goal_total, away_goal_total = "at_goals_conceded", "at_goals_scored"

    # Historical fixtures; every column gets a name that is unique after the join.
    past = match_new.select(
        col("date").alias(past_date),
        col(team_col).alias("team_id"),
        col("home_team_goal").alias("past_home_team_goal"),
        col("away_team_goal").alias("past_away_team_goal"),
    )
    # The matches for which the running totals are computed.
    current = match_new.select("date", team_col)

    same_team_up_to_date = (
        (col("team_id") == col(team_col)) & (col(past_date) <= col("date"))
    )
    return (
        past.join(current, same_team_up_to_date)
        .groupBy("date", "team_id")
        .sum("past_home_team_goal", "past_away_team_goal")
        .withColumnRenamed("sum(past_home_team_goal)", home_goal_total)
        .withColumnRenamed("sum(past_away_team_goal)", away_goal_total)
    )




In [8]:
# get_goals already returns the team key as "team_id", so only the date column
# needs a side-specific name before the two frames are joined back to matches.
home_goals = get_goals(match_new,True).withColumnRenamed("date","home_date")

away_goals = get_goals(match_new,False).withColumnRenamed("date","away_date")


In [9]:
# Inner join of the home and away running totals on the team key.
# NOTE(review): all_goals is not referenced by any later cell visible here.
all_goals = home_goals.join(away_goals, on = "team_id")
# Show the column names of the home-side totals.
home_goals.columns

['home_date', 'team_id', 'ht_goals_scored', 'ht_goals_conceded']

In [10]:
# Attach the home-side running totals: each match row is paired with the
# home_goals row for the same home team and match date; the join keys that came
# from home_goals are dropped afterwards.
match_with_goals = home_goals.join(match_new, (home_goals["team_id"] == match_new["home_team_api_id"])
                       & (home_goals["home_date"] == match_new["date"]))\
                        .drop("home_date","team_id")


# Same for the away side.
# NOTE(review): the condition references match_new's columns although the frame
# being joined is match_with_goals; this relies on match_with_goals still
# carrying those match_new columns — confirm before reordering these joins.
match_with_goals = away_goals.join(match_with_goals,(away_goals["team_id"] == match_new["away_team_api_id"])
                       & (away_goals["away_date"] == match_new["date"]))\
                         .drop("away_date","team_id")


In [11]:
# Net running goals (scored minus conceded) for the home and the away team;
# the four raw totals are removed once the differences exist.
goal_difference = (
    match_with_goals
    .withColumn("ht_goal_diff", col("ht_goals_scored") - col("ht_goals_conceded"))
    .withColumn("at_goal_diff", col("at_goals_scored") - col("at_goals_conceded"))
    .drop("ht_goals_scored", "ht_goals_conceded", "at_goals_conceded", "at_goals_scored")
)

In [12]:
# Outcome from the home team's point of view: "win" when it scored more,
# "loss" when it scored fewer, "draw" otherwise.
match_with_label = goal_difference.withColumn(
    "match_label",
    when(col("home_team_goal") > col("away_team_goal"), "win")
    .when(col("home_team_goal") < col("away_team_goal"), "loss")
    .otherwise("draw"),
)
print("done")

done


In [13]:
# Running number of wins per team and match date, separately for home and away
# fixtures. match_label is written from the home team's point of view, so a
# home win is "win" while an away win is "loss".
# NOTE(review): "<=" lets a match count towards its own total; since the label
# being predicted is derived from the same match, this looks like target
# leakage — confirm whether "<" was intended.
home_wins = (
    match_with_label.select("date","home_team_api_id","match_label")
    .filter(match_with_label["match_label"] == "win")
    .withColumnRenamed("home_team_api_id","team_id")
    .withColumnRenamed("date","home_date")
    .withColumnRenamed("match_label","home_match_label")
)

cj_home = home_wins.crossJoin(match_with_label)
home_wins_joined = (
    cj_home.filter((cj_home["team_id"] == cj_home["home_team_api_id"])
                   & (cj_home["home_date"] <= cj_home["date"]))
    .select("home_team_api_id","home_date","date",cj_home["match_label"])
    .groupBy("date","home_team_api_id")
    .count()
)

# The away team wins exactly when the home-perspective label is "loss";
# filtering on "win" here would count away defeats instead.
away_wins = (
    match_with_label.select("date","away_team_api_id","match_label")
    .filter(match_with_label["match_label"] == "loss")
    .withColumnRenamed("away_team_api_id","team_id")
    .withColumnRenamed("date","away_date")
    .withColumnRenamed("match_label","away_match_label")
)

cj_away = away_wins.crossJoin(match_with_label)
away_wins_joined = (
    cj_away.filter((cj_away["team_id"] == cj_away["away_team_api_id"])
                   & (cj_away["away_date"] <= cj_away["date"]))
    .select("away_team_api_id","away_date","date",cj_away["match_label"])
    .groupBy("date","away_team_api_id")
    .count()
)
print("done")

done


In [14]:
# Add the running win counts to every match. Full outer joins keep matches
# that have no entry in the win tables; their missing counts become 0.
match_with_wins = (
    home_wins_joined
    .join(match_with_label, on=["date", "home_team_api_id"], how="full")
    .withColumnRenamed("count", "home_count")
    .fillna(0, subset=["home_count"])
)
match_with_wins = (
    match_with_wins
    .join(away_wins_joined, on=["date", "away_team_api_id"], how="full")
    .withColumnRenamed("count", "away_count")
    .fillna(0, subset=["away_count"])
)
print("done")

done


In [15]:
# Replace every bookmaker odds column (B365 and BW; home / draw / away) with
# its reciprocal, keeping the column name.
book_keeper = [prefix + outcome for prefix in ("B365", "BW") for outcome in "HDA"]
match_with_probs = match_with_wins
for keeper in book_keeper:
    reciprocal = lit(1) / match_with_probs[keeper]
    match_with_probs = match_with_probs.withColumn(keeper, reciprocal)

print("done")

done


In [16]:
# Remove the raw goal counts; match_label was derived from them in an earlier cell.
match_with_probs = match_with_probs.drop("home_team_goal","away_team_goal")
print("done")

done


In [18]:
# Encode the textual outcome as a numeric "label" column, then treat every
# remaining column except the goals, date, label columns and match id as a
# model input.
indexer = StringIndexer(outputCol="label", inputCol="match_label")
match_data = indexer.fit(match_with_probs).transform(match_with_probs)
features = list(filter(
    lambda name: name not in ("home_team_goal", "away_team_goal", "date",
                              "label", "match_label", "match_api_id"),
    match_data.columns,
))

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:39597)
Traceback (most recent call last):
  File "/home/sujay/.local/lib/python3.6/site-packages/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/sujay/.local/lib/python3.6/site-packages/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:39597)

In [None]:
# Pack the selected input columns into a single "features" vector column and
# keep only that vector together with the numeric label.
assembler = VectorAssembler(inputCols = features, outputCol = "features")
match_final = assembler.transform(match_data)
final_data = match_final.select("features","label")

In [None]:
# 70/30 split with a fixed seed, then a decision tree with default settings,
# scored by accuracy on the held-out part.
train, test = final_data.randomSplit(weights=[0.7, 0.3], seed=1234)
dtc = DecisionTreeClassifier(featuresCol="features", labelCol="label")
dtc_model = dtc.fit(train)
preds = dtc_model.transform(test)
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
)
evaluator.evaluate(preds)

In [None]:
# Naive Bayes on min-max scaled features. The scaler is fitted on the training
# split only and then applied to both splits.
# NOTE(review): scaling is presumably needed because the goal-difference
# features can be negative — confirm against the NaiveBayes requirements.
scaler = MinMaxScaler(outputCol="scaledFeatures", inputCol="features")
scalerModel = scaler.fit(train)
scaledTrain, scaledTest = (scalerModel.transform(split) for split in (train, test))

nb = NaiveBayes(featuresCol="scaledFeatures", labelCol="label", smoothing=0.5)
nb_model = nb.fit(scaledTrain)
predictions = nb_model.transform(scaledTest)

evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy", labelCol="label", predictionCol="prediction"
)
accuracy = evaluator.evaluate(predictions)
accuracy

In [None]:
# Multilayer perceptron scored by accuracy on the held-out split.
# The input layer must be as wide as the assembled feature vector (one entry
# per column in `features`); the output layer has one unit per outcome class
# (win / loss / draw).
layers = [len(features), 4, 3]
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
nn_model = trainer.fit(train)
# Use the model fitted above; the name `model` is not defined in this notebook.
result = nn_model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
evaluator.evaluate(predictionAndLabels)