In [16]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import when
from pyspark.sql import SparkSession

In [17]:
def init_spark():
    """Return a SparkSession, reusing an existing one if there is one.

    The session is obtained through ``getOrCreate()`` with the application
    name and the single config option set below.
    """
    builder = SparkSession.builder
    builder = builder.appName("Python Spark SQL basic example")
    builder = builder.config("spark.some.config.option", "some-value")
    return builder.getOrCreate()

In [18]:
def read_file(filename):
    """Read ``data/<filename>`` as CSV into a Spark DataFrame.

    The first row is used as the header and column types are inferred
    by Spark (``inferSchema=True``).
    """
    session = init_spark()
    csv_path = "data/" + filename
    return session.read.csv(csv_path, header=True, inferSchema=True)

In [19]:
# Read the match data and print the column types Spark inferred for it.
match_table = read_file(filename="Match.csv")
match_table.printSchema()

root
 |-- id: string (nullable = true)
 |-- country_id: string (nullable = true)
 |-- league_id: integer (nullable = true)
 |-- season: string (nullable = true)
 |-- stage: integer (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- match_api_id: integer (nullable = true)
 |-- home_team_api_id: integer (nullable = true)
 |-- away_team_api_id: integer (nullable = true)
 |-- home_team_goal: integer (nullable = true)
 |-- away_team_goal: integer (nullable = true)
 |-- home_player_X1: integer (nullable = true)
 |-- home_player_X2: integer (nullable = true)
 |-- home_player_X3: integer (nullable = true)
 |-- home_player_X4: integer (nullable = true)
 |-- home_player_X5: integer (nullable = true)
 |-- home_player_X6: integer (nullable = true)
 |-- home_player_X7: integer (nullable = true)
 |-- home_player_X8: integer (nullable = true)
 |-- home_player_X9: integer (nullable = true)
 |-- home_player_X10: integer (nullable = true)
 |-- home_player_X11: integer (nullable = true)
 |-- a

In [20]:
# Keep only the columns used below: match identifiers, the score, both
# line-ups and the bookmaker odds. Every column is listed exactly once; the
# previous list repeated BWH/BWD/BWA, which put the same column name into the
# frame twice.
match_table = match_table.select([
    "match_api_id", "date", "home_team_api_id", "away_team_api_id",
    "home_team_goal", "away_team_goal",
    "home_player_1", "home_player_2", "home_player_3", "home_player_4",
    "home_player_5", "home_player_6", "home_player_7", "home_player_8",
    "home_player_9", "home_player_10", "home_player_11",
    "away_player_1", "away_player_2", "away_player_3", "away_player_4",
    "away_player_5", "away_player_6", "away_player_7", "away_player_8",
    "away_player_9", "away_player_10", "away_player_11",
    "B365H", "B365D", "B365A",
    "BWH", "BWD", "BWA",
    "IWH", "IWD", "IWA",
    "LBH", "LBD", "LBA",
    "PSH", "PSD", "PSA",
    "WHH", "WHD", "WHA",
    "SJH", "SJD", "SJA",
    "VCH", "VCD", "VCA",
    "GBH", "GBD", "GBA",
    "BSH", "BSD", "BSA",
])
# Drop every row that has a null in any of the retained columns.
match_table = match_table.na.drop()

# Label each match with its result: the home team's id when the home side
# scored more, the away team's id when the away side scored more, and the
# literal "Draw" otherwise. Branches are evaluated in order, so this chain is
# equivalent to the nested when/otherwise it replaces.
match_table = match_table.withColumn(
    "winner",
    when(match_table["home_team_goal"] > match_table["away_team_goal"], match_table["home_team_api_id"])
    .when(match_table["home_team_goal"] < match_table["away_team_goal"], match_table["away_team_api_id"])
    .otherwise("Draw"),
)

In [21]:
# Print the schema again to check the columns kept by the select above and the added "winner" column.
match_table.printSchema()

root
 |-- match_api_id: integer (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- home_team_api_id: integer (nullable = true)
 |-- away_team_api_id: integer (nullable = true)
 |-- home_team_goal: integer (nullable = true)
 |-- away_team_goal: integer (nullable = true)
 |-- home_player_1: integer (nullable = true)
 |-- home_player_2: integer (nullable = true)
 |-- home_player_3: integer (nullable = true)
 |-- home_player_4: integer (nullable = true)
 |-- home_player_5: integer (nullable = true)
 |-- home_player_6: integer (nullable = true)
 |-- home_player_7: integer (nullable = true)
 |-- home_player_8: integer (nullable = true)
 |-- home_player_9: integer (nullable = true)
 |-- home_player_10: integer (nullable = true)
 |-- home_player_11: integer (nullable = true)
 |-- away_player_1: integer (nullable = true)
 |-- away_player_2: integer (nullable = true)
 |-- away_player_3: integer (nullable = true)
 |-- away_player_4: integer (nullable = true)
 |-- away_player_5: integer

In [23]:
# Narrow the match data to the two team ids, the actual winner and the odds
# columns of every bookmaker.
betting_table = match_table.select(
    "home_team_api_id", "away_team_api_id", "winner",
    "B365H", "B365D", "B365A",
    'BWH', 'BWD', 'BWA',
    'IWH', 'IWD', 'IWA',
    'LBH', 'LBD', 'LBA',
    'PSH', 'PSD', 'PSA',
    'WHH', 'WHD', 'WHA',
    'SJH', 'SJD', 'SJA',
    'VCH', 'VCD', 'VCA',
    'GBH', 'GBD', 'GBA',
    'BSH', 'BSD', 'BSA',
)

# Derive the B365 pick from its three odds. Branches are tried in order:
#   H < D and H < A  -> home team id
#   H < D (only)     -> away team id
#   D < A            -> "Draw"
#   anything else    -> away team id
betting_table = betting_table.withColumn(
    "predictionB365",
    when(
        (betting_table["B365H"] < betting_table["B365D"])
        & (betting_table["B365H"] < betting_table["B365A"]),
        betting_table["home_team_api_id"],
    )
    .when(betting_table["B365H"] < betting_table["B365D"], betting_table["away_team_api_id"])
    .when(betting_table["B365D"] < betting_table["B365A"], "Draw")
    .otherwise(betting_table["away_team_api_id"]),
)

# Flag each match: 1 when the pick equals the actual winner, 0 otherwise.
betting_table = betting_table.withColumn(
    "accuracyB365",
    when(betting_table["winner"] == betting_table["predictionB365"], 1).otherwise(0),
)

# Count how many matches were predicted correctly / incorrectly.
bet = betting_table.select("accuracyB365")
bet.groupBy("accuracyB365").count().show()



+------------+-----+
|accuracyB365|count|
+------------+-----+
|           1| 1353|
|           0| 1274|
+------------+-----+



In [27]:
# Derive the BW pick from its three odds. Branches are tried in order:
#   H < D and H < A  -> home team id
#   H < D (only)     -> away team id
#   D < A            -> "Draw"
#   anything else    -> away team id
betting_table = betting_table.withColumn(
    "predictionBW",
    when(
        (betting_table["BWH"] < betting_table["BWD"])
        & (betting_table["BWH"] < betting_table["BWA"]),
        betting_table["home_team_api_id"],
    )
    .when(betting_table["BWH"] < betting_table["BWD"], betting_table["away_team_api_id"])
    .when(betting_table["BWD"] < betting_table["BWA"], "Draw")
    .otherwise(betting_table["away_team_api_id"]),
)

# Flag each match: 1 when the pick equals the actual winner, 0 otherwise.
betting_table = betting_table.withColumn(
    "accuracyBW",
    when(betting_table["winner"] == betting_table["predictionBW"], 1).otherwise(0),
)

# Count how many matches were predicted correctly / incorrectly.
bet = betting_table.select("accuracyBW")
bet.groupBy("accuracyBW").count().show()

+----------+-----+
|accuracyBW|count|
+----------+-----+
|         1| 1350|
|         0| 1277|
+----------+-----+



In [28]:
# Derive the IW pick from its three odds. Branches are tried in order:
#   H < D and H < A  -> home team id
#   H < D (only)     -> away team id
#   D < A            -> "Draw"
#   anything else    -> away team id
betting_table = betting_table.withColumn(
    "predictionIW",
    when(
        (betting_table["IWH"] < betting_table["IWD"])
        & (betting_table["IWH"] < betting_table["IWA"]),
        betting_table["home_team_api_id"],
    )
    .when(betting_table["IWH"] < betting_table["IWD"], betting_table["away_team_api_id"])
    .when(betting_table["IWD"] < betting_table["IWA"], "Draw")
    .otherwise(betting_table["away_team_api_id"]),
)

# Flag each match: 1 when the pick equals the actual winner, 0 otherwise.
betting_table = betting_table.withColumn(
    "accuracyIW",
    when(betting_table["winner"] == betting_table["predictionIW"], 1).otherwise(0),
)

# Count how many matches were predicted correctly / incorrectly.
bet = betting_table.select("accuracyIW")
bet.groupBy("accuracyIW").count().show()

+----------+-----+
|accuracyIW|count|
+----------+-----+
|         1| 1352|
|         0| 1275|
+----------+-----+



In [29]:
# Derive the WH pick from its three odds. Branches are tried in order:
#   H < D and H < A  -> home team id
#   H < D (only)     -> away team id
#   D < A            -> "Draw"
#   anything else    -> away team id
betting_table = betting_table.withColumn(
    "predictionWH",
    when(
        (betting_table["WHH"] < betting_table["WHD"])
        & (betting_table["WHH"] < betting_table["WHA"]),
        betting_table["home_team_api_id"],
    )
    .when(betting_table["WHH"] < betting_table["WHD"], betting_table["away_team_api_id"])
    .when(betting_table["WHD"] < betting_table["WHA"], "Draw")
    .otherwise(betting_table["away_team_api_id"]),
)

# Flag each match: 1 when the pick equals the actual winner, 0 otherwise.
betting_table = betting_table.withColumn(
    "accuracyWH",
    when(betting_table["winner"] == betting_table["predictionWH"], 1).otherwise(0),
)

# Count how many matches were predicted correctly / incorrectly.
bet = betting_table.select("accuracyWH")
bet.groupBy("accuracyWH").count().show()

+----------+-----+
|accuracyWH|count|
+----------+-----+
|         1| 1356|
|         0| 1271|
+----------+-----+



In [30]:
# Derive the SJ pick from its three odds. Branches are tried in order:
#   H < D and H < A  -> home team id
#   H < D (only)     -> away team id
#   D < A            -> "Draw"
#   anything else    -> away team id
betting_table = betting_table.withColumn(
    "predictionSJ",
    when(
        (betting_table["SJH"] < betting_table["SJD"])
        & (betting_table["SJH"] < betting_table["SJA"]),
        betting_table["home_team_api_id"],
    )
    .when(betting_table["SJH"] < betting_table["SJD"], betting_table["away_team_api_id"])
    .when(betting_table["SJD"] < betting_table["SJA"], "Draw")
    .otherwise(betting_table["away_team_api_id"]),
)

# Flag each match: 1 when the pick equals the actual winner, 0 otherwise.
betting_table = betting_table.withColumn(
    "accuracySJ",
    when(betting_table["winner"] == betting_table["predictionSJ"], 1).otherwise(0),
)

# Count how many matches were predicted correctly / incorrectly.
bet = betting_table.select("accuracySJ")
bet.groupBy("accuracySJ").count().show()


+----------+-----+
|accuracySJ|count|
+----------+-----+
|         1| 1351|
|         0| 1276|
+----------+-----+



In [37]:
# Derive the BS pick from its three odds. Branches are tried in order:
#   H < D and H < A  -> home team id
#   H < D (only)     -> away team id
#   D < A            -> "Draw"
#   anything else    -> away team id
betting_table = betting_table.withColumn(
    "predictionBS",
    when(
        (betting_table["BSH"] < betting_table["BSD"])
        & (betting_table["BSH"] < betting_table["BSA"]),
        betting_table["home_team_api_id"],
    )
    .when(betting_table["BSH"] < betting_table["BSD"], betting_table["away_team_api_id"])
    .when(betting_table["BSD"] < betting_table["BSA"], "Draw")
    .otherwise(betting_table["away_team_api_id"]),
)

# Flag each match: 1 when the pick equals the actual winner, 0 otherwise.
betting_table = betting_table.withColumn(
    "accuracyBS",
    when(betting_table["winner"] == betting_table["predictionBS"], 1).otherwise(0),
)

# Count how many matches were predicted correctly / incorrectly.
bet = betting_table.select("accuracyBS")
bet.groupBy("accuracyBS").count().show()

+----------+-----+
|accuracyBS|count|
+----------+-----+
|         1| 1352|
|         0| 1275|
+----------+-----+



In [38]:
import matplotlib.pyplot as plt

# NOTE(review): `new_df` is not defined in any cell visible in this notebook;
# it presumably comes from a cell that was deleted or run out of order.
# Confirm where it is created so the notebook survives Restart & Run All.
pandas_df = new_df.toPandas()

# DataFrame.plot() returns a matplotlib Axes, which has no .show() method
# (the previous `pandas_df.plot().show()` raised AttributeError). Draw the
# plot first, then render the figure through pyplot.
pandas_df.plot()
plt.show()



+------------+----------+----------+----------+----------+-----+
|accuracyB365|accuracyBW|accuracyIW|accuracySJ|accuracyBS|count|
+------------+----------+----------+----------+----------+-----+
|           0|         0|         0|         0|         0| 1241|
|           0|         1|         1|         0|         0|    1|
|           0|         1|         1|         1|         1|    3|
|           1|         0|         1|         1|         0|    4|
|           1|         1|         0|         1|         1|   12|
|           0|         1|         0|         0|         0|    3|
|           1|         0|         0|         0|         1|    2|
|           0|         0|         1|         0|         0|   14|
|           1|         1|         1|         0|         1|    1|
|           1|         1|         0|         1|         0|    6|
|           1|         1|         0|         0|         1|    4|
|           0|         0|         1|         1|         1|    1|
|           0|         1|

In [2]:
# Scratch cell: prints a fixed string; it does not take part in the analysis.
print('test')

test
