In [1]:
# findspark.init() is deliberately called before the pyspark import below;
# presumably it is what makes the local Spark installation importable, so keep
# this order.
import findspark
findspark.init()

import pyspark
# getOrCreate() returns the already-running SparkContext if there is one, so
# re-running this cell on a live kernel does not fail the way constructing a
# second SparkContext would.
sc = pyspark.SparkContext.getOrCreate()

In [2]:
from pyspark.sql import SQLContext
# SQL entry point bound to the SparkContext `sc`; used later in this notebook
# to turn the cleaned RDDs into DataFrames via createDataFrame.
sqlContext = SQLContext(sc)
# Wildcard import: the schema-building code in this notebook takes StringType,
# IntegerType, StructField and StructType from this module.
from pyspark.sql.types import *

In [3]:
#  0 game num
#  1 date
#  2 time
#  3 tv coverage
#  4 'Box Score'
#  5 home/away
#  6 opponent
#  7 W/L
#  8 OT
#  9 score1
# 10 score2
# 11 cumulative wins (after game)
# 12 cumulative losses (after game)
# 13 streak
# 14 empty 
def to_24_hour(time):
    """Convert a 12-hour clock string such as '7:30p' into an HHMM integer.

    Args:
        time: string of the form 'H:MM' or 'HH:MM' followed by a marker
            containing 'a' or 'p' (for example '7:30p', '10:00a').

    Returns:
        "" when `time` is empty; otherwise the time on a 24-hour clock
        encoded as hour * 100 + minute (e.g. 1930 for '7:30p'), or None
        when the string carries neither an 'a' nor a 'p' marker.

    Raises:
        IndexError: if a non-empty `time` contains no ':'.
        ValueError: if the hour or the first two minute characters are
            not numeric.
    """
    if len(time) == 0:
        return ""
    pieces = time.split(':')
    # 12 o'clock is hour 0 of its half-day: 12:xxa -> 00xx, 12:xxp -> 12xx.
    # Simply adding 1200 for 'p' would turn 12:30p into 2430.
    hour = int(pieces[0]) % 12
    minute = int(pieces[1][:2])
    if 'p' in time:
        return (hour + 12) * 100 + minute
    if 'a' in time:
        return hour * 100 + minute
    # No am/pm marker: the half-day is unknown, so no time is reported.
    return None
    
def clean_games(line):
    """Turn one comma-separated schedule line into a flat feature row.

    The input columns follow the layout listed at the top of this cell
    (1 = date, 2 = time, 5 = home/away marker, 6 = opponent, 7 = W/L,
    11/12 = cumulative wins/losses after the game).

    Args:
        line: one raw text line; fields are split on ',' with no quote
            handling.

    Returns:
        A list
        [day_of_week, month, day_of_month, year, time, location, opponent,
         wins_before_game, losses_before_game, outcome]
        where day_of_month, year and the two records are ints, location is
        'away' when column 5 is '@' and 'home' otherwise, and time is the
        value returned by to_24_hour, or '' when the time column is empty.

    Raises:
        IndexError: if the line has too few columns or the date has fewer
            than four space-separated parts.
        ValueError: if the day, year or cumulative records are not numeric.
    """
    data = line.split(',')

    # e.g. 'Wed Oct 28 2009' -> ['Wed', 'Oct', 28, 2009]
    date_parts = data[1].split()
    date_parts[2] = int(date_parts[2])
    date_parts[3] = int(date_parts[3])

    # Only the first token of the time column is the clock time; anything
    # after the first space is dropped.
    game_time = data[2]
    if game_time != '':
        game_time = to_24_hour(game_time.split()[0])

    location = 'away' if data[5] == '@' else 'home'

    # The file records the win/loss totals AFTER each game; subtract this
    # game's result so the features describe the record going INTO the game.
    wins = int(data[11])
    losses = int(data[12])
    if data[7] == 'W':
        wins -= 1
    elif data[7] == 'L':
        losses -= 1

    # The two score columns (9 and 10) are intentionally untouched: they are
    # not part of the returned row. An earlier revision swapped them after
    # comparing them as strings, which orders '99' after '100' and had no
    # effect on the output.
    return date_parts + [game_time, location, data[6], wins, losses, data[7]]

#  0 day_of_week
#  1 month
#  2 day_of_month
#  3 year
#  4 time
#  5 home/away
#  6 opponent
#  7 cumulative wins (before game)
#  8 cumulative losses (before game)
#  9 W/L


def load_games(path):
    """Read schedule text files under `path` into an RDD of cleaned game rows.

    Blank lines and lines whose first character is 'G' are skipped (the
    latter are presumably repeated header rows -- TODO confirm against the
    data files). Every remaining line goes through clean_games, and rows
    whose time field (index 4 of the cleaned row) is '' are dropped.

    Args:
        path: location passed straight to sc.textFile.

    Returns:
        An RDD of lists in the clean_games output layout.
    """
    return sc.textFile(path)\
        .filter(lambda line : line != '' and line[0] != 'G')\
        .map(clean_games)\
        .filter(lambda data : data[4] != '')


# Same cleaning pipeline for both data sets; only the input path differs.
all_games_RDD = load_games("./data/train")

playoffs_2016_RDD = load_games("./data/test")

In [4]:
# Mark the cleaned training RDD for caching: it is reused below, once when
# inferring the schema and again when building the training DataFrame.
all_games_RDD.cache()

PythonRDD[4] at RDD at PythonRDD.scala:43

In [5]:
# Peek at one cleaned playoff row. first() fetches only that row instead of
# collecting the entire RDD to the driver just to index element 0.
playoffs_2016_RDD.first()

['Sat', 'Apr', 16, 2016, 1530, 'home', 'Houston Rockets', 73, 9, 'W']

In [6]:
# Space-separated DataFrame column names, listed in the same order as the
# elements of the row returned by clean_games.
data_strings = 'day_of_week month day_of_month year time location opponent season_wins season_losses outcome'
def map_type(data):
    """Return the Spark SQL type matching each value of one sample row.

    Args:
        data: iterable of values taken from a single cleaned row.

    Returns:
        A list with one entry per value, in order: StringType() for a
        ``str`` and IntegerType() for an ``int``.

    Raises:
        TypeError: if a value is neither ``str`` nor ``int``. Skipping such
            a value would shift every later type by one position relative
            to the column names it is zipped with.
    """
    to_return = []
    for position, d in enumerate(data):
        # Exact type checks on purpose: subclasses (e.g. bool) are not mapped.
        if type(d) == str:
            to_return.append(StringType())
        elif type(d) == int:
            to_return.append(IntegerType())
        else:
            raise TypeError("map_type: unsupported value type %s at position %d"
                            % (type(d).__name__, position))
    return to_return

# Infer the column types from one sample row. first() fetches just that row
# rather than collecting the whole training RDD to the driver.
types = map_type(all_games_RDD.first())

column_names = data_strings.split()
# zip() would silently truncate to the shorter sequence and mislabel columns,
# so fail loudly if the sample row and the name list disagree in length.
if len(types) != len(column_names):
    raise ValueError("schema mismatch: %d inferred types for %d column names"
                     % (len(types), len(column_names)))

# Every column is declared nullable (third StructField argument).
struct_fields = [StructField(title, sql_type, True)
                 for title, sql_type in zip(column_names, types)]
schema = StructType(struct_fields)

In [7]:
# Build the training and test DataFrames from the two cleaned RDDs, both with
# the same schema.
train_df, test_df = [
    sqlContext.createDataFrame(games_rdd, schema)
    for games_rdd in (all_games_RDD, playoffs_2016_RDD)
]
# The test DataFrame is transformed by the fitted models further down.
test_df.cache()
train_df.show(5)

+-----------+-----+------------+----+----+--------+--------------------+-----------+-------------+-------+
|day_of_week|month|day_of_month|year|time|location|            opponent|season_wins|season_losses|outcome|
+-----------+-----+------------+----+----+--------+--------------------+-----------+-------------+-------+
|        Wed|  Oct|          28|2009|1930|    home|     Houston Rockets|          0|            0|      L|
|        Fri|  Oct|          30|2009|1900|    away|        Phoenix Suns|          0|            1|      L|
|        Wed|  Nov|           4|2009|1930|    home|   Memphis Grizzlies|          0|            2|      W|
|        Fri|  Nov|           6|2009|1930|    home|Los Angeles Clippers|          1|            2|      L|
|        Sun|  Nov|           8|2009|1800|    away|    Sacramento Kings|          1|            3|      L|
+-----------+-----+------------+----+----+--------+--------------------+-----------+-------------+-------+
only showing top 5 rows



In [8]:
from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.ml.feature import VectorAssembler


def _make_indexer(column):
    """Build a StringIndexer that reads `column` and writes '<column>_indexed'."""
    return StringIndexer(inputCol=column, outputCol=column + '_indexed')


# One indexer per string-valued column of the schema.
day_of_week_indexer = _make_indexer('day_of_week')
month_indexer = _make_indexer('month')
location_indexer = _make_indexer('location')
opponent_indexer = _make_indexer('opponent')
# 'outcome_indexed' is used as the label column by the classifiers, so it is
# not listed among the feature columns below.
outcome_indexer = _make_indexer('outcome')

string_cols = [
    'day_of_week_indexed',
    'month_indexed',
    'location_indexed',
    'opponent_indexed',
]
numeric_cols = [
    'day_of_month',
    'year',
    'time',
    'season_wins',
    'season_losses',
]

# Combine the indexed string columns and the numeric columns into the single
# 'features' vector column.
assembler = VectorAssembler(outputCol='features',
                            inputCols=string_cols + numeric_cols)

In [9]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier

In [10]:
# Baseline model: a decision tree predicting the indexed outcome from the
# assembled feature vector.
classifier = DecisionTreeClassifier(featuresCol='features',
                                    labelCol='outcome_indexed')

# Stage order matters: the indexers create the columns the assembler reads,
# and the assembler creates the column the classifier reads.
pipeline = Pipeline(stages=[
    day_of_week_indexer, month_indexer, location_indexer, opponent_indexer,
    outcome_indexer,
    assembler,
    classifier,
])
model = pipeline.fit(train_df)

In [11]:
# Run the fitted pipeline over the test DataFrame.
predictions = model.transform(test_df)


In [12]:
#predictions.select("prediction", "outcome_indexed").toPandas()

In [13]:
# IndexToString is experimental
# converter = IndexToString(inputCol="prediction", outputCol="predicted_val", labels=outcome_indexer)
# converted = converter.transform(predictions)
# predictions.select("predicted_val", "outcome").toPandas()

In [14]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Compare the 'prediction' column with the true label column and report the
# complement of the metric as the test error.
# NOTE(review): the "precision" metric is stored in `accuracy` and treated as
# such; newer Spark releases are believed to spell this metric "accuracy" --
# confirm against the installed version before upgrading.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="outcome_indexed",
    metricName="precision",
)
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))

Test Error = 0.375 


In [15]:
from pyspark.ml.classification import *

# Candidate classifiers, all trained on the same label column.
# NOTE(review): no seed is set on any of them, so the reported errors may
# differ between runs -- consider fixing seeds if reproducibility matters.
decision_tree = DecisionTreeClassifier(labelCol = 'outcome_indexed')
logistic_regression = LogisticRegression(labelCol="outcome_indexed")
random_forest = RandomForestClassifier(labelCol="outcome_indexed")
gradient_boosted_tree = GBTClassifier(labelCol="outcome_indexed")
# The perceptron's first layer has to be as wide as the assembled feature
# vector, so derive it from the assembler's input column lists instead of
# hard-coding 9. The final 2 presumably matches the two outcome labels (W/L).
layers = [len(string_cols) + len(numeric_cols), 6, 5, 4, 3, 2]
multilayer_perceptron = MultilayerPerceptronClassifier(labelCol="outcome_indexed", layers=layers)

classifiers = [decision_tree, logistic_regression, random_forest, gradient_boosted_tree,
               multilayer_perceptron]
names = ['decision_tree', 'logistic_regression', 'random_forest','gradient_boosted_tree',
               'multilayer_perceptron']

# The preprocessing stages are identical for every candidate; only the final
# classifier stage changes, so build the shared prefix once.
preprocessing_stages = [day_of_week_indexer,
                        month_indexer,
                        location_indexer,
                        opponent_indexer,
                        outcome_indexer,
                        assembler]

report_lines = []
for classifier, name in zip(classifiers, names):
    pipeline = Pipeline(stages=preprocessing_stages + [classifier])
    model = pipeline.fit(train_df)
    prediction = model.transform(test_df)
    # Uses the evaluator configured in the earlier evaluation cell.
    accuracy = evaluator.evaluate(prediction)
    report_lines.append(name + ": Test Error = %g\n" % (1.0 - accuracy))
output = "".join(report_lines)
print(output)



decision_tree: Test Error = 0.375
logistic_regression: Test Error = 0.375
random_forest: Test Error = 0.375
gradient_boosted_tree: Test Error = 0.458333
multilayer_perceptron: Test Error = 0.375

