## Imports

In [1]:
# imports
import math
import numpy as np
import pandas as pd
from pprint import pprint
from sklearn.metrics import confusion_matrix

## Functions

In [2]:
def partition(a):
    """Group the positions of `a` by the distinct value found at each position.

    Parameters
    ----------
    a : array-like
        Values to group (a NumPy array or a pandas Series, for example).

    Returns
    -------
    dict
        Maps every unique value of `a` to a NumPy integer array holding the
        zero-based positions (not index labels) where that value occurs.
    """
    # Compare on a plain ndarray: pandas removed Series.nonzero() in 1.0, so
    # calling .nonzero() directly on the boolean Series fails on current pandas.
    values = np.asarray(a)
    return {c: (values == c).nonzero()[0] for c in np.unique(values)}

In [3]:
def entropy(s):
    """Return the Shannon entropy (base 2) of the values in `s`.

    Each distinct value contributes -p * log2(p), where p is the fraction of
    entries of `s` equal to that value.
    """
    # only the counts are needed; the unique values themselves are discarded
    _, occurrences = np.unique(s, return_counts=True)

    total = 0
    for prob in occurrences / len(s):
        total = total - prob * np.log2(prob)

    return total

In [4]:
def information_gain(y, x):
    """Return the information gain of feature `x` with respect to labels `y`.

    Computed as H(Y) - H(Y|X): the unconditional entropy of `y` minus the
    entropy of `y` within each distinct value of `x`, weighted by how often
    that value occurs in `x`.

    `y` is indexed with the boolean mask `x == value`, so both arguments must
    be positionally aligned and `y` must support boolean-mask indexing.
    """
    # relative frequency of every distinct value taken by the feature
    levels, level_counts = np.unique(x, return_counts=True)
    weights = level_counts / len(x)

    # begin with the unconditional entropy H(Y) ...
    gain = entropy(y)

    # ... and subtract each weighted conditional entropy H(Y | X = level)
    for level, weight in zip(levels, weights):
        gain = gain - weight * entropy(y[x == level])

    return gain

In [5]:
def split_information(y, x):
    """Return the split information of feature `x`.

    This is the base-2 entropy of the distribution of values in `x`: every
    distinct value contributes -f * log2(f), with f its relative frequency.
    The `y` argument is accepted but not used by the computation.
    """
    # size of each partition that splitting on x would create
    _, sizes = np.unique(x, return_counts=True)

    info = 0
    for share in sizes / len(x):
        info = info - share * np.log2(share)

    return info

In [6]:
def safe_divide(n, d):
    """Return n / d, or 0 when the denominator `d` is falsy (e.g. zero)."""
    if not d:
        return 0
    return n / d

In [7]:
def is_pure(s):
    """Return True when `s` contains exactly one distinct value."""
    distinct = set(s)
    return len(distinct) == 1

In [8]:
def build_tree(x, y, crit):
    """Recursively grow a decision tree represented as nested dictionaries.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature columns; rows are selected positionally with `take`.
    y : pandas.Series (or any object with `take` and boolean-mask indexing)
        Class labels, positionally aligned with the rows of `x`.
    crit : str
        'ID3' selects the attribute with the highest information gain;
        'C4.5' selects the attribute with the highest gain ratio
        (information gain divided by split information).

    Returns
    -------
    dict, type of `y`, or str
        * `y` itself (a leaf) when it is empty, pure, or no attribute yields
          an information gain of at least 1e-6;
        * the string 'Invalid splitting criterion...' when a split would be
          needed but `crit` is neither 'ID3' nor 'C4.5';
        * otherwise a dict keyed by '<attribute> = <value>' whose values are
          the subtrees built from the matching rows.
    """
    # If there could be no split, just return the original set
    if is_pure(y) or len(y) == 0:
        return y

    columns = x.columns.values

    # information gain of every candidate attribute
    gain = np.array([information_gain(y, x[col]) for col in columns])

    if crit not in ('ID3', 'C4.5'):
        return 'Invalid splitting criterion...'

    # Stop before selecting an attribute when nothing is informative. This
    # also covers a frame with no feature columns, where argmax on the empty
    # gain array would otherwise raise.
    if np.all(gain < 1e-6):
        return y

    if crit == 'ID3':
        selected_attr = np.argmax(gain)
    else:
        split_info = np.array([split_information(y, x[col]) for col in columns])
        # safe_divide maps a zero split information to a ratio of 0
        gain_ratio = [safe_divide(g, s) for g, s in zip(gain, split_info)]
        selected_attr = np.argmax(gain_ratio)

    attr_name = columns[selected_attr]

    # positions of the rows belonging to each value of the chosen attribute
    sets = partition(x[attr_name])

    # build one subtree per attribute value
    result = {}
    for k, v in sets.items():
        y_subset = y.take(v, axis=0)
        x_subset = x.take(v, axis=0)
        result["%s = %s" % (attr_name, k)] = build_tree(x_subset, y_subset, crit)

    return result

In [9]:
def evaluation_metrics(predictions, actual=None):
    """Print the confusion matrix, accuracy, precision, recall and F1 score.

    Parameters
    ----------
    predictions : array-like
        Predicted labels, in the same order as `actual`.
    actual : array-like, optional
        True labels. When omitted, the 'Label' column of the notebook-level
        `test` frame is used, which is what existing single-argument calls rely on.

    Raises
    ------
    ValueError
        If the confusion matrix is not 2x2 (the metrics below are defined for
        binary classification only).

    Notes
    -----
    The four cells are read in the order tn, fp, fn, tp from the flattened
    matrix, so the class in the second row/column is treated as positive.
    Presumably that is 'Win', given scikit-learn's sorted label order.
    Ratios with a zero denominator are reported as 0.
    """
    if actual is None:
        actual = test['Label']

    # confusion matrix
    conf_mat = confusion_matrix(actual, predictions)
    print('Confusion Matrix:\n{}'.format(conf_mat))

    if conf_mat.shape != (2, 2):
        raise ValueError(
            'evaluation_metrics expects exactly two classes, got a {}x{} confusion matrix'.format(*conf_mat.shape)
        )

    # compute metrics from tn, fp, fn, tp
    tn, fp, fn, tp = conf_mat.ravel()
    accuracy = safe_divide(tp + tn, tp + fp + fn + tn)
    precision = safe_divide(tp, tp + fp)
    recall = safe_divide(tp, tp + fn)
    f1 = safe_divide(2 * (recall * precision), recall + precision)

    print('Accuracy: {}\nPrecision: {}\nRecall: {}\nF1 Score: {}'.format(accuracy, precision, recall, f1))

## Read in Data

In [10]:
# Read the tab-separated train/test splits, indexed by the ID column, and
# discard the Opponent and Date columns.
train = pd.read_csv('./train.txt', sep='\t', index_col='ID')
train = train.drop(columns=['Opponent', 'Date'])
test = pd.read_csv('./test.txt', sep='\t', index_col='ID')
test = test.drop(columns=['Opponent', 'Date'])

# Separate the target column from the feature columns of the training split.
y = train['Label']
X = train.drop(columns='Label')

## Decision Trees (ID3) - Building

In [20]:
# Grow the tree on the training features/labels with the 'ID3' criterion
# (highest information gain) and pretty-print the resulting nested dict.
tree_id3 = build_tree(X, y, 'ID3')
pprint(tree_id3)

{'Media = 1-NBC': {'Is_Opponent_in_AP25_Preseason = In': ID
3      Win
7      Win
19    Lose
23    Lose
Name: Label, dtype: object,
                   'Is_Opponent_in_AP25_Preseason = Out': {'Is_Home_or_Away = Away': ID
11    Win
Name: Label, dtype: object,
                                                           'Is_Home_or_Away = Home': ID
1      Win
4      Win
6      Win
10     Win
14     Win
15    Lose
16    Lose
20     Win
22     Win
Name: Label, dtype: object}},
 'Media = 2-ESPN': ID
17    Win
Name: Label, dtype: object,
 'Media = 3-FOX': ID
12    Lose
Name: Label, dtype: object,
 'Media = 4-ABC': {'Is_Opponent_in_AP25_Preseason = In': ID
5     Lose
24    Lose
Name: Label, dtype: object,
                   'Is_Opponent_in_AP25_Preseason = Out': ID
2      Win
8      Win
9      Win
13    Lose
18    Lose
Name: Label, dtype: object},
 'Media = 5-CBS': ID
21    Lose
Name: Label, dtype: object}


## Decision Trees (ID3) - Evaluation

In [12]:
# {
#     'Media = 1-NBC': {
#         'Is_Opponent_in_AP25_Preseason = In': 
#             Win (by root node majority)
#         'Is_Opponent_in_AP25_Preseason = Out': {
#             'Is_Home_or_Away = Away': 
#                 Win                                               
#             'Is_Home_or_Away = Home':
#                 Win (by node majority)
#         }
#     },
#     'Media = 2-ESPN': 
#         Win
#     'Media = 3-FOX': 
#         Lose
#     'Media = 4-ABC': {
#         'Is_Opponent_in_AP25_Preseason = In': 
#             Lose (node purity)
#         'Is_Opponent_in_AP25_Preseason = Out':
#             Win (by node majority)
#     },
#     'Media = 5-CBS':
#         Lose
# }

# Hard-coded ID3 labels, one per row of `test` in row order.
# NOTE(review): presumably read off the annotated tree in the comment above by hand — confirm.
predictions_id3 = ['Win','Win','Win','Lose','Win','Win','Win','Win','Win','Lose','Win','Lose']
test['Predictions_id3'] = predictions_id3

In [13]:
# Print confusion matrix, accuracy, precision, recall and F1 for the ID3 predictions.
evaluation_metrics(test['Predictions_id3'])

Confusion Matrix:
[[2 1]
 [1 8]]
Accuracy: 0.8333333333333334
Precision: 0.8888888888888888
Recall: 0.8888888888888888
F1 Score: 0.8888888888888888


## Decision Trees (C4.5) - Building

In [21]:
# Grow the tree on the same training data with the 'C4.5' criterion
# (highest gain ratio) and pretty-print the resulting nested dict.
tree_c45 = build_tree(X, y, 'C4.5')
pprint(tree_c45)

{'Is_Opponent_in_AP25_Preseason = In': {'Is_Home_or_Away = Away': ID
5     Lose
12    Lose
24    Lose
Name: Label, dtype: object,
                                        'Is_Home_or_Away = Home': ID
3      Win
7      Win
19    Lose
23    Lose
Name: Label, dtype: object},
 'Is_Opponent_in_AP25_Preseason = Out': {'Media = 1-NBC': {'Is_Home_or_Away = Away': ID
11    Win
Name: Label, dtype: object,
                                                           'Is_Home_or_Away = Home': ID
1      Win
4      Win
6      Win
10     Win
14     Win
15    Lose
16    Lose
20     Win
22     Win
Name: Label, dtype: object},
                                         'Media = 2-ESPN': ID
17    Win
Name: Label, dtype: object,
                                         'Media = 4-ABC': ID
2      Win
8      Win
9      Win
13    Lose
18    Lose
Name: Label, dtype: object,
                                         'Media = 5-CBS': ID
21    Lose
Name: Label, dtype: object}}


In [15]:
# {
#     'Is_Opponent_in_AP25_Preseason = In': {
#         'Is_Home_or_Away = Away': 
#             Lose (node purity)
#         'Is_Home_or_Away = Home': 
#             Win (by root node majority)
#     },
#     'Is_Opponent_in_AP25_Preseason = Out': {
#         'Media = 1-NBC': {
#             'Is_Home_or_Away = Away': 
#                 Win                                               
#             'Is_Home_or_Away = Home': 
#                 Win (by node majority)
#         },

###       'Media = 3-FOX': 
###           Win (by root node majority)

#         'Media = 2-ESPN': 
#             Win
#         'Media = 4-ABC': 
#             Win (by node majority)
#         'Media = 5-CBS':
#             Lose
#     }
# }

# Hard-coded C4.5 labels, one per row of `test` in row order.
# NOTE(review): presumably read off the annotated tree in the comment above by hand — confirm.
predictions_c45 = ['Win','Win','Win','Win','Win','Win','Win','Win','Win','Lose','Win','Lose']
test['Predictions_c45'] = predictions_c45

In [16]:
# Print confusion matrix, accuracy, precision, recall and F1 for the C4.5 predictions.
evaluation_metrics(test['Predictions_c45'])

Confusion Matrix:
[[2 1]
 [0 9]]
Accuracy: 0.9166666666666666
Precision: 0.9
Recall: 1.0
F1 Score: 0.9473684210526316


## Naive-Bayes - Building & Evaluation

In [17]:
def naive_bayes(train, test_row):
    """Classify one row as 'Win' or 'Lose' with a categorical naive Bayes model.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows; must contain a 'Label' column holding 'Win'/'Lose'.
        Every other column is treated as a categorical feature.
    test_row : mapping (e.g. pandas.Series)
        Row to classify; must provide a value for every feature column of
        `train`. Extra entries are ignored.

    Returns
    -------
    str
        'Win' when the unnormalised score for 'Win' is greater than or equal
        to the one for 'Lose', otherwise 'Lose'.

    Notes
    -----
    No smoothing is applied: a feature value never seen with a class drives
    that class's score to zero. A class with no training rows gets score 0
    instead of raising a ZeroDivisionError.
    """
    feature_cols = train.drop('Label', axis=1).columns

    def class_score(label):
        # p(label) * prod_i p(x_i = test_row[x_i] | label), estimated by counting
        subset = train[train['Label'] == label]
        prior = len(subset) / len(train)
        if len(subset) == 0:
            # class absent from the training data: no likelihoods can be estimated
            return 0.0
        likelihoods = [
            len(subset[subset[col] == test_row[col]]) / len(subset)
            for col in feature_cols
        ]
        return np.prod(np.array(likelihoods)) * prior

    win = class_score('Win')
    lose = class_score('Lose')

    # ties go to 'Win'
    return 'Win' if win >= lose else 'Lose'

In [18]:
# Classify every row of `test` (by position) with the naive Bayes model built
# from `train`. A comprehension keeps loop temporaries out of the notebook namespace.
predictions_nb = [naive_bayes(train, test.iloc[i]) for i in range(len(test))]

In [22]:
# Store the naive Bayes predictions as a new column of `test`, then display the list.
test['Predictions_nb'] = predictions_nb
predictions_nb

['Win',
 'Win',
 'Win',
 'Lose',
 'Win',
 'Lose',
 'Win',
 'Win',
 'Win',
 'Lose',
 'Win',
 'Lose']

In [19]:
# Print confusion matrix, accuracy, precision, recall and F1 for the naive Bayes predictions.
evaluation_metrics(test['Predictions_nb'])

Confusion Matrix:
[[2 1]
 [2 7]]
Accuracy: 0.75
Precision: 0.875
Recall: 0.7777777777777778
F1 Score: 0.823529411764706
