In [18]:
import pandas as pd
import numpy as np
import sklearn.preprocessing as skl_preprocessing
from statistics import mode

In [8]:
def preprocess_data(data):
    relevant_data = data[["release_speed", "release_pos_x", "release_pos_z", "release_pos_y", "release_spin_rate", "vx0", "vy0", "vz0", "ax", "ay", "az", "pfx_x", "pfx_z", "spin_axis", "pitch_number", "zone", "p_throws", "balls", "strikes", "pitch_type"]]
    
    relevant_data = relevant_data.dropna()
    relevant_data['p_throws'] = (relevant_data['p_throws'] == 'R').astype(int)
    
    return relevant_data

In [9]:
def score(predictions, actual):
    """
    Returns the f1 score, precision, and recall of a prediction
    
    predictions: list of predictions
    actual: list of actual outcomes
    """
    
    tp, tn, fp, fn = 0, 0, 0, 0
    for prediction, actual in zip(predictions, actual):
        if prediction == actual:
            if prediction == 1:
                tp += 1
            else:
                tn += 1
        else:
            if prediction == 1:
                fp += 1
            else:
                fn += 1
                
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    
    f1_score = 2 * precision * recall / (precision + recall)
    
    return f1_score, precision, recall

In [10]:
def k_fold_validation(k, model, data):
    scores = {}
    
    for i in range(k):
        test_data = data.iloc[i * len(data) // k: (i + 1) * len(data) // k]
        train_data = data.iloc[0:i * len(data) // k] + data.iloc[(i + 1) * len(data) // k:]
        
        model.train(train_data.drop(['pitch_type'], axis=1))
        predictions = model.predict(test_data.drop(['pitch_type'], axis=1))
        
        score = score(predictions, test_data['pitch_type'])
        
        scores[k] = score
        
    return scores

In [11]:
class BaseModel():
    def __init__():
        pass
    
    def train(self, data):
        """
        Train the model with the given input data
        """
        pass
        
    def predict(self, data):
        """
        Generate a list of predictions, one for each row in the input dataFrame
        """
        pass

In [16]:
class Tree():
    data = None
    col = ''
    condition = None
    majority = None
    entropy = 0.0
    depth = 0
    children = None
    
    def __init__(self, data, col, condition, majority, entropy, depth, children):
        self.data = data
        self.col = col
        this.condition = condition
        self.entropy = entropy
        self.depth = depth
        self.children = children
        
    def predict(self, data):
        predictions = []
        for i, row in data.iterrows():
            if self.children == None:
                return self.majority
            
            for child in self.children:
                if data[child.col] <= child.condition:
                    return child.predict(data)

In [23]:
class RandomForest(BaseModel):
    """
    IMPLEMENT HERE
    """
    
    c = 0
    max_depth = 0
    feature_reduction_percent = 0
    num_buckets = 0
    
    classifers = []
    classes = []
    
    def __init__(self, classes, c=5, max_depth=10, feature_reduction_percent=20, num_buckets=10):
        self.c = c
        self.max_depth = max_depth
        self.feature_reduction_percent = feature_reduction_percent
        self.num_buckets = num_buckets
        
        self.classes = classes
    
    def train(self, data):
        sample_size = (len(data) / self.c) * 2
        
        for i in range(self.c):
            sample_data = data.sample(n=sample_size)
            
            root = Tree(sample_data, '', None, None, 0.0, 0, None)
            tree = self.build_tree(root)
            self.classifers.append(tree)
    
    def build_tree(self, tree):
        data = tree.data
        if (len(set(data['pitch_type'])) == 1 or tree.depth == self.max_depth):
            return tree
        
        possible_features = data.iloc[:, :-1]
        num_to_keep = int(len(possible_features) * ((100 - self.feature_reduction_percent) / 100))
        features = random.choices(list(possible_features), k=num_to_keep)
        
        splitting_feature, entropy = self.optimal_split(data, features)
        depth = tree.depth + 1

        chilren = []
        if (len(pd.unique(data[splitting_feature])) < self.num_buckets):
            buckets = list(pd.unique(data[splitting_feature]))
        else:
            minimum = data[splitting_feature].min()
            maximum = data[splitting_feature].max()
            
            buckets = list(range(minimum, maximum + 1, (maximum - minimum) / self.num_buckets))
            
        for bucket in buckets:
            bucket_data = data[data[splitting_feature] <= bucket]
            majority = bucket_data['pitch_type'].mode()[0]
            
            bucket_tree = Tree(bucket_data, splitting_feature, bucket, majority, entropy, depth, None)
            bucket_tree = self.build_tree(bucket_tree)
            chilren.append(bucket_tree)
            
        tree.children = children
        return tree

    def optimal_split(data, features):
        entropies = [self.entropy(list(train[col])) for col in features]
        return train.columns[np.argmax(entropies)], max(entropies)
    
    def entropy(data):
        entropy = 0
        probabilities = []
        for item in set(data):
            prob = data.count(item) / len(data)
            probabilities.append(prob)
        for p in probabilities:
            entropy += p * log2(p)
        entropy = -entropy
        return entropy
    
    def predict(self, data):
        predictions = []
        for i in range(self.c):
            predictions.append(self.classifers[i].predict(data))
            
        return mode(predictions)

In [13]:
raw_input_data = pd.read_csv('data/Statcast_2021.csv')
df = preprocess_data(raw_input_data)
df.head()

Unnamed: 0,release_speed,release_pos_x,release_pos_z,release_pos_y,release_spin_rate,vx0,vy0,vz0,ax,ay,az,pfx_x,pfx_z,spin_axis,pitch_number,zone,p_throws,balls,strikes,pitch_type
0,92.3,1.4,6.8,54.03,2330.0,-6.833043,-134.166485,-7.361843,9.708393,26.562803,-14.083224,0.69,1.38,148.0,4,1.0,0,1,2,FF
1,80.6,1.6,6.64,54.15,2254.0,-3.700232,-117.430885,-3.266842,-6.531123,19.79339,-27.369114,-0.77,0.48,315.0,3,4.0,0,1,1,SL
2,75.5,1.46,6.88,54.34,1940.0,-1.977183,-109.901781,-1.155694,-4.872924,20.602334,-36.262184,-0.65,-0.51,328.0,2,5.0,0,1,0,CU
3,75.0,1.53,6.83,54.61,2017.0,2.37583,-109.20583,2.277617,-5.902656,19.427562,-38.284747,-0.69,-0.69,330.0,1,12.0,0,0,0,CU
4,91.2,1.49,6.66,54.15,2281.0,-5.868477,-132.500539,-6.486796,8.700586,30.11769,-15.941174,0.63,1.28,143.0,2,4.0,0,1,0,FF


In [24]:
model = RandomForest(list(pd.unique(df['pitch_type'])), 5, 10, 20, 10)
model.train(df)

ValueError: Only integers accepted as `n` values