In [3]:
from google.colab import files
import pandas as pd
import numpy as np
import io
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

uploaded = files.upload()
year1 = pd.read_csv(io.BytesIO(uploaded['year1.csv']))
year2 = pd.read_csv(io.BytesIO(uploaded['year2.csv']))
year3 = pd.read_csv(io.BytesIO(uploaded['year3.csv']))

Saving year1.csv to year1.csv
Saving year3.csv to year3.csv
Saving year2.csv to year2.csv


In [4]:
# Combine year1 and year2 data
merged = pd.concat([year1, year2], ignore_index=True)
merged.dropna(inplace=True)

# No forkballs in years 1 and 2, change them in year 3 to be splitters
year3.dropna(inplace=True)
year3['pitch_type'] = year3['pitch_type'].replace('FO', 'FS')


# Create 'swing' column based on conditions
swing_conditions = ['foul', 'hit_into_play', 'swinging_strike']
merged['swing'] = np.where(merged['description'].isin(swing_conditions), 1, 0)

# Define predictors for the model
predictors = ['release_speed', 'stand', 'p_throws', 'pitch_type', 'balls', 'strikes',
              'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'sz_top', 'sz_bot']

# Separate predictors and target variable
X = merged[predictors]
y = merged['swing']

# Define categorical columns for one-hot encoding
categorical_cols = ['stand', 'p_throws', 'pitch_type']

# Create a pipeline for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_cols)
    ],
    remainder='passthrough'
)

# Append logistic regression to the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', LogisticRegression())])

# Split data into training (80%) and test (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit the pipeline (preprocessing + logistic regression) on training data
pipeline.fit(X_train, y_train)

# Make predictions on test data
test_predictions = pipeline.predict(X_test)

# Make predictions on year3 data (validation set)
validation_predictions = pipeline.predict(year3)

# Append predicted probabilities to year3 dataframe
year3['SwingProbability'] = pipeline.predict_proba(year3)[:, 1]

# Save the dataframe with swing probabilities as CSV
year3.to_csv('year3_validation_with_probs.csv', index=False)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Calculate precision, recall, and F1 score
precision = precision_score(y_test, test_predictions)
recall = recall_score(y_test, test_predictions)
f1 = f1_score(y_test, test_predictions)

print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

Precision: 0.60
Recall: 0.51
F1 Score: 0.55


In [6]:
from sklearn.ensemble import RandomForestClassifier

# Clear probabilities
year3.drop('SwingProbability', axis=1, inplace=True)

# Append Random Forest Classifier to the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(random_state=42))])  # You can adjust parameters here

# Split data into training (80%) and test (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit the pipeline (preprocessing + Random Forest) on training data
pipeline.fit(X_train, y_train)

# Make predictions on test data
test_predictions = pipeline.predict(X_test)

# Make predictions on year3 data (validation set)
validation_predictions = pipeline.predict(year3)

# Append predicted probabilities to year3 dataframe
year3['SwingProbability'] = pipeline.predict_proba(year3)[:, 1]

# Save the dataframe with swing probabilities as CSV
year3.to_csv('year3_validation_with_probs_rf.csv', index=False)  # Changed the filename for Random Forest


In [7]:
# Calculate precision, recall, and F1 score
precision = precision_score(y_test, test_predictions)
recall = recall_score(y_test, test_predictions)
f1 = f1_score(y_test, test_predictions)

print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

Precision: 0.83
Recall: 0.81
F1 Score: 0.82
