In [4]:
'''
Here I try a Linear Regression model because I want to predict how many fantasy points a player will score.
Because the target is a number that can take any value rather than a fixed set of categories, linear regression makes the most sense.
'''




import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from itertools import combinations
from tabulate import tabulate

# Location of the combined team/player dataset on disk.
csv_file_path = r"C:\Users\sulli\OneDrive\Documents\last_season_all_team_and_player_data.csv"

# Raw frame exactly as read from the CSV; it keeps every column, including 'season'.
last_season_all_team_data = pd.read_csv(csv_file_path)

# Columns removed before modelling.
fields_to_drop = [
    'player_id',
    'current_fantasy_points_ppr',
    'season',
    'player_name',
    'game_type',
    'first_name',
    'last_name',
    'entry_year',
    'rookie_year',
]

# Modelling frame: the raw data minus the dropped columns (the raw frame is left untouched).
data = last_season_all_team_data.drop(columns=fields_to_drop)

# One-hot encode the remaining string columns.
encoded_data = pd.get_dummies(data)

# Target is the season's fantasy points; features are every other encoded column.
target = data['current_fantasy_points']
features = encoded_data.drop(columns='current_fantasy_points')

# Correlation of each feature with the target (one value per feature),
# plus a copy ordered from most positive to most negative.
correlation_matrix = features.corrwith(target)
sorted_correlations = correlation_matrix.sort_values(ascending=False)

# Random forest fitted on the full dataset, used here only for its feature importances.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(features, target)
rf_feature_importances = rf.feature_importances_

# Gradient boosting fitted the same way, as a second opinion on feature importance.
gbm = GradientBoostingRegressor(n_estimators=100, random_state=42)
gbm.fit(features, target)
gbm_feature_importances = gbm.feature_importances_

'''Select the features for linear regression. I selected these fields because they cover the player's stats,
information about the players themselves, and information about the teams they played against. Many of the fantasy_points-related
fields show high importance, but they overlap with each other, so I tried to choose only one.
Also, the past-3-seasons fields showed more importance than the last-season fields, which makes sense: the more data you have, the
easier it is to predict. However, last_season_targets, for whatever reason, ranked higher than its 3-season counterpart, so I used
that instead.
'''


# Candidate inputs for the baseline linear regression.
selected_features = [
    'Total Pro bowls defenders faced in past 3 years',
    'last_season_total_yards_mean',
    'last_season_fantasy_points_mean',
    'last_season_fantasy_points_sum',
    'Total pro bowls offensive teammates past 3 years',
    'last_season_rushing_yards_mean',
    'last_2_seasons_fantasy_points_mean',
    'last_2_seasons_fantasy_points_ppr_mean',
    'last_3_seasons_fantasy_points_ppr_mean',
    'last_season_fantasy_points_ppr_sum',
    'last_2_seasons_total_yards_over_100_mean',
    'last_3_seasons_total_yards_over_100_mean',
    'Total Pro bowls defensive linemen faced in past 3 years',
    'last_3_seasons_fantasy_points_mean',
    'last_weeks_of_last_season_fantasy_points_ppr_sum',
    'last_3_seasons_total_yards_mean',
    'last_2_seasons_total_yards_mean',
    'last_weeks_of_last_season_fantasy_points_sum',
    'draft_number',
    'last_weeks_of_last_season_total_yards_mean',
    'Total Pro bowls defenders faced in past year',
    'Amount of Pro Bowls in Last 3 Years',
    'last_weeks_of_last_season_fantasy_points_median',
    'Total Pro bowls offensive linemen teammates in past 3 years',
    'round',
    'weight',
    'Avg Pro bowls defenders faced in past 3 years'
]

# Inputs and target for the linear model.
X = data[selected_features]
y = data['current_fantasy_points']

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the baseline model and predict the held-out rows.
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
y_pred = linear_reg.predict(X_test)

# Baseline scores on the held-out rows (computed here but not printed).
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# One row per encoded feature: its correlation with the target and both tree-model importances.
results_df = pd.DataFrame({
    'Feature': features.columns,
    'Correlation': correlation_matrix,
    'RF Feature Importance': rf_feature_importances,
    'GBM Feature Importance': gbm_feature_importances
})

# Rank the features by random-forest importance, most important first.
results_df_sorted = results_df.sort_values(by='RF Feature Importance', ascending=False)

# Show the ten features the random forest considers most important.
print('Here are what features seem the most important ')
print(results_df_sorted.head(10))


# Evaluate feature subsets across several train/test splits to find the subset
# and split settings that give the best linear-regression fit.
#
# NOTE(review): this search is exhaustive. With the 27 entries currently in
# `selected_features` and subset sizes 10..27 it enumerates about 1.26e8
# subsets per split (about 1.1e9 model fits over the 9 splits), and every one
# of them is kept as a row in `results`. Raise MIN_FEATURES_PER_COMBINATION or
# shorten `selected_features` before running this for real.
MIN_FEATURES_PER_COMBINATION = 10

results = []

# Loop through different test sizes
for test_size in [0.2, 0.3, 0.4]:
    # Loop through different random states
    for random_state in [42, 10, 100]:
        # One split per (test_size, random_state); every subset below is scored on the same rows.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        # Loop through different feature combinations
        for i in range(MIN_FEATURES_PER_COMBINATION, len(selected_features) + 1):
            for combination in combinations(selected_features, i):
                selected_features_comb = list(combination)

                # Restrict both halves of the split to this subset's columns.
                # Fitting on the full X_train here would give every subset
                # identical scores and make the comparison meaningless.
                X_train_comb = X_train[selected_features_comb]
                X_test_comb = X_test[selected_features_comb]

                # Fit on the training rows and predict the held-out rows
                linear_reg.fit(X_train_comb, y_train)
                y_pred_comb = linear_reg.predict(X_test_comb)

                # Calculate evaluation metrics
                mse_comb = mean_squared_error(y_test, y_pred_comb)
                r2_comb = r2_score(y_test, y_pred_comb)
                mae_comb = mean_absolute_error(y_test, y_pred_comb)

                results.append({
                    'Test Size': test_size,
                    'Random State': random_state,
                    'Features': ', '.join(selected_features_comb),
                    'MSE': mse_comb,
                    'R-squared': r2_comb,
                    'MAE': mae_comb
                })

# Create a DataFrame from the results
results_comb_df = pd.DataFrame(results)

output_file_path = r"C:\Users\sulli\OneDrive\Documents\feature_combinations_results.csv"

results_comb_df.to_csv(output_file_path, index=False)

# Sort the results based on R-squared values in descending order
sorted_results_df = results_comb_df.sort_values(by='R-squared', ascending=False)

# Row with the highest R-squared
best_result = sorted_results_df.iloc[0]

# Display the best combination in a table. This tells me the ideal combination to use
print("Best combination of features based on R-squared:")
print(tabulate([best_result], headers='keys', tablefmt='pretty'))

# Save the best result as a single labelled row. Writing the Series directly
# with index=False would drop the labels and leave only a column of bare values.
best_result.to_frame().T.to_csv(r"C:\Users\sulli\OneDrive\Documents\best_feature_combination.csv", index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\sulli\\OneDrive\\Documents\\last_season_all_team_and_player_data.csv'

In [None]:
# Final linear-regression model: fit on the chosen features, score it on a
# held-out split, then compare its predictions with the actual 2023 results.

# Select the features
selected_features = ['Total Pro bowls defenders faced in past 3 years',
                     'last_3_seasons_rushing_tds_sum',
                     'Total pro bowls offensive teammates past 3 years',
                     'last_3_seasons_rushing_yards_sum', 'draft_number',
                     'Pro Bowl Count in Past 3 Years', 'last_season_targets_sum',
                     'last_3_seasons_fantasy_points_mean', 'last_3_seasons_rushing_carries_sum',
                     'Total pro bowls offensive line teammates past 3 years', 'years_exp'
]

# Select features and target variable
X = data[selected_features]
y = data['current_fantasy_points']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=100)

# Initialize Linear Regression model
linear_reg = LinearRegression()

# Fit the model to the training data
linear_reg.fit(X_train, y_train)

# Predict the held-out rows; negative predictions are raised to 0.
y_pred = linear_reg.predict(X_test).clip(min=0)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)
print("R-squared:", r2)

# Calculate Mean Absolute Error
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

# Filter the dataset to include only the 2023 season
# NOTE(review): `data` is built from this same frame with all seasons included,
# so any 2023 rows that landed in the training split above are scored in-sample
# here. The figures below are not a pure out-of-sample error.
data_2023 = last_season_all_team_data[last_season_all_team_data['season'] == 2023]

# Select only the desired data points
selected_columns = selected_features + ['current_fantasy_points']

# Take an explicit copy so the new columns are added to an independent frame,
# not to a slice of the raw data (avoids pandas' SettingWithCopyWarning).
data_2023_selected = data_2023[selected_columns].copy()

# Predict the 2023 rows; negative predictions are raised to 0.
data_2023_selected['predicted_fantasy_points'] = linear_reg.predict(
    data_2023_selected[selected_features]
).clip(min=0)

# Absolute error between actual and predicted season totals
data_2023_selected['difference'] = (
    data_2023_selected['current_fantasy_points'] - data_2023_selected['predicted_fantasy_points']
).abs()

# Divisor used to turn the season-level error into a per-game figure.
GAMES_PER_SEASON = 17

# Calculate the desired statistics
num_rows = len(data_2023_selected)
average_difference = data_2023_selected['difference'].mean()
average_difference_per_game = average_difference / GAMES_PER_SEASON

# Print the results in a table
table_data = [
    ["Number of Running Backs", num_rows],
    ["Average Difference", average_difference],
    ["Average Difference per Game", average_difference_per_game]
]

print(tabulate(table_data, headers=["Metric", "Value"], tablefmt="pretty"))

# Save the results to a CSV file
output_file_path = r"C:\Users\sulli\OneDrive\Documents\data_2023_selected.csv"
data_2023_selected.to_csv(output_file_path, index=False)

'''
So the average difference between predicted and actual is 22.94, but if you divide that by the number of games per season, which
is 17, you get 1.35. That means I am off by only about 1.35 points per game, which I am happy about because of how close it is.
When Yahoo or ESPN predict how running backs will do, they are usually off by an average of about 2-5 points per week. So
the fact that I was able to build something for running backs that could rival them is an achievement. This shows I can potentially
use this for 2024 to win fantasy or, even better, take it to a business like DraftKings, FanDuel, or even ESPN
and show them that the model works based on the data collected.  This also showed me that adding
pro bowlers for defense turned out better than I imagined, considering it was a top-two feature. I thought it would be important,
but not this important. When I looked at the 2023 data more closely, I saw that I was not far off for rookies either,
which had been a worry because they have essentially no past data to look at.  It shows that being a high draft pick in the first
round matters.

In conclusion, this shows my model has potential, but there are more specifics I would love to look at, such as how
fast a RB is or how good the QB they are working with is.  However, I feel I have shown that I made a model that
can compete with the likes of ESPN. For that, I am happy. The next step is figuring out how to project, on a week-to-week basis, how
well someone will do.
'''