# Uninstall existing versions
!pip uninstall -y dowhy networkx

# Install the latest versions
!pip install dowhy networkx

import pandas as pd
import numpy as np
import networkx as nx
from dowhy import CausalModel
import statsmodels.api as sm

In [None]:
!pip uninstall -y networkx

# 2. Install specific compatible version
!pip install networkx==2.8.8

# 3. Verify downgrade
import networkx as nx
print(f"NetworkX version: {nx.__version__}")

import pandas as pd
import numpy as np
!pip install dowhy
from dowhy import CausalModel
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')

Found existing installation: networkx 3.5
Uninstalling networkx-3.5:
  Successfully uninstalled networkx-3.5
Collecting networkx==2.8.8
  Downloading networkx-2.8.8-py3-none-any.whl.metadata (5.1 kB)
Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: networkx
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.5.3.2 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-cupti-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-cupti-cu12 12.5.82 which is incompatible.
torch 2.6.0+cu124 requires nvidia

In [None]:
# 1. Define the Causal Graph
# The graph is defined by listing the edges (parent, child)
# Based on the provided DAG image and OCR
#
# Every variable named here is also a column produced by
# generate_synthetic_data, so the graph and the data share one vocabulary.
#
# NOTE(review): generate_synthetic_data uses four dependencies that have no
# matching edge in this list: O_LINE -> QB, REC_YDS -> REC_TDS,
# YPC -> RUSH_YDS and RUSH_YDS -> RUSH_TDS. Either add the edges here or drop
# the terms from the generator so the assumed DAG matches the simulated data.
causal_graph_edges = [
    # Receiver usage: age / routes -> getting open -> targets
    ('REC_AGE', 'OPEN'),
    ('ROUTES_RUN', 'OPEN'),
    ('ROUTES_RUN', 'TARGETS'),
    ('OPEN', 'TARGETS'),
    ('ROUTES_RUN', 'FIRST_READ'),
    ('FIRST_READ', 'TARGETS'),
    # Offensive line and its blocking components
    ('O_LINE', 'PASS_BLK'),
    ('O_LINE', 'RUN_BLK'),
    ('O_LINE', 'OFFENSE'),
    ('PASS_BLK', 'QB'),
    ('PASS_BLK', 'ADOT'),
    # Quarterback and passing volume
    ('QB', 'PASS_ATT'),
    ('QB', 'OFFENSE'),
    ('QB', 'TARGETS'),
    ('QB', 'ADOT'),
    ('PASS_ATT', 'PASS_YDS'),
    ('PASS_ATT', 'PASS_TDS'),
    ('PASS_ATT', 'ROUTES_RUN'),
    # Team units -> score differential -> play-calling volume
    ('OFFENSE', 'SCORE_DIFF'),
    ('DEFENSE', 'SCORE_DIFF'),
    ('ST', 'SCORE_DIFF'),
    ('SCORE_DIFF', 'PASS_ATT'),
    ('SCORE_DIFF', 'RUSH_ATT'),
    # Receiving production and the fantasy outcomes it feeds
    ('TARGETS', 'REC'),
    ('TARGETS', 'ADOT'),
    ('REC', 'REC_YDS'),
    ('REC', 'REC_TDS'),
    ('REC', 'WR/TE_FPTS'),
    ('REC', 'RB_FPTS'),
    ('REC_YDS', 'WR/TE_FPTS'),
    ('REC_TDS', 'WR/TE_FPTS'),
    ('REC_YDS', 'RB_FPTS'),
    ('REC_TDS', 'RB_FPTS'),
    ('ADOT', 'REC_YDS'),
    # Rushing production and the fantasy outcomes it feeds
    ('RUSH_AGE', 'YPC'),
    ('YPC', 'RUSH_ATT'),
    ('RUN_BLK', 'RUSH_ATT'),
    ('RUN_BLK', 'YPC'),
    ('RUSH_ATT', 'RUSH_YDS'),
    ('RUSH_ATT', 'RUSH_TDS'),
    ('RUSH_YDS', 'RB_FPTS'),
    ('RUSH_TDS', 'RB_FPTS'),
    ('RUSH_YDS', 'WR/TE_FPTS'),
    ('RUSH_TDS', 'WR/TE_FPTS'),
    ('RUSH_YDS', 'QB_FPTS'),
    ('RUSH_TDS', 'QB_FPTS'),
    # Passing production -> quarterback fantasy points
    ('PASS_YDS', 'QB_FPTS'),
    ('PASS_TDS', 'QB_FPTS')
]

# Create a NetworkX DiGraph
# Nodes are created implicitly from the edge endpoints.
G = nx.DiGraph(causal_graph_edges)

In [None]:
# 2. Generate Synthetic Data
def generate_synthetic_data(num_samples=1000, seed=27):
    """Simulate a linear-Gaussian fantasy-football dataset.

    Every column is a linear combination of previously generated columns plus
    Gaussian noise, generated parents-first so each column only depends on
    columns that already exist. The three ``*_FPTS`` outcome columns are
    deterministic functions of the production columns (no extra noise).

    Randomness comes from a private ``numpy.random.RandomState`` rather than
    ``np.random.seed``, so calling this function does not reseed or advance
    the process-wide NumPy generator. With the default seed the values are
    identical to what ``np.random.seed(27)`` followed by the same
    ``np.random.normal`` calls would produce.

    Args:
        num_samples: Number of rows (simulated player-seasons) to generate.
        seed: Seed for the private generator. The default, 27, reproduces the
            historical dataset; pass ``None`` for a non-reproducible draw.

    Returns:
        A ``pandas.DataFrame`` with ``num_samples`` rows and one float column
        per simulated variable (28 columns), in generation order.
    """
    rng = np.random.RandomState(seed)

    def noise(loc, scale):
        """Draw one Gaussian column of length ``num_samples``."""
        return rng.normal(loc, scale, num_samples)

    data = pd.DataFrame()

    # NOTE: the order of the noise() calls below fixes the random stream;
    # reordering statements changes the generated values.

    # Root nodes (no parents)
    data['REC_AGE'] = np.clip(noise(25, 3), 21, None)  # Receiver age - can't be below 21
    data['O_LINE'] = noise(75, 10)  # Offensive line quality (e.g., PFF grade)
    data['RUSH_AGE'] = np.clip(noise(24, 2), 21, None)  # RB age - can't be below 21
    data['DEFENSE'] = noise(20, 5)  # Defensive efficiency score
    data['ST'] = noise(5, 2)  # Special Teams contribution

    # Blocking
    # Pass blocking depends almost entirely on the O-line; any RB/TE
    # contribution is treated as negligible.
    data['PASS_BLK'] = 0.85 * data['O_LINE'] + noise(0, 5)
    # Run blocking also depends partly on TEs and WRs, hence the lower weight.
    data['RUN_BLK'] = 0.7 * data['O_LINE'] + noise(0, 5)

    # QB, offense and score differential
    data['QB'] = 0.1 * data['O_LINE'] + 0.2 * data['PASS_BLK'] + noise(0, 10)  # QB quality/performance
    data['OFFENSE'] = 0.7 * data['QB'] + 0.1 * data['O_LINE'] + noise(25, 5)  # Offensive efficiency score
    data['SCORE_DIFF'] = (
        0.6 * data['OFFENSE'] - 0.5 * data['DEFENSE'] + 0.2 * data['ST'] + noise(0, 7)
    )  # Game score differential

    # Everything that feeds targets
    data['PASS_ATT'] = 0.6 * data['QB'] + 0.3 * data['SCORE_DIFF'] + noise(0, 15)  # Pass attempts
    data['ROUTES_RUN'] = 0.7 * data['PASS_ATT'] + noise(300, 50)  # Routes run per season
    data['OPEN'] = 0.3 * data['REC_AGE'] + 0.3 * data['ROUTES_RUN'] + noise(0, 5)  # Ability to get open
    data['FIRST_READ'] = 0.2 * data['ROUTES_RUN'] + noise(0.6, 0.1)  # First reads get targeted more
    data['TARGETS'] = (
        0.2 * data['ROUTES_RUN'] + 0.3 * data['OPEN'] + 0.2 * data['FIRST_READ']
        + 0.2 * data['QB'] + noise(0, 20)
    )  # Player targets
    data['REC'] = 0.7 * data['TARGETS'] + noise(0, 10)  # Receptions
    data['ADOT'] = (
        0.3 * data['TARGETS'] + 0.3 * data['QB'] + 0.3 * data['PASS_BLK'] + noise(0, 2)
    )  # Average Depth of Target

    # Receiving production
    data['REC_YDS'] = 0.9 * data['REC'] + 0.5 * data['ADOT'] + noise(0, 20)  # Receiving Yards
    data['REC_TDS'] = 0.1 * data['REC'] + 0.05 * data['REC_YDS'] + noise(0, 1)  # Receiving Touchdowns

    # Rushing
    data['YPC'] = 0.7 * data['RUSH_AGE'] + 0.5 * data['RUN_BLK'] + noise(0, 1.5)  # Yards per carry
    data['RUSH_ATT'] = (
        0.5 * data['YPC'] + 0.4 * data['RUN_BLK'] - 0.2 * data['SCORE_DIFF'] + noise(0, 10)
    )  # Rush attempts

    # Rushing production
    data['RUSH_YDS'] = 0.8 * data['RUSH_ATT'] + 0.6 * data['YPC'] + noise(0, 25)  # Rushing Yards
    data['RUSH_TDS'] = 0.1 * data['RUSH_ATT'] + 0.05 * data['RUSH_YDS'] + noise(0, 1)  # Rushing Touchdowns

    # Passing production (driven by pass attempts only)
    data['PASS_YDS'] = 0.9 * data['PASS_ATT'] + noise(0, 50)  # Passing Yards
    data['PASS_TDS'] = 0.1 * data['PASS_ATT'] + noise(0, 2)  # Passing Touchdowns

    # Fantasy Points (Outcomes)
    # Scoring: 1 point per 10 receiving/rushing yards, 6 points per
    # receiving/rushing TD, 4 points per passing TD, 1 point per 25 passing
    # yards, and 1 point per reception (full PPR).
    # NOTE(review): an earlier comment said "0.5 PPR" while the code has always
    # awarded 1 point per reception; the code is kept as-is - confirm intent.
    data['WR/TE_FPTS'] = (data['REC_YDS'] / 10) + (data['REC_TDS'] * 6) + (data['REC'] * 1)
    # RBs also get receiving points
    data['RB_FPTS'] = (
        (data['RUSH_YDS'] / 10) + (data['RUSH_TDS'] * 6) + (data['REC'] * 1)
        + (data['REC_YDS'] / 10) + (data['REC_TDS'] * 6)
    )
    # QBs also get rushing points
    data['QB_FPTS'] = (
        (data['PASS_YDS'] / 25) + (data['PASS_TDS'] * 4)
        + (data['RUSH_YDS'] / 10) + (data['RUSH_TDS'] * 6)
    )

    return data

In [None]:
# Build the 1000-row synthetic dataset, then show its first rows and the
# per-column summary statistics.
sample_data = generate_synthetic_data(num_samples=1000)
print("Sample Data Head:", sample_data.head(), sep="\n")
print("\nSample Data Description:", sample_data.describe(), sep="\n")

Sample Data Head:
     REC_AGE     O_LINE   RUSH_AGE    DEFENSE        ST   PASS_BLK    RUN_BLK  \
0  28.856816  76.027941  22.807111  20.806464  4.831909  64.599976  53.133688   
1  24.089340  85.709214  25.558958  23.500955  7.483635  72.495699  57.130675   
2  26.857227  75.503634  23.264869  15.294023  6.200368  69.183960  62.180602   
3  26.187996  83.498318  24.964485  15.145495  6.430333  74.448858  56.041106   
4  25.670217  80.425170  24.246238  20.562220  4.270015  63.216158  49.564386   

          QB    OFFENSE  SCORE_DIFF  ...    REC_TDS        YPC   RUSH_ATT  \
0  35.899655  54.970145   14.017904  ...   8.945955  43.452788  43.785197   
1  25.141855  43.566063    8.470577  ...  10.647949  45.096587  40.236296   
2  18.232314  49.453363   25.568432  ...  13.218998  48.253737  47.662130   
3  29.008834  51.937236   34.503491  ...  14.016065  46.733464  63.729032   
4  10.743446  40.656442   10.257548  ...  11.528714  41.326856  45.627809   

    RUSH_YDS   RUSH_TDS   PASS_Y

In [None]:
# 3. Causal Effect Estimation for WR/TE_FPTS
# Goal: estimate the causal effect of 'TARGETS' on 'WR/TE_FPTS'.

# Serialise the NetworkX graph as a DOT string for DoWhy. Each edge becomes
# one `"parent" -> "child";` statement with both node names double-quoted.
dot_graph = 'digraph {' + ''.join(f'"{u}" -> "{v}";' for u, v in G.edges()) + '}'

# Bind the data, the assumed graph, and the treatment/outcome pair together.
model_wr_te = CausalModel(
    data=sample_data,
    graph=dot_graph,
    treatment='TARGETS',
    outcome='WR/TE_FPTS',
)

# Identify the causal estimand from the graph.
identified_estimand_wr_te = model_wr_te.identify_effect(proceed_when_unidentifiable=True)
print("\nIdentified Estimand for TARGETS -> WR/TE_FPTS:", identified_estimand_wr_te, sep="\n")

# Estimate the effect with a backdoor-adjusted linear regression.
# control_value / treatment_value are set to the observed minimum and maximum
# of TARGETS, so the reported number contrasts those two treatment levels.
# NOTE(review): presumably this is therefore a min-to-max effect, not a
# per-target one - confirm against the DoWhy docs.
causal_estimate_wr_te = model_wr_te.estimate_effect(
    identified_estimand_wr_te,
    method_name="backdoor.linear_regression",
    control_value=sample_data['TARGETS'].min(),
    treatment_value=sample_data['TARGETS'].max(),
)
print("\nCausal Estimate (TARGETS on WR/TE_FPTS):", causal_estimate_wr_te, sep="\n")
print(f"Estimated Causal Effect (ATE): {causal_estimate_wr_te.value}")

ERROR:dowhy.causal_graph:Error: Pygraphviz cannot be loaded. No module named 'pygraphviz'
Trying pydot ...



Identified Estimand for TARGETS -> WR/TE_FPTS:
Estimand type: EstimandType.NONPARAMETRIC_ATE

### Estimand : 1
Estimand name: backdoor
Estimand expression:
    d                                  
──────────(E[WR/TE_FPTS|QB,SCORE_DIFF])
d[TARGETS]                             
Estimand assumption 1, Unconfoundedness: If U→{TARGETS} and U→WR/TE_FPTS then P(WR/TE_FPTS|TARGETS,QB,SCORE_DIFF,U) = P(WR/TE_FPTS|TARGETS,QB,SCORE_DIFF)

### Estimand : 2
Estimand name: iv
No such variable(s) found!

### Estimand : 3
Estimand name: frontdoor
No such variable(s) found!


Causal Estimate (TARGETS on WR/TE_FPTS):
*** Causal Estimate ***

## Identified estimand
Estimand type: EstimandType.NONPARAMETRIC_ATE

### Estimand : 1
Estimand name: backdoor
Estimand expression:
    d                                  
──────────(E[WR/TE_FPTS|QB,SCORE_DIFF])
d[TARGETS]                             
Estimand assumption 1, Unconfoundedness: If U→{TARGETS} and U→WR/TE_FPTS then P(WR/TE_FPTS|TARGETS,QB,SCORE_DIFF,U) 

In [None]:
# 4. Prediction with Causal Insights
# To predict the actual fantasy points, we train a predictive model using
# variables that causally influence the outcome in the DAG. A simple OLS
# linear regression is used for demonstration.

# Features are upstream of WR/TE_FPTS in the DAG. REC_YDS, REC_TDS and REC are
# left out: in the synthetic data WR/TE_FPTS is computed directly from those
# three columns, so including them would make the fit exact.
features_wr_te = ['TARGETS', 'ADOT', 'REC_AGE', 'ROUTES_RUN', 'OPEN', 'FIRST_READ']
target_wr_te = 'WR/TE_FPTS'

# Prepare data for prediction model
X_wr_te = sample_data[features_wr_te]
y_wr_te = sample_data[target_wr_te]

# Add a constant for the intercept in statsmodels
X_wr_te = sm.add_constant(X_wr_te)

# Train the linear regression model
prediction_model_wr_te = sm.OLS(y_wr_te, X_wr_te).fit()
print("\nPrediction Model Summary (WR/TE_FPTS):")
print(prediction_model_wr_te.summary())

# Demo prediction for one hypothetical player
sample_player_data_raw = pd.DataFrame({
    'TARGETS': [100],
    'ADOT': [10.5],
    'REC_AGE': [26],
    'ROUTES_RUN': [450],
    'OPEN': [200],
    'FIRST_READ': [50]
})

# Add the intercept column explicitly instead of calling sm.add_constant: the
# frame has a single row, so every column looks constant and add_constant with
# its default has_constant='skip' would not add one. Using .assign returns a new
# frame, which avoids writing into a column-subset of sample_player_data_raw.
sample_player_data = sample_player_data_raw[features_wr_te].assign(const=1.0)

# Reorder the columns to match the training data (X_wr_te)
sample_player_data_final = sample_player_data[X_wr_te.columns]

# Predict; take the first row by position so this does not depend on the
# frame's index labels.
predicted_fpts = prediction_model_wr_te.predict(sample_player_data_final)
print(f"\nPredicted WR/TE_FPTS for sample player: {predicted_fpts.iloc[0]:.2f}")


Prediction Model Summary (WR/TE_FPTS):
                            OLS Regression Results                            
Dep. Variable:             WR/TE_FPTS   R-squared:                       0.778
Model:                            OLS   Adj. R-squared:                  0.777
Method:                 Least Squares   F-statistic:                     579.7
Date:                Tue, 24 Jun 2025   Prob (F-statistic):          2.69e-320
Time:                        15:41:28   Log-Likelihood:                -4502.2
No. Observations:                1000   AIC:                             9018.
Df Residuals:                     993   BIC:                             9053.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       

In [None]:
# How to use with real-world data:
# 1. Load your real-world data into a pandas DataFrame.
#    Example: real_data = pd.read_csv('your_fantasy_data.csv')
# 2. Ensure your column names match the nodes in the DAG (e.g., 'REC_AGE', 'TARGETS', 'WR/TE_FPTS').
# 3. You can then use the trained `prediction_model_wr_te` (or similar models for RB/QB)
#    to predict fantasy points for new players, ensuring their data has the same features.
#    Pass has_constant='add' to add_constant: with the default ('skip') no intercept
#    column is added when the frame has a single row, because every column then looks constant.
#    Example: new_player_real_data = pd.DataFrame(...)
#             new_player_real_data = sm.add_constant(new_player_real_data[features_wr_te], has_constant='add')
#             real_predicted_fpts = prediction_model_wr_te.predict(new_player_real_data)

# Example for RB_FPTS prediction: OLS on rushing and receiving columns.
# NOTE(review): in the synthetic data RB_FPTS is computed exactly from RUSH_YDS,
# RUSH_TDS, REC, REC_YDS and REC_TDS, all of which are features here, so this
# fit reproduces the scoring formula rather than forecasting anything.
features_rb = ['RUSH_YDS', 'RUSH_TDS', 'RUSH_ATT', 'YPC', 'RUN_BLK', 'RUSH_AGE', 'REC', 'REC_YDS', 'REC_TDS'] # Including receiving for RBs
target_rb = 'RB_FPTS'
y_rb = sample_data[target_rb]
# Design matrix: the selected features plus an intercept column.
X_rb = sm.add_constant(sample_data[features_rb])
prediction_model_rb = sm.OLS(endog=y_rb, exog=X_rb).fit()
print("\nPrediction Model Summary (RB_FPTS):", prediction_model_rb.summary(), sep="\n")

# Example for QB_FPTS prediction: OLS on passing and rushing columns.
# NOTE(review): in the synthetic data QB_FPTS is computed exactly from PASS_YDS,
# PASS_TDS, RUSH_YDS and RUSH_TDS, all of which are features here, so this fit
# reproduces the scoring formula rather than forecasting anything.
features_qb = ['PASS_YDS', 'PASS_TDS', 'PASS_ATT', 'QB', 'RUSH_YDS', 'RUSH_TDS'] # Including rushing for QBs
target_qb = 'QB_FPTS'
y_qb = sample_data[target_qb]
# Design matrix: the selected features plus an intercept column.
X_qb = sm.add_constant(sample_data[features_qb])
prediction_model_qb = sm.OLS(endog=y_qb, exog=X_qb).fit()
print("\nPrediction Model Summary (QB_FPTS):", prediction_model_qb.summary(), sep="\n")
