# Demo of scikit-learn models

In [None]:
import pandas as pd
import arviz as az
import matplotlib.pyplot as plt

In [None]:
# Ask the inline plotting backend to render figures in 'retina' format
%config InlineBackend.figure_format = 'retina'
# Apply the ArviZ "arviz-darkgrid" style to all subsequent plots
az.style.use("arviz-darkgrid")

In [None]:
# Load IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [None]:
# Set to True to also write the figures produced below to PNG files under ../img/
export_images = False

## Synthetic control

In [None]:
from causalpy.simulate_data import generate_synthetic_control_data

# Simulate a synthetic control dataset with the intervention at `treatment_time`.
# The second return value is presumably the set of true weightings used to
# generate the data -- TODO confirm against `generate_synthetic_control_data`.
treatment_time = 70
df, weightings_true = generate_synthetic_control_data(treatment_time=treatment_time)

First we use a common synthetic control model in which the regression coefficients (weights) are constrained to lie in the interval [0, 1] and to sum to 1. This is implemented by the custom `WeightedProportion` class.

In [None]:
from causalpy.skl_models import WeightedProportion
from causalpy.skl_experiments import SyntheticControl

# The formula starts with `0 +` because we do not want an intercept in this model
thing = SyntheticControl(
    df,
    treatment_time,
    formula="actual ~ 0 + a + b + c + d + e + f + g",
    prediction_model=WeightedProportion(),
)

fig, ax = thing.plot()

In [None]:
# Plot the coefficients estimated by the model fitted above
thing.plot_coeffs()

However, we can see that (for this dataset) these estimates are quite poor. We can therefore relax the "sum to 1" constraint and instead use the `LinearRegression` model, while still constraining the weights to be non-negative. You could equally experiment with the `Ridge` model (e.g. `Ridge(positive=True, alpha=100)`).

In [None]:
from causalpy.skl_experiments import SyntheticControl
from sklearn.linear_model import LinearRegression

# Fit the synthetic control with weights constrained to be positive only.
# Note, we do not want an intercept in this model, hence the `0 +` in the formula.
thing = SyntheticControl(
    df,
    treatment_time,
    formula="actual ~ 0 + a + b + c + d + e + f + g",
    prediction_model=LinearRegression(positive=True),
)

fig, ax = thing.plot()

if export_images:
    # Save under a synthetic-control-specific file name. This cell previously
    # wrote to '../img/interrupted_time_series_skl.png', the same path the
    # interrupted time series section saves to, so one image silently
    # overwrote the other.
    plt.savefig(
        "../img/synthetic_control_skl.png",
        bbox_inches="tight",
        dpi=300,
        facecolor="white",
    )

In [None]:
# Plot the coefficients estimated by the positive-constrained linear regression
thing.plot_coeffs()

## Interrupted time series

In [None]:
# # Generate data
# from causalpy.simulate_data import generate_time_series_data

# df = generate_time_series_data("2017-01-01")
# df = df.loc[:, ['month', 'year', 't', 'y']]
# df.to_csv("../causalpy/data/its.csv")

In [None]:
# Read the interrupted time series data, parse the `date` column as datetimes
# and use it as the index
df = pd.read_csv("../causalpy/data/its.csv", parse_dates=["date"]).set_index("date")

# Point in time at which the intervention happens
treatment_time = pd.to_datetime("2017-01-01")

In [None]:
from causalpy.skl_experiments import InterruptedTimeSeries
from sklearn.linear_model import LinearRegression

# Model `y` with an intercept, the `t` term and a categorical month term
thing = InterruptedTimeSeries(
    df,
    treatment_time,
    formula="y ~ 1 + t + C(month)",
    prediction_model=LinearRegression(),
)

fig, ax = thing.plot()

# Optionally write the figure to disk
if export_images:
    plt.savefig(
        "../img/interrupted_time_series_skl.png",
        bbox_inches="tight",
        dpi=300,
        facecolor="white",
    )

## Difference in Differences

In [None]:
# Load the difference in differences example data
data = pd.read_csv("../causalpy/data/did.csv")

In [None]:
from causalpy.skl_experiments import DifferenceInDifferences
from sklearn.linear_model import LinearRegression

# NOTE: `treated` is a deterministic function of `t` and `group`. So add this function into the formula.
# In the formula below, `treated` only appears in the `treated:group` interaction term.
thing = DifferenceInDifferences(
    data,
    formula="y ~ 1 + group + t + treated:group",
    prediction_model=LinearRegression(),
)
fig, ax = thing.plot()

# Optionally write the figure to disk
if export_images:
    plt.savefig(
        "../img/difference_in_differences_skl.png",
        bbox_inches="tight",
        dpi=300,
        facecolor="white",
    )

## Regression discontinuity

In [None]:
# # Generate data
# from causalpy.simulate_data import generate_regression_discontinuity_data
# df = generate_regression_discontinuity_data(true_treatment_threshold=0.5)
# df.to_csv('../causalpy/data/regression_discontinuity.csv', index=False)

In [None]:
# Load the regression discontinuity example data from CSV
data = pd.read_csv('../causalpy/data/regression_discontinuity.csv')

In [None]:
from causalpy.skl_experiments import RegressionDiscontinuity
from sklearn.linear_model import LinearRegression

# Linear model with an intercept, `x` and `treated` as predictors
thing = RegressionDiscontinuity(
    data,
    formula="y ~ 1 + x + treated",
    prediction_model=LinearRegression(),
    treatment_threshold=0.5,
)
# The trailing semicolon suppresses the notebook's echo of the return value
thing.plot();

In [None]:
# Same linear model, now with an additional `x:treated` interaction term
thing = RegressionDiscontinuity(
    data,
    formula="y ~ 1 + x + treated + x:treated",
    prediction_model=LinearRegression(),
    treatment_threshold=0.5,
)
# The trailing semicolon suppresses the notebook's echo of the return value
thing.plot();

In [None]:
from causalpy.skl_experiments import RegressionDiscontinuity
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, ExpSineSquared

# Scaled ExpSineSquared kernel plus a white-noise kernel
kernel = 1.0 * ExpSineSquared(length_scale=1.0, periodicity=5.0) + WhiteKernel(
    noise_level=1e-1
)

# Same formula as the first linear fit, but with a Gaussian process regressor
thing = RegressionDiscontinuity(
    data,
    formula="y ~ 1 + x + treated",
    prediction_model=GaussianProcessRegressor(kernel=kernel),
    treatment_threshold=0.5,
)
fig, ax = thing.plot()

# Optionally write the figure to disk
if export_images:
    plt.savefig(
        "../img/regression_discontinuity_skl.png",
        bbox_inches="tight",
        dpi=300,
        facecolor="white",
    )