# Double Machine Learning (DML): Getting Started
[Source](https://docs.doubleml.org/stable/intro/intro.html)

In [2]:
!pip install doubleml -q

In [3]:
import numpy as np
from doubleml.datasets import fetch_bonus

In [4]:
# Fetch the bonus dataset as a pandas DataFrame and preview its first rows.
df_bonus = fetch_bonus(return_type='DataFrame')
print(df_bonus.head())

   index   abdt  tg  inuidur1  inuidur2  female  black  hispanic  othrace  \
0      0  10824   0  2.890372        18       0      0         0        0   
1      3  10824   0  0.000000         1       0      0         0        0   
2      4  10747   0  3.295837        27       0      0         0        0   
3     11  10607   1  2.197225         9       0      0         0        0   
4     12  10831   0  3.295837        27       0      0         0        0   

   dep  ...  recall  agelt35  agegt54  durable  nondurable  lusd  husd  muld  \
0    2  ...       0        0        0        0           0     0     1     0   
1    0  ...       0        0        0        0           0     1     0     0   
2    0  ...       0        0        0        0           0     1     0     0   
3    0  ...       0        1        0        0           0     0     0     1   
4    1  ...       0        0        1        1           0     1     0     0   

   dep1  dep2  
0   0.0   1.0  
1   0.0   0.0  
2   0.0 

In [6]:
from doubleml import DoubleMLData

# Wrap the DataFrame in a DoubleMLData container, declaring which column is
# the outcome, which is the treatment, and which serve as covariates.
dml_data_bonus = DoubleMLData(
    data=df_bonus,
    y_col='inuidur1',
    d_cols='tg',
    x_cols=[
        'female',
        'black',
        'othrace',
        'dep1',
        'dep2',
        'q2',
        'q3',
        'q4',
        'q5',
        'q6',
        'agelt35',
        'agegt54',
        'durable',
        'lusd',
        'husd',
    ],
)

print(dml_data_bonus)


------------------ Data summary      ------------------
Outcome variable: inuidur1
Treatment variable(s): ['tg']
Covariates: ['female', 'black', 'othrace', 'dep1', 'dep2', 'q2', 'q3', 'q4', 'q5', 'q6', 'agelt35', 'agegt54', 'durable', 'lusd', 'husd']
Instrument variable(s): None
No. Observations: 5099

------------------ DataFrame info    ------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5099 entries, 0 to 5098
Columns: 26 entries, index to dep2
dtypes: float64(3), int64(23)
memory usage: 1.0 MB



In [9]:
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV

# A single random-forest template; the two nuisance learners (ml_l, ml_m)
# are independent clones of it with identical hyperparameters.
learner = RandomForestRegressor(
    max_depth=5,
    max_features='sqrt',
    n_estimators=500,
)
ml_l_bonus, ml_m_bonus = clone(learner), clone(learner)

In [10]:
from doubleml import DoubleMLPLR

# Fix NumPy's global seed before the model is constructed and fitted, then
# build the partially linear regression model and print its summary.
np.random.seed(3141)
obj_dml_plr_bonus = DoubleMLPLR(
    obj_dml_data=dml_data_bonus,
    ml_l=ml_l_bonus,
    ml_m=ml_m_bonus,
)
obj_dml_plr_bonus.fit()
print(obj_dml_plr_bonus)


------------------ Data summary      ------------------
Outcome variable: inuidur1
Treatment variable(s): ['tg']
Covariates: ['female', 'black', 'othrace', 'dep1', 'dep2', 'q2', 'q3', 'q4', 'q5', 'q6', 'agelt35', 'agegt54', 'durable', 'lusd', 'husd']
Instrument variable(s): None
No. Observations: 5099

------------------ Score & algorithm ------------------
Score function: partialling out

------------------ Machine learner   ------------------
Learner ml_l: RandomForestRegressor(max_depth=5, max_features='sqrt', n_estimators=500)
Learner ml_m: RandomForestRegressor(max_depth=5, max_features='sqrt', n_estimators=500)
Out-of-sample Performance:
Regression:
Learner ml_l RMSE: [[1.20030664]]
Learner ml_m RMSE: [[0.47419634]]

------------------ Resampling        ------------------
No. folds: 5
No. repeated sample splits: 1

------------------ Fit summary       ------------------
        coef   std err         t     P>|t|     2.5 %    97.5 %
tg -0.076686  0.035411 -2.165608  0.030341 -0.1