In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
import seaborn as sns
from collections import defaultdict
import logging
logging.basicConfig()

from diamond.glms.logistic import LogisticRegression

%matplotlib inline

This dataset is from the UCI Machine Learning Repository https://archive.ics.uci.edu/ml/datasets/Soybean+(Small)

Origin: 

Michalski,R.S. 
"Learning by being told and learning from examples: an experimental comparison of the two methods of knowledge acquisition in the context of developing an expert system for soybean disease diagnosis", 
International Journal of Policy Analysis and Information Systems, 1980, 4(2), 125-161. 

The example here is not intended to be statistically useful, but rather just to show the nuts and bolts of fitting a model using Diamond.

# Load the data and the covariance structure

In [2]:
# Read the soybean data from a CSV file located relative to the working directory.
df = pd.read_csv('soybean_small.csv')
# Preview the first rows rather than displaying the whole frame.
df.head()

Unnamed: 0,date,plant_stand,precip,temp,hail,crop_hist,area_damaged,severity,seed_tmt,germination,...,sclerotia,fruit_pods,fruit spots,seed,mold_growth,seed_discolor,seed_size,shriveling,roots,NA
0,4,0,2,1,1,1,0,1,0,2,...,0,0,4,0,0,0,0,0,0,D1
1,5,0,2,1,0,3,1,1,1,2,...,0,0,4,0,0,0,0,0,0,D1
2,3,0,2,1,0,2,0,2,1,1,...,0,0,4,0,0,0,0,0,0,D1
3,6,0,2,1,0,1,1,1,0,0,...,0,0,4,0,0,0,0,0,0,D1
4,4,0,2,1,0,3,0,2,0,2,...,0,0,4,0,0,0,0,0,0,D1


Note that the covariance structure, here referred to as the priors, is only for the random effect components. The fixed effects are not regularized in Diamond. The priors data frame has the following columns:
  - group: the grouping factor (in this case "leaves") that the variables belong to
  - var1: variable in the model which belongs to grouping factor `group`
  - var2: a second variable in the same group, used for covariance terms. Set to NaN when the row gives the variance of var1
  - vcov: (co)variance between var1 and var2
  - sdcor: square root of vcov (optional - not used in Diamond)

In [3]:
# Read the (co)variance table for the random-effect terms; its columns are
# described in the markdown cell above.
priors = pd.read_csv("soybean_small_priors.csv")
# Preview the first rows.
priors.head()

Unnamed: 0,group,var1,var2,vcov,sdcor
0,leaves,intercept,,0.012599,0.112246
1,leaves,precip,,0.049492,0.222468


# Fit the model

In [4]:
# Build the Diamond logistic-regression model from the training frame and the
# random-effect priors loaded above.
model = LogisticRegression(df, priors)

In [5]:
# Model formula: `fruiting_bodies` is the response, `1 + precip` are the fixed
# effects, and `(1 + precip | leaves)` asks for an intercept and a precip term
# per level of `leaves`.
# NOTE(review): this reads as lme4-style syntax -- confirm against Diamond's docs.
formula = 'fruiting_bodies ~ 1 + precip + (1 + precip | leaves)'

In [6]:
# Fit the model. With verbose=True the solver's progress is emitted as INFO log
# records (see the output below).
# NOTE(review): `tol` is presumably the convergence tolerance on the relative
# coefficient change reported in the log -- confirm against Diamond's docs.
effects = model.fit(formula, tol=1e-4, verbose=True)

INFO:diamond.glms.glm:creating main design matrix
INFO:diamond.glms.glm:creating leaves design matrix
INFO:diamond.glms.glm:creating covariance matrix
INFO:diamond.glms.logistic:creating Hessians
INFO:diamond.glms.logistic:creating H_inter for leaves
INFO:diamond.glms.logistic:time elapsed: 0.0
INFO:diamond.glms.logistic:blocks inverted: 0 of 2
INFO:diamond.glms.logistic:creating H_invs
INFO:diamond.solvers.diamond_logistic:loss: 477.033641 penalty: 1.901035 seconds elapsed 0
INFO:diamond.solvers.diamond_logistic:iteration: 0 relative coef change: 1.000000 obj: 478.934676
INFO:diamond.solvers.diamond_logistic:loss: 472.582817 penalty: 1.320383 seconds elapsed 0
INFO:diamond.solvers.diamond_logistic:iteration: 1 relative coef change: 0.356817 obj: 473.903200
INFO:diamond.solvers.diamond_logistic:loss: 469.269012 penalty: 1.155364 seconds elapsed 0
INFO:diamond.solvers.diamond_logistic:iteration: 2 relative coef change: 0.262381 obj: 470.424376
INFO:diamond.solvers.diamond_logistic:loss:

INFO:diamond.solvers.diamond_logistic:loss: 676.699315 penalty: 1.331745 seconds elapsed 0
INFO:diamond.solvers.diamond_logistic:iteration: 41 relative coef change: 0.004280 obj: 678.031060
INFO:diamond.solvers.diamond_logistic:loss: 676.894417 penalty: 1.331816 seconds elapsed 0
INFO:diamond.solvers.diamond_logistic:iteration: 42 relative coef change: 0.000330 obj: 678.226233
INFO:diamond.solvers.diamond_logistic:loss: 676.913700 penalty: 1.331936 seconds elapsed 0
INFO:diamond.solvers.diamond_logistic:iteration: 43 relative coef change: 0.000151 obj: 678.245636
INFO:diamond.solvers.diamond_logistic:loss: 678.563069 penalty: 1.332019 seconds elapsed 0
INFO:diamond.solvers.diamond_logistic:iteration: 44 relative coef change: 0.005238 obj: 679.895089
INFO:diamond.solvers.diamond_logistic:loss: 678.756947 penalty: 1.332047 seconds elapsed 0
INFO:diamond.solvers.diamond_logistic:iteration: 45 relative coef change: 0.000305 obj: 680.088993
INFO:diamond.solvers.diamond_logistic:loss: 678.74

# Combine random and fixed effects

In [7]:
# Work on a copy of the per-`leaves` results so the frame stored in
# model.results_dict is not modified by later assignments.
results = model.results_dict['leaves'].copy()

In [8]:
# Combine fixed and random effects into one coefficient per `leaves` level.
#
# `results` is rebuilt from a fresh copy of the random-effects frame here, so
# re-running this cell is idempotent: the original updated `results` in place
# and would have added the fixed effects a second time on every re-run.
fixed_effects = model.results_dict['fixed_effects']
results = model.results_dict['leaves'].copy()

for var in fixed_effects.variable.unique():
    # First value recorded for this fixed-effect variable.
    fixed_value = fixed_effects[fixed_effects.variable == var].value.values[0]
    if var in results.columns:
        # The variable also has a random effect: add the fixed effect to it.
        results[var] = results[var] + fixed_value
    else:
        # Fixed effect only: every level gets the same value.
        results[var] = fixed_value

In [9]:
# Display the combined frame (bare last expression uses the notebook's rich repr).
results

Unnamed: 0,leaves,intercept,precip
0,0,-27.603985,13.101506
1,1,-27.55917,13.4536
