In [204]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import scipy as sp
import scipy.stats as st
import sys
import warnings
import inspect
import patsy

FLOAT_EPS = np.finfo(float).eps
pd.options.display.float_format = '{:,.12f}'.format
## Should use __FLOAT_EPS ? See if this variable is visible to user
## when using final package.

class LinkClass(object):
    def __init__(self):
        return NotImplementedError
    def link(self, mu):
        return NotImplementedError
    def link_inv(self, eta):
        return NotImplementedError
    def link_inv_deriv(self, eta):
        return NotImplementedError
        
    
class Logit(LinkClass):
    def __init__(self):
        self.linkclass = sm.genmod.families.links.logit
    def link(self, p):
        return np.log(p/(1.0-p))
        #return sp.special.logit(p)
    def link_inv(self, eta):
        thresh = 30.0
        eta = np.minimum(np.maximum(eta,-thresh), thresh)
        exp_eta = np.exp(-eta)
        return 1.0/(1.0+exp_eta)
        #return sp.special.expit(eta)
    def link_inv_deriv(self, eta):
        #thresh = 30.0
        #eta[abs(eta) > thresh] = FLOAT_EPS
        return np.exp(eta)/(1+np.exp(eta))**2
    def __repr__(self):
        display_string = f"\n    linkstr: logit"
        display_string += '\n    link: log(p/(1-p))'
        display_string += '\n    linkinv: exp(eta)/(1+exp(eta))'
        return display_string

class Probit(LinkClass):
    def __init__(self):
        self.linkclass = sm.genmod.families.links.probit
    def link(self, mu):
        return st.norm.ppf(mu)
    def link_inv(self, eta):
        thresh = -st.norm.ppf(FLOAT_EPS)
        eta = np.minimum(np.maximum(eta,-thresh),thresh)
        return st.norm.cdf(eta)
    def link_inv_deriv(self, eta):
        return np.maximum(st.norm.pdf(eta),FLOAT_EPS)
    def __repr__(self):
        display_string = f"\n    linkstr: probit"
        display_string += '\n    link: norm.ppf(mu)'
        display_string += '\n    linkinv: norm.cdf(eta)'
        return display_string
    
class CLogLog(LinkClass):
    def __init__(self):
        self.linkclass = sm.genmod.families.links.cloglog
    def link(self, mu):
        return np.log(-np.log(1 - mu))
    def link_inv(self, eta):
        return np.maximum(np.minimum(-np.expm1(-np.exp(eta)),1-FLOAT_EPS),FLOAT_EPS)
    def link_inv_deriv(self, eta):
        eta = np.minimum(eta,700)
        return np.maximum(np.exp(eta)*np.exp(-np.exp(eta)),FLOAT_EPS)
    def __repr__(self):
        display_string = f"\n    linkstr: cloglog"
        display_string += '\n    link: log(-log(1 - mu))'
        display_string += '\n    linkinv: 1-exp(-exp(eta))'
        return display_string
    
class Cauchit(LinkClass):
    def __init__(self):
        self.linkclass = sm.genmod.families.links.cauchy
    def link(self, mu):
        return st.cauchy.ppf(mu)
    def link_inv(self, eta):
        thresh = -st.cauchy.ppf(FLOAT_EPS)
        eta = np.minimum(np.maximum(eta,-thresh),thresh)
        return st.cauchy.cdf(eta)
    def link_inv_deriv(self, eta):
        return nnp.maximum(st.cauchy.pdf(eta),FLOAT_EPS)
    def __repr__(self):
        display_string = f"\n    linkstr: cauchit"
        display_string += '\n    link: cauchy.ppf(mu)'
        display_string += '\n    linkinv: cauchy.cdf(eta)'
        return display_string
    
class Log(LinkClass):
    def __init__(self):
        self.linkclass = sm.genmod.families.links.log
    def link(self, mu):
        return np.log(mu)
    def link_inv(self, eta):
        return np.maximum(np.exp(eta), FLOAT_EPS)
    def link_inv_deriv(self, eta):
        return np.maximum(np.exp(eta), FLOAT_EPS)
    def __repr__(self):
        display_string = f"\n    linkstr: log"
        display_string += '\n    link: log(mu)'
        display_string += '\n    linkinv: exp(eta)'
        return display_string
    


class ZeroInflated(object):
    __doc__ = """
    Zero Inflated model for count data
    %(params)s
    %(extra_params)s
    Attributes
    -----------
    formula_str : string
        A reference to the endogenous response variable.
    data : pandas dataframe
        A reference to the exogenous design.
    dist: string
        A reference to the zero-inflated exogenous design.
    link: string
        A reference to 
    """
    def __init__(self, formula_str, data, dist = 'poisson', offsetx = None, offsetz = None,
                 link = 'logit', weights = None, missing='none', **kwargs):
        #endog, exog_count, exog_zero, dist = 'poisson', link = 'logit', weights = None, offsetx = None,\
        #     offsetz = None, method = 'L-BFGS-B', start = None, EM = True, \
        #                                tol = None, options = None, factr = 1.0
        self.set_data(formula_str, df, missing)
        self.terms = {'Y':self.endog.columns.values[0],'X':self.X.columns.values,\
                      'Z':self.Z.columns.values}
        self.formula = formula_str
        self.dist = self.dist_processing(dist)
        self.link = self.link_processing(link)
        self.n = len(self.endog)
        self.set_wt_offset(weights, offsetx, offsetz)
        self.linkobj = self.LinkClass_processing(self.link)
        self.set_loglik(self.dist)
        self.call = f"ZeroInflated(formula_str='{formula_str}', data={self.retrieve_name(data)}, dist='{dist}', offsetx={offsetx}, offsetz={offsetz},"
        self.call = self.call + f" link='{link}', weights={weights}, missing='{missing}')"
        
        # Convenience variables
        self.kx = self.X.shape[1]
        self.kz = self.Z.shape[1]
        self.Y = np.squeeze(self.endog)
        self.Y0 = self.Y <= 0
        self.Y1 = self.Y > 0       
        
        
    def print_obj(self):
        print(self)   
        



    def retrieve_name(self, var):
        """
        Gets the name of var. Does it from the out most frame inner-wards.
        :param var: variable to get name from.
        :return: string
        """
        for fi in reversed(inspect.stack()):
            names = [var_name for var_name, var_val in fi.frame.f_locals.items() if var_val is var]
            if len(names) > 0:
                return names[0]
        
    def set_data(self, formula_str, df, missing):
        self.endog, self.X, self.Z = self.formula_processing(formula_str, df, missing=missing) 
     
           
    def set_wt_offset(self, weights, offsetx, offsetz):
        ## weights and offset
        
        if weights is None:
            weights = 1.0
        weights = np.ndarray.flatten(np.array(weights))
        if weights.size == 1:
            weights = np.repeat(weights,self.n)
        weights = pd.Series(data = weights, index = self.X.index)

        if offsetx is None:
            offsetx = 0.0
        offsetx = np.ndarray.flatten(np.array(offsetx))
        if offsetx.size == 1:
            offsetx = np.repeat(offsetx,self.n)

        if offsetz is None:
            offsetz = 0.0
        offsetz = np.ndarray.flatten(np.array(offsetz))
        if offsetz.size == 1:
            offsetz = np.repeat(offsetz,self.n)
        
        self.weights = weights
        self.offsetx = offsetx
        self.offsetz = offsetz
        
    def set_loglik(self, dist):
        if dist is 'poisson':
            self.loglikfun = self.ziPoisson
            self.gradfun = self.gradPoisson
        elif dist is 'negbin':
            self.loglikfun = self.ziNegBin
            self.gradfun = self.gradNegBin
        else:
            self.loglikfun = self.ziGeom
            self.gradfun = self.gradGeom
        
    def ziPoisson(self, parms, sign = 1.0):
        """
        Log-likelihood of Zero Inflated Poisson.
        """
        
        ## count mean
        mu = np.exp(np.dot(self.X,parms[np.arange(self.kx)]) + self.offsetx)
        ## binary mean
        phi = self.linkobj.link_inv(np.dot(self.Z, parms[np.arange(self.kx,self.kx+self.kz)]) +\
                                    self.offsetz)
        ## log-likelihood for y = 0 and y >= 1
        loglik0 = np.log(phi + np.exp(np.log(1-phi) - mu)) 
        loglik1 = np.log(1-phi) + sp.stats.poisson.logpmf(self.Y, mu)
        ## collect and return
        loglik = np.dot(self.weights[self.Y0],loglik0[self.Y0])+np.dot(self.weights[self.Y1],loglik1[self.Y1])
        return sign*loglik

    def gradPoisson(self, parms, sign = 1.0):
        """
        Gradient of Zero Inflated Poisson Log-likelihood.
        """
        
        ## count mean
        eta = np.dot(self.X,parms[np.arange(self.kx)]) + self.offsetx
        mu = np.exp(eta)
        ## binary mean
        etaz = np.dot(self.Z, parms[np.arange(self.kx,self.kx+self.kz)]) + self.offsetz
        muz = self.linkobj.link_inv(etaz)
        ## densities at 0
        clogdens0 = -mu
        dens0 = muz*(1-self.Y1.astype(float)) + np.exp(np.log(1 - muz) + clogdens0)
        ## working residuals  
        wres_count = np.where(self.Y1,self.Y-mu,-np.exp(-np.log(dens0) + 
                                          np.log(1 - muz) + clogdens0 + np.log(mu))) 
        link_etaz = self.linkobj.link_inv_deriv(etaz)
        wres_zero  = np.where(self.Y1,-1/(1-muz) * link_etaz, \
                          (link_etaz - np.exp(clogdens0) * link_etaz)/dens0)   
    
        return sign*(np.hstack((np.expand_dims(wres_count*self.weights,axis=1)*self.X, \
                np.expand_dims(wres_zero*self.weights,axis=1)*self.Z))).sum(axis=0)
    
    def ziNegBin(self, parms, sign = 1.0):
        """
        Log-Likelihood of Zero Inflated Negative Binomial.
        """
        ## count mean
        mu = np.exp(np.dot(self.X,parms[np.arange(self.kx)]) + self.offsetx)
        ## binary mean
        phi = self.linkobj.link_inv(np.dot(self.Z, parms[np.arange(self.kx,self.kx+self.kz)]) + self.offsetz)
        ## negbin size
        theta = np.exp(parms[self.kx+self.kz])
    
        ## log-likelihood for y = 0 and y >= 1 sp.stats.poisson.logpmf(Y, mu)
        loglik0 = np.log(phi + np.exp(np.log(1-phi) + \
                                   st.nbinom.logpmf(0,*self.convert_params(theta = theta, mu = mu)) ) )
        loglik1 = np.log(1-phi) + st.nbinom.logpmf(self.Y,*self.convert_params(theta = theta, mu = mu))

        ## collect and return
        loglik = np.dot(self.weights[self.Y0],loglik0[self.Y0])+np.dot(self.weights[self.Y1],loglik1[self.Y1])
        return sign*loglik
  
    def ziGeom(self, parms, sign = 1.0):
        return self.ziNegBin(np.hstack((parms, 0)), sign)
    
    def gradGeom(self, parms, sign = 1.0):
        """
        Gradient of Zero Inflated Geometric Log-likelihood.
        
        """
        ## count mean
        eta = np.dot(self.X,parms[np.arange(self.kx)]) + self.offsetx
        mu = np.exp(eta)
        ## binary mean
        etaz = np.dot(self.Z, parms[np.arange(self.kx,self.kx+self.kz)]) + self.offsetz
        muz = self.linkobj.link_inv(etaz) 

        ## densities at 0
        clogdens0 = st.nbinom.logpmf(0,*self.convert_params(theta = 1, mu = mu))
        dens0 = muz*(1-self.Y1.astype(float)) + np.exp(np.log(1 - muz) + clogdens0)

        ## working residuals  
        wres_count = np.where(self.Y1,self.Y - mu*(self.Y + 1)/(mu + 1), \
                              -np.exp(-np.log(dens0) + np.log(1 - muz) + clogdens0 +\
                                      -np.log(mu+1) + np.log(mu))) 
        link_etaz = self.linkobj.link_inv_deriv(etaz)
        wres_zero  = np.where(self.Y1,-1/(1-muz) * link_etaz, \
                          (link_etaz - np.exp(clogdens0) * link_etaz)/dens0)
      
        return sign*(np.hstack((np.expand_dims(wres_count*self.weights,axis=1)*self.X, \
                np.expand_dims(wres_zero*self.weights,axis=1)*self.Z))).sum(axis=0)
    
    def gradNegBin(self, parms, sign = 1.0): 
        """
        Gradient of Zero Inflated Negative Binomial Log-likelihood. 
        (Negetive Binomial2 to be specific.)
        
        """
        ## count mean
        eta = np.dot(self.X,parms[np.arange(self.kx)]) + self.offsetx
        mu = np.exp(eta)
        ## binary mean
        etaz = np.dot(self.Z, parms[np.arange(self.kx,self.kx+self.kz)]) + self.offsetz
        muz = self.linkobj.link_inv(etaz)    
        ## negbin size
        theta = np.exp(parms[self.kx+self.kz])

        ## densities at 0
        clogdens0 = st.nbinom.logpmf(0,*self.convert_params(theta = theta, mu = mu))
        dens0 = muz*(1-self.Y1.astype(float)) + np.exp(np.log(1 - muz) + clogdens0)
        
        ## working residuals  
        wres_count = np.where(self.Y1,self.Y - mu*(self.Y + theta)/(mu + theta), \
                              -np.exp(-np.log(dens0) + np.log(1 - muz) + clogdens0 + np.log(theta) +\
                                      -np.log(mu+theta) + np.log(mu))) 
        link_etaz = self.linkobj.link_inv_deriv(etaz)
        wres_zero  = np.where(self.Y1,-1/(1-muz) * link_etaz, \
                          (link_etaz - np.exp(clogdens0) * link_etaz)/dens0)
        
        wres_theta = theta*np.where(self.Y1, sp.special.digamma(self.Y + theta) - sp.special.digamma(theta) +\
                                   np.log(theta) - np.log(mu + theta) + 1 - (self.Y + theta)/(mu + theta),\
                                   np.exp(-np.log(dens0) + np.log(1 - muz) + clogdens0)*\
                                   (np.log(theta) - np.log(mu + theta) + 1 - theta/(mu+theta) ) )
        
        return sign*(np.hstack((np.expand_dims(wres_count*self.weights,axis=1)*self.X, \
                np.expand_dims(wres_zero*self.weights,axis=1)*self.Z, \
                               np.expand_dims(wres_theta,axis=1)))).sum(axis=0)
    
    def EM_estimate(self):
        ## EM estimation of starting values
        
        model_count = sm.GLM(endog = self.Y, exog = self.X, family = sm.families.Poisson(),\
                                  offset = self.offsetx , freq_weights = self.weights).fit()
        model_zero = sm.GLM(self.Y0.astype(int), exog = self.Z, family=sm.families.Binomial(link = self.linkobj.linkclass), \
                   offset = self.offsetz , freq_weights = self.weights).fit()
        self.start = {'zero':model_zero.params, 'count':model_count.params}
        
        if self.dist is 'negbin':
            self.start['theta'] = 1.0 
            
        if (self.EM is True) and (self.dist is 'poisson'):
            mui = model_count.predict()
            probi = model_zero.predict()
            probi = probi/(probi + (1-probi)*sp.stats.poisson.pmf(0, mui))
            probi[self.Y1] = 0
            probi
            ll_new = self.loglikfun(np.hstack((self.start['count'].values,self.start['zero'].values)))
            ll_old = 2 * ll_new
    
            while np.absolute((ll_old - ll_new)/ll_old) > self.reltol :
                ll_old = ll_new
                model_count = sm.GLM(endog = self.Y, exog = self.X, family = sm.families.Poisson(),\
                                  offset = self.offsetx , freq_weights = self.weights*(1-probi) \
                                              ).fit(start_params = self.start['count'].values)        
                model_zero = sm.GLM(probi, exog = self.Z, family=sm.families.Binomial(link = self.linkobj.linkclass),\
                        offset = self.offsetz, freq_weights = self.weights \
                               ).fit(start_params = self.start['zero'].values)
                self.start = {'zero':model_zero.params, 'count':model_count.params}

                mui = model_count.predict()
                probi = model_zero.predict()
                probi = probi/(probi + (1-probi)*sp.stats.poisson.pmf(0, mui))
                probi[self.Y1] = 0

                ll_new = self.loglikfun(np.hstack((self.start['count'].values,self.start['zero'].values)))           
            
        if (self.EM is True) and (self.dist is 'geom'):
            mui = model_count.predict()
            probi = model_zero.predict()
            probi = probi/(probi + (1-probi)*st.nbinom.pmf(0,*self.convert_params(theta = 1, mu = mui)))
            probi[self.Y1] = 0
            
            ll_new = self.loglikfun(np.hstack((self.start['count'].values,self.start['zero'].values)))
            ll_old = 2 * ll_new  
                           
            while np.absolute((ll_old - ll_new)/ll_old) > self.reltol :
                ll_old = ll_new
                model_count = sm.GLM(endog = self.Y, exog = self.X, family = sm.families.NegativeBinomial(alpha = 1.0),\
                                  offset = self.offsetx , freq_weights = self.weights*(1-probi)).fit(\
                                        #start_params = start['count'].values
                                    sm.families.NegativeBinomial(alpha = 1.0\
                                                                ).starting_mu(y=self.start['count'].values))
                model_zero = sm.GLM(probi, exog = self.Z, family=sm.families.Binomial(link = self.linkobj.linkclass),\
                        offset = self.offsetz, freq_weights = self.weights).fit(start_params = self.start['zero'].values)
                self.start = {'zero':model_zero.params, 'count':model_count.params}

                mui = model_count.predict()
                probi = model_zero.predict()
                probi = probi/(probi + (1-probi)*st.nbinom.pmf(0,*self.convert_params(theta = 1, mu = mui)))
                probi[self.Y1] = 0                

                ll_new = self.loglikfun(np.hstack((self.start['count'].values,self.start['zero'].values)))
                
        if (self.EM is True) and (self.dist is 'negbin'):
            warnings.warn('EM estimation of starting values not optimal for Negetive Binomial.')
            mui = model_count.predict() # or model_count.mu
            probi = model_zero.predict()
            probi = probi/(probi + (1-probi)*st.nbinom.pmf(0,*self.convert_params(theta = self.start['theta'], mu = mui)))
            probi[self.Y1] = 0
            
            ll_new = self.loglikfun(np.hstack((self.start['count'].values,self.start['zero'].values,np.log(self.start['theta']))))
            ll_old = 2 * ll_new 
            
            while np.absolute((ll_old - ll_new)/ll_old) > self.reltol :
                ll_old = ll_new
                model_count = sm.GLM(endog = self.Y, exog = self.X, family = \
                                     sm.families.NegativeBinomial(alpha = 1/self.start['theta']),method = 'newton',\
                                  offset = self.offsetx , freq_weights = self.weights*(1-probi) \
                                      ).fit(start_params = self.start['count'])
                model_zero = sm.GLM(probi, exog = self.Z, family=sm.families.Binomial(link = self.linkobj.linkclass),\
                        offset = self.offsetz, freq_weights = self.weights, \
                        start_params = self.start['zero']).fit()
                
                mui = model_count.predict()   
                theta = sm.GLM(endog = self.Y, exog = self.X, family = \
                                     sm.families.NegativeBinomial(alpha = 1/model_count.scale),method = 'newton',\
                                 offset = self.offsetx , freq_weights = self.weights*(1-probi) \
                                     ).estimate_scale(mui)
                
                probi = model_zero.predict()
                probi = probi/(probi + (1-probi)*st.nbinom.pmf(0,*self.convert_params(theta = theta, mu = mui)))
                
                probi[self.Y1] = 0
                self.start = {'zero':model_zero.params, 'count':model_count.params, 'theta':theta}
                
                ll_new = self.loglikfun(np.hstack((self.start['count'].values,self.start['zero'].values,np.log(self.start['theta']))))

     
    def fit(self, method = 'BFGS', EM = True, start = None, reltol = None,\
            options = {'disp': False, 'maxiter': 10000, 'gtol': 1e-8}, factr = 1.0):
        self.set_tolerance(factr, reltol)
        self.optim_options = options
        self.optim_options['gtol'] = self.reltol
        self.method = method
        self.EM = EM
        self.set_start(start)
        
        ## ML Estimation
        if (self.dist is 'negbin'):
            x0 = np.hstack((self.start['count'].values,self.start['zero'].values,\
                                         np.log(self.start['theta'])))
        else:
            x0 = np.hstack((self.start['count'].values,self.start['zero'].values))

        fitResult = sp.optimize.minimize(self.loglikfun, args=(-1.0,), x0 = x0, \
                                        method=self.method, jac=self.gradfun, options=self.optim_options)
        
        ## coefficients and covariances
        coefc = pd.Series(data = fitResult.x[0:self.kx], index = self.X.columns.values)
        coefz = pd.Series(data = fitResult.x[self.kx:self.kx+self.kz], index = self.Z.columns.values)

        if self.method == 'L-BFGS-B':
            vc_data = fitResult.hess_inv.todense()
        elif self.method == 'BFGS':
            vc_data = fitResult.hess_inv
        else:
            warnings.warn('Not tested for methods other than BFGS and L-BFGS-B.')
            
        vc = pd.DataFrame(data = vc_data[np.arange(self.kx+self.kz)[:,None],np.arange(self.kx+self.kz)], \
                      index = np.append(self.X.columns.values, self.Z.columns.values),\
                 columns = np.append(self.X.columns.values, self.Z.columns.values))
        if self.dist == 'negbin':
            ntheta = self.kx + self.kz
            theta = np.exp(fitResult.x[ntheta])
            SE_logtheta = np.sqrt(np.diagonal(vc_data)[ntheta])
        else:
            theta = None
            SE_logtheta = None
    
        ## fitted and residuals
        mu = np.exp(np.dot(self.X,coefc)+self.offsetx)
        phi = self.linkobj.link_inv(np.dot(self.Z,coefz)+self.offsetz)
        Yhat = (1-phi) * mu
        res = np.sqrt(self.weights) * (self.Y - Yhat)

        ## effective observations
        nobs = np.sum(self.weights > 0)
        
        Result = ZeroInflatedResults(self.call, self.formula, self.terms, self.kx, self.kz, \
                                     self.dist, self.link, self.linkobj, self.optim_options, self.method, self.start,\
                                     self.reltol, self.weights, self.offsetx, self.offsetz,\
                                     fitResult, coefc, coefz, theta, SE_logtheta, nobs, res, Yhat, vc, self.endog)

        return Result

        
        
    def set_start(self, start):
        if start is not None:
            valid = True
            if ('count' in start) is False:
                valid = False
                warnings.warn("invalid starting values, count model coefficients not specified")
                start['count'] = pd.Series(np.repeat(0,kx), index = X.columns.values)
            if ('zero' in start) is False:
                valid = False
                warnings.warn("invalid starting values, zero model coefficients not specified")
                start['zero'] = pd.Series(np.repeat(0,kz), index = Z.columns.values)
            if len(start['count']) != kx:
                valid = False
                warnings.warn("invalid starting values, wrong number of count model coefficients")
            if len(start['zero']) != kz:
                valid = False
                warnings.warn("invalid starting values, wrong number of zero model coefficients")
            if dist is 'negbin':
                if ('theta' in start) is False:
                    start['theta'] = 1.0
                start = {'zero':start['zero'], 'count':start['count'], 'theta' : (start['theta'][0]).astype(float)}
            else:
                start = {'zero':start['zero'], 'count':start['count']}    
        
            if valid is False:
                start = None

        if start is None:
            self.EM_estimate()
        else:
            self.start = start
        
     
    def set_tolerance(self, factr, reltol):
        if factr < 1.0:
            warnings.warn('Minimum value of factr is 1.0.')
            factr = 1.0
        if reltol is None:
            self.reltol = factr*(np.finfo(float).eps)**(1/1.6)
            
    @staticmethod    
    def formula_processing(formula_str, df, missing):
        # ToDo: Add 'missing' operations on df
        X_formula,Z_formula = formula_str.split("|")
        Z_formula = X_formula.split("~")[0]+" ~ "+ Z_formula
        y, X = patsy.dmatrices(X_formula, df, return_type='dataframe')
        Z = patsy.dmatrices(Z_formula, df, return_type='dataframe')[1]
        
        Y = np.squeeze(y)
        ## sanity checks
        if len(Y) < 1:
            sys.exit("empty model")
        if np.all(Y > 0):
            sys.exit("invalid dependent variable, minimum count is not zero")  
        if np.array_equal(np.asarray(Y), (np.round(Y + 0.001)).astype(int)) is False:
            sys.exit("invalid dependent variable, non-integer values")
        Y = (np.round(y + 0.001)).astype(int)
        if np.any(Y < 0):
            sys.exit("invalid dependent variable, negative counts")
            
        return y,X,Z
    
    @staticmethod
    def convert_params(mu, theta):
        """
        Convert mean/dispersion parameterization of a negative binomial to the ones scipy supports

        """
        n = theta
        p = theta/(theta+mu)
        return n, p
            
    
    @staticmethod
    def link_processing(link):
        ## binary link processing
        linkstr = link
        linkList = ['logit','probit','cauchit','cloglog','log']
        if linkstr not in linkList:
            warnings.warn(linkstr +" link not valid. Available links are: " + str(linkList))
            linkstr = 'logit'
        return(linkstr)
    
    
    @staticmethod
    def LinkClass_processing(linkstr):
        Link = {
            'logit': Logit(),
            'probit': Probit(),
            'cloglog': CLogLog(),
            'cauchit': Cauchit(),
            'log': Log(),
        }
        return Link.get(linkstr, Logit())
    
    @staticmethod
    def dist_processing(dist):
        if dist not in ['poisson','negbin','geom']:
            sys.exit(dist+" method not yet implemented")
        return dist
    

class ZeroInflatedResults(object):
    def __init__(self, call, formula, terms, kx, kz, dist, link, linkobj, options, method, start, reltol, \
                 weights, offsetx, offsetz, fitResult, coefc, coefz, theta, SE_logtheta,\
                nobs, res, Yhat, vc, y):
        
        # Need to change final results objects names to standard names used in python
        self.call = call
        self.formula = formula
        self.terms = terms
        self.kx = kx
        self.kz = kz
        self.n = self.nobs = nobs
        
        # fit paramters
        self.dist = dist
        self.linkstr = link
        self.link = linkobj.link
        self.linkinv = linkobj.link_inv
        self.optim_options = options
        self.method = method
        self.start = start
        self.reltol = reltol
                
        self.weights = weights 
        self.offsetx = offsetx
        self.offsetz = offsetz
        self.linkobj = linkobj
        
        # Optimization Results
        self.fit = fitResult
        self.loglik = fitResult.fun    # log-likelihood
        self.converged = fitResult.success
        self.iters = fitResult.nit;          # number of iterations for convergence
        self.coefficients = {'count':coefc ,'zero': coefz}
        self.theta = theta if (dist is 'negbin') else None
        self.SE_logtheta = SE_logtheta
        self.df_null = nobs - 2
        self.df_resid = nobs - (kx + kz + (dist == "negbin"))
        self.df_model = (kx + kz + (dist == "negbin"))
        self.residuals = res 
        self.fitted_values = Yhat
        self.vcov = vc
        self.y = y
        
        self.deviance = 0
        self.pearson_chi2 = 0
        self.cov_type = None
        self.use_t = False
        
    def __repr__(self):
        display_string = "Call:\n    "+self.call
        display_string += "\n\nformula:\n    "+self.formula
        #display_string += f"\nterms:\n    Y: {self.terms['Y']}\n    X: {self.terms['X']}\n    Z: {self.terms['Z']}"
        display_string += "\ndist: "+self.dist
        display_string += "\nlink: "+self.linkstr
        display_string += "\nlinkobj:"+ self.linkobj.__repr__()
        display_string += f"\nMessage: {self.fit.message}"
        display_string += f"\nResult: \n    Count:\n"
        display_string += f"{self.coefficients['count']}\n    Zero:\n{self.coefficients['zero']}"
        display_string += f"\n    theta: {self.theta:0.12f}" if self.dist is 'negbin' else " "
        display_string += f"\ndf_null: {self.df_null} \ndf_resid: {self.df_resid}"
        return display_string
        

In [201]:
### Lines below will not go inside the function
import numpy as np
import pandas as pd

df = pd.read_csv('DebTrivedi.csv',index_col = [0])
sel = np.array([1, 6, 7, 8, 13, 15, 18])-1
df = df.iloc[:,sel]
# produce design matrices from R-style formula
formula_str = 'ofp ~ hosp + health + numchron + gender + school + privins | health'

In [205]:
ZeroInflated(formula_str,df,dist='negbin').fit()



Call:
    ZeroInflated(formula_str='ofp ~ hosp + health + numchron + gender + school + privins | health', data=df, dist='negbin', offsetx=None, offsetz=None, link='logit', weights=None, missing='none')

formula:
    ofp ~ hosp + health + numchron + gender + school + privins | health
dist: negbin
link: logit
linkobj:
    linkstr: logit
    link: log(p/(1-p))
    linkinv: exp(eta)/(1+exp(eta))
Message: Desired error not necessarily achieved due to precision loss.
Result: 
    Count:
Intercept              0.941806418794
health[T.excellent]   -0.329441603901
health[T.poor]         0.328911164200
gender[T.male]        -0.124682075093
privins[T.yes]         0.218385934925
hosp                   0.222516202234
numchron               0.173664131382
school                 0.026673431572
dtype: float64
    Zero:
Intercept             -5.043170486579
health[T.excellent]    1.082341013667
health[T.poor]         1.619517731282
dtype: float64
    theta: 1.258759009264
df_null: 4404 
df_resid: 4394

In [203]:
ZeroInflated(formula_str,df,dist='negbin').fit().loglik



{'zero': Intercept             -2.478685285562
health[T.excellent]    0.592850481638
health[T.poor]        -0.411558591463
dtype: float64, 'count': Intercept              1.115004869647
health[T.excellent]   -0.297814884198
health[T.poor]         0.293726485015
gender[T.male]        -0.101386048384
privins[T.yes]         0.171004181664
hosp                   0.209377715697
numchron               0.149352001671
school                 0.023673497914
dtype: float64, 'theta': 0.87675523165440161}
{'zero': Intercept             -3.021563843131
health[T.excellent]    0.693003416743
health[T.poor]        -0.362490972893
dtype: float64, 'count': Intercept              1.033458412860
health[T.excellent]   -0.306035289731
health[T.poor]         0.300244263742
gender[T.male]        -0.112671552872
privins[T.yes]         0.194105360388
hosp                   0.215482041570
numchron               0.161449090985
school                 0.025052843469
dtype: float64, 'theta': 0.89698668264971781}
{'ze

{'zero': Intercept             -4.358032449060
health[T.excellent]    0.840792137407
health[T.poor]         1.037095245732
dtype: float64, 'count': Intercept              0.954865656984
health[T.excellent]   -0.326395365621
health[T.poor]         0.326518867991
gender[T.male]        -0.123177946880
privins[T.yes]         0.214792803952
hosp                   0.221706811056
numchron               0.171975682702
school                 0.026509464617
dtype: float64, 'theta': 1.3044490538914386}
{'zero': Intercept             -4.356723646632
health[T.excellent]    0.834796178414
health[T.poor]         1.037189778045
dtype: float64, 'count': Intercept              0.954879723321
health[T.excellent]   -0.326542541476
health[T.poor]         0.326555033270
gender[T.male]        -0.123172317826
privins[T.yes]         0.214784119383
hosp                   0.221708011500
numchron               0.171974148692
school                 0.026510247708
dtype: float64, 'theta': 1.3042946310489101}
{'zero

dtype: float64, 'theta': 1.3041686363328413}


12166.871805275301

In [None]:
start = { 'count': Intercept)      Xhealthpoor Xhealthexcellent      Xgendermale      Xprivinsyes            Xhosp 
      0.94205767       0.32883744      -0.32944920      -0.12465344       0.21831872       0.22249956 
       Xnumchron          Xschool      (Intercept)       healthpoor  healthexcellent 
      0.17363302       0.02667065      -5.02320729       1.60086945       1.06984137 

In [144]:
import scipy.stats as st


In [145]:
st.nbinom.pmf(k = 0,n= 10,p=0.4)
# same as dnbinom(0,size = 10,prob = 0.4) in R
# also: dnbinom(0,mu = 15,size = 10)

0.00010485760000000014

In [147]:
mean, var = st.nbinom.stats(n = 10, p = 0.4, moments='mv')
print('mu: ',mean,' var: ',var)

mu:  15.0  var:  37.5


In [149]:
n=10
p=0.4

In [152]:
n/(n+mean) # should equal p 

0.40000000000000002

In [153]:
1/(1+mean/n)

0.40000000000000002

In [None]:
p = 

In [168]:
(ZeroInflated(formula_str,df,dist='negbin').fit().start)



{'count': Intercept              0.954974381655
 health[T.excellent]   -0.327874794327
 health[T.poor]         0.326516269194
 gender[T.male]        -0.123146816220
 privins[T.yes]         0.214741701073
 hosp                   0.221704399479
 numchron               0.171967040735
 school                 0.026517080356
 dtype: float64,
 'theta': 1.3041686363328413,
 'zero': Intercept             -4.346081634494
 health[T.excellent]    0.779900134789
 health[T.poor]         1.028712964862
 dtype: float64}

In [165]:
np.diagonal(ZeroInflated(formula_str,df,dist='negbin').fit().fit.hess_inv)



array([  3.28990560e-03,   4.57821128e-03,   2.41702889e-03,
         9.60840811e-04,   1.54566043e-03,   4.72724969e-04,
         1.57140337e-04,   1.86544099e-05,   3.05118624e+00,
         5.08805188e+00,   2.83945492e+00,   1.95195105e-03])

Hessian inverse from R

(Intercept)      Xhealthpoor Xhealthexcellent      Xgendermale      Xprivinsyes            Xhosp 
    3.393448e-03     2.437295e-03     4.670310e-03     9.679333e-04     1.617482e-03     4.802313e-04 
Xnumchron          Xschool      (Intercept)       healthpoor  healthexcellent                  
    1.583909e-04     1.877204e-05     2.877106e+00     2.731460e+00     4.946743e+00     1.936406e-03 

In [169]:
(ZeroInflated(formula_str,df,dist='negbin').fit().start)



{'count': Intercept              0.954974381655
 health[T.excellent]   -0.327874794327
 health[T.poor]         0.326516269194
 gender[T.male]        -0.123146816220
 privins[T.yes]         0.214741701073
 hosp                   0.221704399479
 numchron               0.171967040735
 school                 0.026517080356
 dtype: float64,
 'theta': 1.3041686363328413,
 'zero': Intercept             -4.346081634494
 health[T.excellent]    0.779900134789
 health[T.poor]         1.028712964862
 dtype: float64}

In [34]:
X_formula = 'ofp ~ hosp + health + numchron + gender + school + privins'
y, X = patsy.dmatrices(X_formula, df, return_type='dataframe')
Z_formula = 'ofp ~ health'
Z = patsy.dmatrices(Z_formula, df, return_type='dataframe')[1]
model_count = sm.GLM(endog = y, exog = X, family = sm.families.Poisson()).fit()

In [35]:
model_count.params

Intercept              1.028874195080
health[T.excellent]   -0.361993201756
health[T.poor]         0.248306971386
gender[T.male]        -0.112319919691
privins[T.yes]         0.201686878072
hosp                   0.164797389210
numchron               0.146639282442
school                 0.026142990020
dtype: float64

In [10]:
model_count.summary2()

0,1,2,3
Model:,GLM,AIC:,35959.2256
Link Function:,log,BIC:,-13734.5914
Dependent Variable:,ofp,Log-Likelihood:,-17972.0
Date:,2018-04-05 22:40,LL-Null:,-19859.0
No. Observations:,4406,Deviance:,23168.0
Df Model:,7,Pearson chi2:,29500.0
Df Residuals:,4398,Scale:,1.0
Method:,IRLS,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,1.0289,0.0238,43.2575,0.0000,0.9823,1.0755
health[T.excellent],-0.3620,0.0303,-11.9452,0.0000,-0.4214,-0.3026
health[T.poor],0.2483,0.0178,13.9149,0.0000,0.2133,0.2833
gender[T.male],-0.1123,0.0129,-8.6765,0.0000,-0.1377,-0.0869
privins[T.yes],0.2017,0.0169,11.9624,0.0000,0.1686,0.2347
hosp,0.1648,0.0060,27.4782,0.0000,0.1530,0.1766
numchron,0.1466,0.0046,32.0194,0.0000,0.1377,0.1556
school,0.0261,0.0018,14.1824,0.0000,0.0225,0.0298
