# Example using skGLMM with user's choice of R package.
## Here we show how to use mgcv's `bam` (generalized additive models for very large datasets) by loading mgcv through the `pacman_call` argument.

In [1]:
import numpy as np
import pandas as pd
from pyGLMM import skGLMM, r_formula

# Raise pandas' display limits (rows, columns, line width) for the outputs below.
for _option, _value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option, _value)

In [2]:
# Synthetic binary-classification data set with 1 million rows and 20 features.
from sklearn import datasets

X, y = datasets.make_classification(
    n_samples=1000000,
    n_features=20,
    n_informative=2,
    n_redundant=2,
    random_state=1850,  # fixed seed so the generated data is reproducible
)

# Name the feature columns c_0 ... c_19 and append the response as 'target'.
df = pd.DataFrame(X).add_prefix('c_').assign(target=y)
df.head(10)

Unnamed: 0,c_0,c_1,c_2,c_3,c_4,c_5,c_6,c_7,c_8,c_9,c_10,c_11,c_12,c_13,c_14,c_15,c_16,c_17,c_18,c_19,target
0,0.700125,0.106445,1.257848,-0.134727,-0.680739,-1.217811,-0.175122,3.00297,0.577351,-1.415132,0.880555,0.55406,0.277887,-0.267018,-3.111372,0.073659,0.246622,-2.959343,-0.816956,-2.134262,0
1,-0.471384,0.634204,1.036435,2.972805,-0.080241,0.981559,0.41301,-2.089999,-0.045795,-0.619267,0.456128,0.300446,1.396739,0.187688,1.526748,0.0571,-0.469754,1.073344,0.355364,-0.923485,1
2,1.494601,0.954379,0.001275,-1.317758,-1.270966,1.03919,-0.033103,2.166841,-0.759277,0.567719,-1.538831,0.693941,0.403915,0.177014,0.843286,1.261493,0.119349,0.102519,0.575865,0.910115,1
3,-2.028497,-0.320959,-2.015467,0.046022,-0.953663,0.634599,0.825898,-0.720998,0.31285,-1.448982,-0.360267,1.319939,-0.550612,0.084236,-0.335317,-1.595558,-0.224878,-1.074452,0.299488,-0.215651,1
4,-1.488295,-0.282133,-0.187441,-1.39846,0.343962,0.508631,0.456345,0.775781,-0.52408,-0.249795,0.301485,1.39884,-0.482793,0.133454,2.081031,2.067892,0.329529,2.28112,-0.472769,-0.055703,0
5,-1.675803,1.240753,0.244947,-0.299224,2.038644,-0.946322,-0.212069,0.589038,-0.774752,0.717753,0.988591,0.555439,0.217859,-0.13835,0.046179,-0.893931,0.449653,0.995318,-0.809802,-0.42025,0
6,0.630083,-0.651147,1.532352,0.703495,0.068582,-0.806652,0.326439,0.145004,-0.086705,0.196017,0.625547,0.2984,-0.504922,-0.120561,-0.054385,0.123724,1.631356,0.72305,1.281897,0.052186,0
7,0.809733,-0.326968,0.415401,-1.348572,-2.03609,1.274328,0.562726,-2.036836,0.371739,-0.290972,0.662643,-0.451154,-2.684935,0.210622,0.80445,-0.277072,-0.469033,-0.181384,0.421074,-1.545821,1
8,1.757239,0.192573,0.693025,-0.499118,0.424789,-0.526749,2.964763,0.192852,-1.728953,0.204041,1.294847,-0.371531,-2.729304,-0.136768,-2.103867,0.380674,-2.083444,-2.293785,1.431935,-0.087668,1
9,-0.437248,-0.755044,-0.680501,-1.669969,0.019313,-1.330387,0.173975,0.341089,1.860428,-1.323776,-0.532552,-0.610956,0.296444,-0.209215,-0.459498,0.114466,-0.789112,0.697981,0.866204,-1.171598,0


In [3]:
# Text of the R call; the word 'formula' is a placeholder for the model formula.
call_template = "bam(formula, data=df, family=binomial())"

# Formula string generated from the frame with 'target' as the dependent variable.
form = r_formula(df, dependent_var='target')

# Substitute the generated formula into the template and display the final call.
call = call_template.replace('formula', form)
call

'bam(target ~ c_0 + c_1 + c_2 + c_3 + c_4 + c_5 + c_6 + c_7 + c_8 + c_9 + c_10 + c_11 + c_12 + c_13 + c_14 + c_15 + c_16 + c_17 + c_18 + c_19, data=df, family=binomial())'

In [4]:
# Every column except the response is used as a predictor.
Xcols = [col for col in df.columns if col != 'target']

# r_call carries the bam() call text built above; pacman_call is the R expression
# that loads mgcv (presumably evaluated by skGLMM before r_call -- confirm in pyGLMM).
ml = skGLMM(
    r_call=call,
    pacman_call='pacman::p_load(mgcv)',
)
ml.fit(df[Xcols], df['target'])

Starting Fit.

Family:
 
binomial
 


Link function:
 
logit
 



Formula:

target ~ c_0 + c_1 + c_2 + c_3 + c_4 + c_5 + c_6 + c_7 + c_8 + 

    c_9 + c_10 + c_11 + c_12 + c_13 + c_14 + c_15 + c_16 + c_17 + 

    c_18 + c_19


Parametric coefficients:

           
   Estimate
 Std. Error
  z value
 Pr(>|z|)
    

(Intercept)
  0.1374815
  0.0042710
   32.190
   <2e-16
 ***

c_0        
  0.0022528
  0.0041948
    0.537
   0.5912
    

c_1        
  0.0007237
  0.0041987
    0.172
   0.8631
    

c_2        
 -0.0018894
  0.0042014
   -0.450
   0.6529
    

c_3        
 -0.0067451
  0.0042003
   -1.606
   0.1083
    

c_4        
 -0.0007197
  0.0041893
   -0.172
   0.8636
    

c_5        
  5.8501952
  0.0127799
  457.767
   <2e-16
 ***

c_6        
 -0.0001114
  0.0041937
   -0.027
   0.9788
    

c_7        
 -0.0038835
  0.0041965
   -0.925
   0.3548
    

c_8        
  0.0013960
  0.0041948
    0.333
   0.7393
    

c_9        
 -0.0024008
  0.0041939
   -0.572
   0.5670
    

c_1

In [5]:
# Show the fitted model's summary (family, link, formula and coefficient table).
ml.summary()


Family: binomial 
Link function: logit 

Formula:
target ~ c_0 + c_1 + c_2 + c_3 + c_4 + c_5 + c_6 + c_7 + c_8 + 
    c_9 + c_10 + c_11 + c_12 + c_13 + c_14 + c_15 + c_16 + c_17 + 
    c_18 + c_19

Parametric coefficients:
              Estimate Std. Error  z value Pr(>|z|)    
(Intercept)  0.1374815  0.0042710   32.190   <2e-16 ***
c_0          0.0022528  0.0041948    0.537   0.5912    
c_1          0.0007237  0.0041987    0.172   0.8631    
c_2         -0.0018894  0.0042014   -0.450   0.6529    
c_3         -0.0067451  0.0042003   -1.606   0.1083    
c_4         -0.0007197  0.0041893   -0.172   0.8636    
c_5          5.8501952  0.0127799  457.767   <2e-16 ***
c_6         -0.0001114  0.0041937   -0.027   0.9788    
c_7         -0.0038835  0.0041965   -0.925   0.3548    
c_8          0.0013960  0.0041948    0.333   0.7393    
c_9         -0.0024008  0.0041939   -0.572   0.5670    
c_10        -0.0003603  0.0041982   -0.086   0.9316    
c_11        -0.0022768  0.0041954   -0.543   0.5

In [6]:
# Score the fitted model on the same data it was trained on (in-sample).
# NOTE(review): the metric is not shown here; the value below is negative, so it
# is not plain accuracy -- confirm what skGLMM.score returns.
ml.score(df[Xcols], df['target'])

-0.22327507803160063

In [7]:
# Predict for every row of the training frame.
# NOTE(review): parallel=True presumably enables parallel prediction -- confirm in pyGLMM.
phat = ml.predict(df[Xcols], parallel=True)
# One prediction per input row, returned as a single column: (1000000, 1) here.
phat.shape

  |                                                              |   0%, ETA NA




(1000000, 1)