In [1]:
import warnings
import os
os.environ["OMP_NUM_THREADS"] = "6"
# warnings.filterwarnings("ignore", message="KMeans is known to have a memory leak*")
# warnings.filterwarnings("ignore", message="Solution may be inaccurate*")

In [2]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

import stratifreg
from stratifreg.two_groups import Joint2Regressor
from stratifreg.k_groups import JointKRegressor
from stratifreg.gmm_groups import Joint2GMMRegressor
from stratifreg.utils import JointUtils

In [3]:
def polynomial_tramsform(X, degree=2, include_bias=False):
    """
    Transforme un X (n, p) en toutes les colonnes de polynômes croisés jusqu'à 'degree'
    Exemple : degree=2 → 1, x1, x2, ..., x1^2, x2^2, x1*x2, etc.
    """
    poly = PolynomialFeatures(degree=degree, include_bias=include_bias)
    X_poly       = poly.fit_transform(X)
    colname_poly = poly.get_feature_names_out(X.columns)
    return X_poly, colname_poly

def get_data(path_X,path_y,scale=False,degree=0):
    scaler = StandardScaler()
    X = pd.read_csv(path_X)
    y = pd.read_csv(path_y)
    columnsXpoly = X.columns
    if degree>1: 
        X_poly,columnsXpoly = polynomial_tramsform(X,degree,False)
        X = X_poly
    X = JointUtils._as_numpy(X)
    y = JointUtils._as_numpy(y)
    y_med  = np.median(y)
    X  = pd.DataFrame(scaler.fit_transform(X), columns=columnsXpoly)
    X1,X2,y1,y2  = JointUtils.split_by_median(X,y, group_mode='median')
    if scale:
        X = scaler.fit_transform(X)
        #X1 = scaler.fit_transform(X1)
        #X2 = scaler.fit_transform(X2)
    Xc          = JointUtils.add_intercept(X)
    X1c         = JointUtils.add_intercept(X1)
    X2c         = JointUtils.add_intercept(X2)
    return X,Xc,y,X1,X1c,y1,X2,X2c,y2,y_med,columnsXpoly

X,Xc,y,X1,X1c,y1,X2,X2c,y2,y_med,varnames = get_data('./datasets/Xf_all_pisa2009.csv',
                                                     './datasets/yf_all_pisa2009.csv')
print(Xc.shape,X1c.shape,X2c.shape,y.shape,y1.shape,y2.shape)

(5233, 21) (2617, 21) (2616, 21) (5233,) (2617,) (2616,)


In [4]:
reg = Joint2Regressor()
[beta1, beta2], var_beta, sigma2s = reg.fit_ols_groups(X1c, X2c, y1, y2, sigma_mode='two')
print(Joint2Regressor.display(reg, varnames,"beta"))

                        beta_G1   beta_G2
intercept              429.1046  567.0030
grade                   11.0405    4.5989
male                    -5.8751   -1.1844
preschool               -0.6471    0.9720
expectBachelors          8.4561    8.7844
motherHS                 1.5719    3.5218
motherWork              -0.2293   -1.0314
fatherHS                -1.1754    2.6983
fatherWork               2.4267    1.1416
selfBornUS               0.9349   -0.4593
motherBornUS            -3.4363   -0.8055
fatherBornUS             4.4797    0.7524
englishAtHome            0.2141    2.8923
computerForSchoolwork    2.6771    5.2545
read30MinsADay           6.9398    7.1604
minutesPerWeekEnglish    2.9924    0.8825
studentsInEnglish        1.4447   -1.4161
schoolHasLibrary         0.6375   -1.2080
publicSchool            -5.0229   -4.4337
urban                   -8.0699   -3.4086
schoolSize               4.9003    4.6376


In [5]:
kreg = JointKRegressor()
kreg.fit([(X1c,y1),(X2c,y2)], joint_X_list=None, loss='quadratic', 
         tau=0.0, l1=0., l2=0., weights_list=None)
print(JointKRegressor.display(kreg,varnames))

                       model_G1  model_G2
intercept              429.1046  567.0030
grade                   11.0405    4.5989
male                    -5.8751   -1.1844
preschool               -0.6471    0.9720
expectBachelors          8.4561    8.7844
motherHS                 1.5719    3.5218
motherWork              -0.2293   -1.0314
fatherHS                -1.1754    2.6983
fatherWork               2.4267    1.1416
selfBornUS               0.9349   -0.4593
motherBornUS            -3.4363   -0.8055
fatherBornUS             4.4797    0.7524
englishAtHome            0.2141    2.8923
computerForSchoolwork    2.6771    5.2545
read30MinsADay           6.9398    7.1604
minutesPerWeekEnglish    2.9924    0.8825
studentsInEnglish        1.4447   -1.4161
schoolHasLibrary         0.6375   -1.2080
publicSchool            -5.0229   -4.4337
urban                   -8.0699   -3.4086
schoolSize               4.9003    4.6376


In [6]:
gmmreg = Joint2GMMRegressor()
gmmreg.fit(X1c, X2c, y1, y2, x0=None, m1=1, m2=1, max_iter=10)
print(Joint2Regressor.display(reg, varnames,"beta"))

                        beta_G1   beta_G2
intercept              429.1046  567.0030
grade                   11.0405    4.5989
male                    -5.8751   -1.1844
preschool               -0.6471    0.9720
expectBachelors          8.4561    8.7844
motherHS                 1.5719    3.5218
motherWork              -0.2293   -1.0314
fatherHS                -1.1754    2.6983
fatherWork               2.4267    1.1416
selfBornUS               0.9349   -0.4593
motherBornUS            -3.4363   -0.8055
fatherBornUS             4.4797    0.7524
englishAtHome            0.2141    2.8923
computerForSchoolwork    2.6771    5.2545
read30MinsADay           6.9398    7.1604
minutesPerWeekEnglish    2.9924    0.8825
studentsInEnglish        1.4447   -1.4161
schoolHasLibrary         0.6375   -1.2080
publicSchool            -5.0229   -4.4337
urban                   -8.0699   -3.4086
schoolSize               4.9003    4.6376


In [7]:
kreg = JointKRegressor()
kreg.fit([(X1c,y1),(X2c,y2)], joint_X_list=None, loss='quantile', 
                   tau=0.5, l1=0., l2=0., weights_list=None)
print(JointKRegressor.display(kreg, varnames,"beta"))

                        beta_G1   beta_G2
intercept              437.0845  559.4569
grade                   11.7695    5.4426
male                    -5.7821    0.6490
preschool               -1.1196    1.1562
expectBachelors          9.3885    7.0089
motherHS                 1.5008    3.5598
motherWork               0.5145   -1.6296
fatherHS                 0.2979    2.7998
fatherWork               1.8221    1.4218
selfBornUS               0.2281   -1.0654
motherBornUS            -3.7267    1.9931
fatherBornUS             4.3446    0.8473
englishAtHome            1.6464    2.1613
computerForSchoolwork    3.1679    4.1352
read30MinsADay           7.2324    8.9806
minutesPerWeekEnglish    3.9388    0.3497
studentsInEnglish        1.9966    0.1433
schoolHasLibrary         0.3213    0.1706
publicSchool            -4.4313   -4.5537
urban                   -8.9701   -4.0807
schoolSize               4.1590    3.9762




In [8]:
x0_c  = JointUtils.find_x0(Xc, y, y_med)
x0_LL = JointUtils.find_x0_LL(Xc, y, y_med, L=1)
regctr_x0c  = Joint2Regressor()
resu_x0c    = regctr_x0c.fit_ols_jointure_a_b(X1c, X2c, y1, y2, x0_c, 
                                  y0=None, sigma_mode='one', cas='a')
regctr_x0LL = Joint2Regressor()
resu_x0LL   = regctr_x0LL.fit_ols_jointure_a_b(X1c, X2c, y1, y2, x0_LL, 
                                  y0=None, sigma_mode='one', cas='a')
print(Joint2Regressor.display(regctr_x0c, varnames,"beta_x0_c"))
print(Joint2Regressor.display(regctr_x0LL, varnames,"beta_x0_LL"))

                       beta_x0_c_G1  beta_x0_c_G2
intercept                  438.8995      559.5514
grade                       11.4696        7.5060
male                       -16.6517        8.0844
preschool                    3.1846       -3.5623
expectBachelors             13.9412        5.0502
motherHS                     4.5632        1.8536
motherWork                 -15.3244       15.1007
fatherHS                    -0.6980        0.0647
fatherWork                   6.3868       -1.6806
selfBornUS                   1.3851       -1.5738
motherBornUS                -2.2995       -1.6334
fatherBornUS                 6.2653       -0.4415
englishAtHome               -0.1276        3.3157
computerForSchoolwork        5.1355        2.8832
read30MinsADay              -1.2981       14.7178
minutesPerWeekEnglish        5.3840       -1.4252
studentsInEnglish           -3.4148        3.3783
schoolHasLibrary             1.5168       -2.5880
publicSchool                -3.1460       -7.4924


In [9]:
gmmreg = Joint2GMMRegressor()
gmmreg.fit(X1c, X2c, y1, y2, x0=x0_c, m1=1, m2=1,max_iter=10)
print(Joint2GMMRegressor.display(gmmreg,varnames,"beta"))

                       beta_G1_C1  beta_G2_C1
intercept                438.8975    559.5485
grade                     11.4694      7.5066
male                     -16.6516      8.0843
preschool                  3.1847     -3.5623
expectBachelors           13.9409      5.0513
motherHS                   4.5632      1.8537
motherWork               -15.3244     15.1007
fatherHS                  -0.6980      0.0649
fatherWork                 6.3867     -1.6805
selfBornUS                 1.3851     -1.5738
motherBornUS              -2.2993     -1.6334
fatherBornUS               6.2652     -0.4415
englishAtHome             -0.1277      3.3158
computerForSchoolwork      5.1354      2.8836
read30MinsADay            -1.2986     14.7181
minutesPerWeekEnglish      5.3840     -1.4251
studentsInEnglish         -3.4148      3.3784
schoolHasLibrary           1.5168     -2.5880
publicSchool              -3.1455     -7.4926
urban                    -11.2206     -1.9876
schoolSize                 2.8391 

In [10]:
gmmreg = Joint2GMMRegressor()
gmmreg.fit(X1c, X2c, y1, y2, x0=x0_c, m1=2, m2=2,max_iter=10)
print(Joint2GMMRegressor.display(gmmreg,varnames,"beta"))

                       beta_G1_C1  beta_G1_C2  beta_G2_C1  beta_G2_C2
intercept                448.5377    431.0647    561.2821    556.7912
grade                      7.2766     14.2322      9.6897      1.3713
male                     -15.4754    -17.0786      5.5329     12.5421
preschool                  5.7379      1.5769     -0.9835     -8.7211
expectBachelors            8.4855     17.3604      6.7643      1.3507
motherHS                   2.8906      5.6842      3.6514     -2.3095
motherWork               -22.6686    -11.2553      8.3963     29.9581
fatherHS                   1.2346     -2.4681      0.7897     -1.7047
fatherWork                 2.2955      8.6699      1.0813     -7.9313
selfBornUS                -8.6964      8.9909     -4.1359      5.3361
motherBornUS               1.4351     -4.6866      0.1749     -5.8109
fatherBornUS               3.4190      7.4635     -1.6587      3.0433
englishAtHome              6.7717     -4.5939      4.0472      0.3341
computerForSchoolwor

In [11]:
kreg = JointKRegressor()
group_list = [(X1c,y1),(X2c,y2)]
kreg.fit(group_list, joint_X_list=None, loss='quadratic', 
         tau=0.5, l1=0.0, l2=0.0, weights_list=None)
print(JointKRegressor.display(kreg,varnames,"beta"))

                        beta_G1   beta_G2
intercept              429.1046  567.0030
grade                   11.0405    4.5989
male                    -5.8751   -1.1844
preschool               -0.6471    0.9720
expectBachelors          8.4561    8.7844
motherHS                 1.5719    3.5218
motherWork              -0.2293   -1.0314
fatherHS                -1.1754    2.6983
fatherWork               2.4267    1.1416
selfBornUS               0.9349   -0.4593
motherBornUS            -3.4363   -0.8055
fatherBornUS             4.4797    0.7524
englishAtHome            0.2141    2.8923
computerForSchoolwork    2.6771    5.2545
read30MinsADay           6.9398    7.1604
minutesPerWeekEnglish    2.9924    0.8825
studentsInEnglish        1.4447   -1.4161
schoolHasLibrary         0.6375   -1.2080
publicSchool            -5.0229   -4.4337
urban                   -8.0699   -3.4086
schoolSize               4.9003    4.6376


In [12]:
kreg = JointKRegressor()
kreg.fit([(X1c,y1),(X2c,y2)], joint_X_list=[x0_c], loss='quadratic', 
         tau=0.5, l1=0.0, l2=0.0, weights_list=None) 
print(JointKRegressor.display(kreg,varnames))

                       model_G1  model_G2
intercept              438.8995  559.5514
grade                   11.4696    7.5060
male                   -16.6517    8.0844
preschool                3.1846   -3.5623
expectBachelors         13.9412    5.0502
motherHS                 4.5632    1.8536
motherWork             -15.3244   15.1007
fatherHS                -0.6980    0.0647
fatherWork               6.3868   -1.6806
selfBornUS               1.3851   -1.5738
motherBornUS            -2.2995   -1.6334
fatherBornUS             6.2653   -0.4415
englishAtHome           -0.1276    3.3157
computerForSchoolwork    5.1355    2.8832
read30MinsADay          -1.2981   14.7178
minutesPerWeekEnglish    5.3840   -1.4252
studentsInEnglish       -3.4148    3.3783
schoolHasLibrary         1.5168   -2.5880
publicSchool            -3.1460   -7.4924
urban                  -11.2209   -1.9874
schoolSize               2.8394    8.7185


In [13]:
reg = Joint2Regressor()
betas, var_beta, sigma2s = reg.fit_ols_jointure_smoothed(X1c, X2c, y1, y2, x0_c, lc=10.0)
print(Joint2Regressor.display(reg, varnames,"beta_S_x0_"))

                       beta_S_x0__G1  beta_S_x0__G2
intercept                   429.7152       566.5385
grade                        11.0673         4.7801
male                         -6.5469        -0.6065
preschool                    -0.4082         0.6893
expectBachelors               8.7981         8.5516
motherHS                      1.7584         3.4178
motherWork                   -1.1704        -0.0257
fatherHS                     -1.1457         2.5341
fatherWork                    2.6736         0.9656
selfBornUS                    0.9629        -0.5288
motherBornUS                 -3.3654        -0.8571
fatherBornUS                  4.5911         0.6780
englishAtHome                 0.1928         2.9187
computerForSchoolwork         2.8304         5.1067
read30MinsADay                6.4262         7.6316
minutesPerWeekEnglish         3.1415         0.7387
studentsInEnglish             1.1417        -1.1172
schoolHasLibrary              0.6923        -1.2941
publicSchool

In [14]:
kreg = JointKRegressor()
#X1c1,X1c2,y11,y12  = JointUtils.split_by_median(X1c,y1, group_mode='median')
X2c1,X2c2,y21,y22  = JointUtils.split_by_median(X2c,y2, group_mode='median')
jl = [x0_c,JointUtils.find_x0(X2c, y2, np.median(y2))]             
betas = kreg.fit([(X1c,y1),(X2c1,y21),(X2c2,y22)], jl, loss='quantile', tau=0.5, l1=0.9, l2=0.37)
print(JointKRegressor.display(kreg,varnames,"beta"))

                        beta_G1   beta_G2   beta_G3
intercept              421.9505  499.7697  467.9224
grade                   11.7152    6.2277   19.7314
male                   -11.4010    1.0222    3.6255
preschool                1.9114    0.7215   -4.6998
expectBachelors          9.9677   11.5515  122.0057
motherHS                 3.8498    0.4254   28.8220
motherWork              -7.0024   20.4760   -5.2160
fatherHS                -1.0503    0.2344   28.3146
fatherWork               3.7750    1.5969   -0.6216
selfBornUS               0.9638   -1.2560   -1.0333
motherBornUS            -2.3970   -0.0229    1.2182
fatherBornUS             3.8383   -0.0803   -3.0587
englishAtHome           -0.1652    1.6295   11.8564
computerForSchoolwork    2.4054    4.9380   48.2035
read30MinsADay          -1.3108   15.5170    5.2641
minutesPerWeekEnglish    4.6316   -4.2186   13.3267
studentsInEnglish       -1.1330    1.3142    6.2105
schoolHasLibrary         1.3290    0.0935   -2.9074
publicSchool