In [1]:
import sys

import matplotlib.pyplot as plt
import pandas as pd
import pandera as pa
import seaborn as sns

# Make the project's src/ directory importable so `datahandler` resolves.
# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
SRC_DIR = "../../src"
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)
from datahandler import DataHandler

# Apply seaborn's default theme globally for the figures drawn in this notebook.
sns.set_theme()

In [2]:
# Read the processed feature table from the project's data directory.
data_dir = DataHandler.DATA_DIR
processed_csv_path = data_dir / "processed_data.csv"
feat_df = pd.read_csv(processed_csv_path)

In [3]:
# Keep only rows whose "sex" is "female" and whose "number" is one of
# USED_INDEXES.
USED_INDEXES = [1, 2, 3]
is_female = feat_df["sex"] == "female"
is_used_index = feat_df["number"].isin(USED_INDEXES)
feat_df = feat_df[is_female & is_used_index]

In [4]:
# Columns used as model inputs.
feature_cols = [
    "AR", "STD_SPEED", "STDP", "VOL_VAR_GLOBAL", "F1F2R_100",
    "AVG_SILENCE_DURATION_SENTENCE",
]

# Binary outcome column.
target_col = "is_proficient"

# Drop every column that is neither a feature nor the target.
feat_df = feat_df[[*feature_cols, target_col]]

# Validation contract: every feature must be a float strictly greater than
# zero, and the target must be boolean. The schema is generated from
# `feature_cols` so the two can never drift apart, and the built-in
# `Check.gt` produces a readable failure message (a lambda would be reported
# only as "<lambda>").
schema = pa.DataFrameSchema({
    **{col: pa.Column(pa.Float, pa.Check.gt(0)) for col in feature_cols},
    target_col: pa.Column(pa.Bool),
})

# Raises a pandera schema error if any column violates the contract.
feat_df = schema.validate(feat_df)

In [5]:
import rpy2.robjects as ro
import rpy2.robjects.packages as rpackages
from rpy2.robjects import Formula, pandas2ri, r

In [6]:
# Install the R package `logistf` only when it is not already available.
# An unconditional install re-downloads and re-compiles the package from
# source on every "Run All", which needs network access and floods the cell
# output with build logs.
utils = rpackages.importr('utils')
if not rpackages.isinstalled('logistf'):
    utils.install_packages('logistf')

R[write to console]: Installing package into ‘/home/vscode/R/aarch64-unknown-linux-gnu-library/4.2’
(as ‘lib’ is unspecified)

R[write to console]: trying URL 'https://cloud.r-project.org/src/contrib/logistf_1.26.0.tar.gz'

R[write to console]: Content type 'application/x-gzip'
R[write to console]:  length 76220 bytes (74 KB)

R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[wr

gcc -I"/usr/share/R/include" -DNDEBUG      -fpic  -g -O2 -ffile-prefix-map=/build/r-base-85GD2O/r-base-4.2.2.20221110=. -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2  -c init.c -o init.o
gcc -I"/usr/share/R/include" -DNDEBUG      -fpic  -g -O2 -ffile-prefix-map=/build/r-base-85GD2O/r-base-4.2.2.20221110=. -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2  -c logistf.c -o logistf.o
gcc -shared -L/usr/lib/R/lib -Wl,-z,relro -o logistf.so init.o logistf.o -L/usr/lib/R/lib -lR


installing to /home/vscode/R/aarch64-unknown-linux-gnu-library/4.2/00LOCK-logistf/00new/logistf/libs
** R
** data
*** moving datasets to lazyload DB
** byte-compile and prepare package for lazy loading
** help
*** installing help indices
** building package indices
** testing if installed package can be loaded from temporary location
** checking absolute paths in shared objects and dynamic libraries
** testing if installed package can be loaded from final location
** testing if installed package keeps a record of temporary installation path
* DONE (logistf)
R[write to console]: 

R[write to console]: 
R[write to console]: The downloaded source packages are in
	‘/tmp/Rtmpnieu6u/downloaded_packages’
R[write to console]: 
R[write to console]: 



<rpy2.rinterface_lib.sexp.NULLType object at 0xfffef4df7e00> [RTYPES.NILSXP]

In [7]:
# StandardScaler's public home is sklearn.preprocessing; it is reachable from
# sklearn.discriminant_analysis only as an incidental internal import.
from sklearn.preprocessing import StandardScaler

# NOTE(review): global pandas<->R conversion is left enabled because cells
# beyond this one may rely on it; the DataFrame conversion below already uses
# an explicit converter context and does not need it.
pandas2ri.activate()

logistf = rpackages.importr('logistf')
# R model formula: the target as a factor, regressed on all feature columns.
formula = Formula(f"as.factor({target_col}) ~ {' + '.join(feature_cols)}")

# Standardise each feature column (zero mean, unit variance) in one step.
scaler = StandardScaler()
X = scaler.fit_transform(feat_df[feature_cols])
# Encode the boolean target as 0/1 integers.
y = feat_df[target_col].astype(int).to_numpy()

# Rebuild a DataFrame because the scaler returns a bare array without column
# names. `y` is a plain array, so it is assigned positionally and stays
# row-aligned with `X`.
scaled_df = pd.DataFrame(X, columns=feature_cols)
scaled_df[target_col] = y

# Convert the pandas frame to an R data.frame using a local converter.
with (ro.default_converter + pandas2ri.converter).context():
    r_df = pandas2ri.py2rpy(scaled_df)

# Fit the logistf model (its R signature defaults to firth = TRUE) and build
# its R summary.
logistf_model = logistf.logistf(formula=formula, data=r_df)
summary = r['summary'](logistf_model) # type: ignore  # noqa: PGH003

(function (formula, data, pl = TRUE, alpha = 0.05, control, plcontrol, 
    modcontrol, firth = TRUE, init, weights, na.action, offset, 
    plconf = NULL, flic = FALSE, model = TRUE, ...) 
{
    call <- match.call()
    if (missing(control)) 
        control <- logistf.control()
    if (pl == TRUE & missing(plcontrol)) 
        plcontrol <- logistpl.control()
    if (missing(modcontrol)) 
        modcontrol <- logistf.mod.control()
    mf <- match.call(expand.dots = FALSE)
    m <- match(c("formula", "data", "weights", "na.action", "offset"), 
        names(mf), 0L)
    mf <- mf[c(1, m)]
    mf$drop.unused.levels <- TRUE
    mf[[1L]] <- quote(stats::model.frame)
    mf <- eval(mf, parent.frame())
    mt <- attr(mf, "terms")
    y <- model.response(mf, type = "any")
    if (is.logical(y)) {
        y <- as.numeric(y)
    }
    else if (is.factor(y)) {
        if (length(levels(y)) == 2) {
            y <- as.numeric(y != levels(y)[1L])
        }
    }
    else if (!is.numeric(y)) {
   