In [2]:
import math

import fcalc
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# Binarized data

In [3]:
df = pd.read_csv("data/DrinkersBodySignals.csv").drop(["SMK_stat_type_cd"], axis=1)
df["DRK_YN"] = df["DRK_YN"].map(lambda val: val == "Y")
column_names = df.columns.values
df.head()

Unnamed: 0,sex,age,height,weight,waistline,sight_left,sight_right,hear_left,hear_right,SBP,...,HDL_chole,LDL_chole,triglyceride,hemoglobin,urine_protein,serum_creatinine,SGOT_AST,SGOT_ALT,gamma_GTP,DRK_YN
0,Male,35,170,75,90.0,1.0,1.0,1.0,1.0,120.0,...,48.0,126.0,92.0,17.1,1.0,1.0,21.0,35.0,40.0,True
1,Male,30,180,80,89.0,0.9,1.2,1.0,1.0,130.0,...,55.0,148.0,121.0,15.8,1.0,0.9,20.0,36.0,27.0,False
2,Male,40,165,75,91.0,1.2,1.5,1.0,1.0,120.0,...,41.0,74.0,104.0,15.8,1.0,0.9,47.0,32.0,68.0,False
3,Male,50,175,80,91.0,1.5,1.2,1.0,1.0,145.0,...,76.0,104.0,106.0,17.6,1.0,1.1,29.0,34.0,18.0,False
4,Male,50,165,60,80.0,1.0,1.2,1.0,1.0,138.0,...,61.0,117.0,104.0,13.8,1.0,0.8,19.0,12.0,25.0,False


In [4]:
df["bmi"] = (df["weight"].to_numpy() * 10000) / np.square(df["height"].to_numpy())
df["sex"] = df["sex"].map(lambda val: val == "Male")

for age_group in range(0, 10):
    df[f"age_{age_group}"] = df["age"].map(
        lambda val: math.floor(val / 10) == age_group
    )
    
for column in [
    "waistline",
    "sight_left",
    "sight_right",
    "hear_left",
    "hear_right",
    "SBP",
    "DBP",
    "BLDS",
    "tot_chole",
    "HDL_chole",
    "LDL_chole",
    "triglyceride",
    "hemoglobin",
    "urine_protein",
    "serum_creatinine",
    "SGOT_AST",
    "SGOT_ALT",
    "gamma_GTP",
    "bmi"
]:
    mean = df[column].mean()
    df[column] = df[column].map(lambda val: val > mean)
    
X = df.drop(["age", "height", "weight", "DRK_YN"], axis=1)
y = df["DRK_YN"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X

Unnamed: 0,sex,waistline,sight_left,sight_right,hear_left,hear_right,SBP,DBP,BLDS,tot_chole,...,age_0,age_1,age_2,age_3,age_4,age_5,age_6,age_7,age_8,age_9
0,True,True,True,False,False,False,False,True,False,False,...,False,False,False,True,False,False,False,False,False,False
1,True,True,False,True,False,False,True,True,True,True,...,False,False,False,True,False,False,False,False,False,False
2,True,True,True,True,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
3,True,True,True,True,False,False,True,True,False,True,...,False,False,False,False,False,True,False,False,False,False
4,True,False,True,True,False,False,True,True,True,True,...,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,True,True,False,False,False,False,False,False,False,True,...,False,False,False,False,False,True,False,False,False,False
1496,True,False,True,True,False,False,True,True,False,True,...,False,False,False,False,False,True,False,False,False,False
1497,False,False,False,False,False,False,False,False,False,True,...,False,False,False,True,False,False,False,False,False,False
1498,True,True,True,False,False,False,True,True,True,True,...,False,False,False,False,True,False,False,False,False,False


In [5]:
bin_cls = fcalc.classifier.BinarizedBinaryClassifier(
    X_train.values, y_train.to_numpy(), method="standard", alpha=0.1
)

In [6]:
bin_cls.predict(X_test.values)

In [7]:
print(accuracy_score(y_test, bin_cls.predictions))
print(f1_score(y_test, bin_cls.predictions, average="weighted"))

0.5644444444444444
0.4431481481481481


# Pattern structures

In [8]:
df = pd.read_csv("data/DrinkersBodySignals.csv").drop(["SMK_stat_type_cd"], axis=1)
df["DRK_YN"] = df["DRK_YN"].map(lambda val: val == "Y")
column_names = df.columns.values
df.head()

Unnamed: 0,sex,age,height,weight,waistline,sight_left,sight_right,hear_left,hear_right,SBP,...,HDL_chole,LDL_chole,triglyceride,hemoglobin,urine_protein,serum_creatinine,SGOT_AST,SGOT_ALT,gamma_GTP,DRK_YN
0,Male,35,170,75,90.0,1.0,1.0,1.0,1.0,120.0,...,48.0,126.0,92.0,17.1,1.0,1.0,21.0,35.0,40.0,True
1,Male,30,180,80,89.0,0.9,1.2,1.0,1.0,130.0,...,55.0,148.0,121.0,15.8,1.0,0.9,20.0,36.0,27.0,False
2,Male,40,165,75,91.0,1.2,1.5,1.0,1.0,120.0,...,41.0,74.0,104.0,15.8,1.0,0.9,47.0,32.0,68.0,False
3,Male,50,175,80,91.0,1.5,1.2,1.0,1.0,145.0,...,76.0,104.0,106.0,17.6,1.0,1.1,29.0,34.0,18.0,False
4,Male,50,165,60,80.0,1.0,1.2,1.0,1.0,138.0,...,61.0,117.0,104.0,13.8,1.0,0.8,19.0,12.0,25.0,False


In [9]:
df["bmi"] = (df["weight"].to_numpy() * 10000) / np.square(df["height"].to_numpy())
X = df.drop(["height", "weight", "DRK_YN"], axis=1)
y = df["DRK_YN"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
pat_cls = fcalc.classifier.PatternBinaryClassifier(
    X_train.values,
    y_train.to_numpy(),
    categorical=np.arange(X_train.shape[1]),
    method="standard",
    alpha=0.1,
)

In [11]:
pat_cls.predict(X_test.values)

In [12]:
print(accuracy_score(y_test, pat_cls.predictions))
print(f1_score(y_test, pat_cls.predictions))

0.7244444444444444
0.75
