In [124]:
import pandas as pd
import numpy as np
import os
import itertools

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from data import construct_hier

from folktables import ACSDataSource, ACSEmployment, ACSIncome, ACSPublicCoverage

In [125]:
# Load data and add label
cols = ["age", "sex", "bmi", "children", "smoker", "region", "charges"]
df = pd.read_csv('datasets/insurance/insurance.csv', header=0)
pd.to_numeric(df['charges'])
median = df['charges'].median()
df['label'] = df['charges'] >  median
X, y = df.drop(["charges", "label"], axis=1), df["label"]

In [126]:
# Get categorical and numerical features
cat_idx = X.select_dtypes(include=['object', 'bool']).columns
num_idx = X.select_dtypes(include=['int64', 'float64']).columns
steps = [('cat', OneHotEncoder(handle_unknown='ignore'), cat_idx), ('num', StandardScaler(), num_idx)]
col_transf = ColumnTransformer(steps)

In [127]:
# Group logic
ALL = [True] * y.shape[0]

young = np.array(X['age'] <= 35)
mid = np.array((X['age'] > 35) & (X['age'] <= 50))
old = np.array(X['age'] > 50)
age_group_names = ['Ya', 'Ma', 'Oa']
age_groups = [young, mid, old]

smoker = np.array(X['smoker'] == 'yes')
nonsmoker = np.array(X['smoker'] == 'no')
smoker_groups = [smoker, nonsmoker]
smoker_group_names = ['SMK', 'nSMK']

bmi1 = np.array(X['bmi'] < 18.5)
bmi2 = np.array((X['bmi'] >= 18.5) & (X['bmi'] < 25))
bmi3 = np.array((X['bmi']) >= 25 & (X['bmi'] < 30))
bmi4 = np.array(X['bmi'] >= 30)
bmi_groups = [bmi1, bmi2, bmi3, bmi4]
bmi_group_names = ['BMI1', 'BMI2', 'BMI3', 'BMI4']

sex_groups = [np.array(X['sex'] == 'male'), np.array(X['sex'] == 'female')]
sex_group_names = ['M', 'F']

groups, group_names, tree = construct_hier([[ALL], sex_groups, smoker_groups],
                                           [["ALL"], sex_group_names, smoker_group_names])

In [128]:
# label encoder to target variable so we have classes 0 and 1
assert(len(np.unique(y)) == 2)
X = col_transf.fit_transform(X)
y = LabelEncoder().fit_transform(y)

In [101]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = LogisticRegression()
rf = RandomForestClassifier()
xgb = XGBClassifier()
model.fit(X_train, y_train)
rf.fit(X_train, y_train)
xgb.fit(X_train, y_train)
print(accuracy_score(model.predict(X_test), y_test))
print(accuracy_score(rf.predict(X_test), y_test))
print(accuracy_score(xgb.predict(X_test), y_test))

0.8992537313432836
0.9477611940298507
0.9440298507462687


In [132]:
splits = train_test_split(*tuple([X, y] + groups), 
                          test_size=0.15, random_state=0)
X_train = splits[0]
X_test = splits[1]
y_train = splits[2]
y_test = splits[3]
groups_train = splits[4::2]
groups_test = splits[5::2]

In [134]:
for g, group_name in enumerate(group_names):
    n_g = np.sum(groups_train[g])
    print("\tOn group={} with n={}...".format(group_name, n_g))
    
    model.fit(X_train[groups_train[g]], y[groups_train[g]])
    print(accuracy_score(model.predict(X_test[groups_test[g]]), y_test))

	On group=ALL with n=1137...


TypeError: fit() missing 1 required positional argument: 'y'