In [88]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.exceptions import ConvergenceWarning
import warnings

warnings.filterwarnings("ignore", category=ConvergenceWarning)


In [7]:
df = pd.read_csv('./preprocessed.csv')
df.drop(['umich_user_id', 'Unnamed: 0'], axis=1, inplace=True)

In [8]:
X = df.drop('completed', axis=1)
y = df['completed']

In [138]:
def calculate_fp_rates(X, y, data):
    col_data = data.copy()
    model = LogisticRegression(verbose=0)

    kfold = KFold(n_splits=10, shuffle=True, random_state=42)

    for i,d in enumerate(col_data):
        col_data[i] = (X.columns.get_loc(d[0]), d[1])

    aucs = []
    fprs = []

    y = np.array(y)
    X = np.array(X)
    for train_index, test_index in kfold.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        conditions = np.array([(X_test[:, name] == val) for name, val in col_data]).all(axis=0)
        X_test_filtered = X_test[conditions]
        y_test_filtered = y_test[conditions]
        model.fit(X_train, y_train)
        y_pred_filtered = None
        auc = None
        try:
            y_pred_filtered = model.predict(X_test_filtered)
            auc = roc_auc_score(y_test_filtered, y_pred_filtered)

        except Exception:
            print(X_test_filtered.shape, y_test_filtered.shape)
            continue

        y_pred = model.predict(X_test)
        
        aucs.append(auc)

        # TP = np.sum((y_test[conditions] == 1) & (y_pred[conditions] == 1))
        # FN = np.sum((y_test[conditions] == 1) & (y_pred[conditions] == 0))
        TN = np.sum((y_test[conditions] == 0) & (y_pred[conditions] == 0))
        FP = np.sum((y_test[conditions] == 0) & (y_pred[conditions] == 1))      

        # Calculate FPR
        FPR = FP / (FP + TN)
        fprs.append(FPR)
        # print("False Positive Rate:", FPR)

    auc_avg = np.average(aucs)
    auc_std = np.std(aucs)
    fpr_avg = np.average(fprs)
    fpr_std = np.std(fprs)
    print(f"AUC: {auc_avg} +/- {auc_std}")
    print(f"FPR: {fpr_avg} +/- {fpr_std}")

In [110]:
calculate_fp_rates(X, y, [("reported_or_inferred_gender_male", 1), ("bachelor_obtained", 1)])

AUC: 0.7998015873015872 +/- 0.12736008952426234
FPR: 0.11428571428571428 +/- 0.16272551071277883


In [107]:
df.head()

Unnamed: 0,answer_count,average_answer_length,total_votes_given_answers,total_votes_received_answers,question_count,average_question_length,total_votes_given_questions,total_votes_received_questions,total_questions_following,country_cd_US,reported_or_inferred_gender_male,bachelor_obtained,completed
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,1,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,1,0
2,10.0,797.2,0.0,0.0,0.0,797.2,0.0,0.0,0.0,1,0,0,1
3,5.0,589.0,1.0,1.0,1.0,589.0,0.0,4.0,1.0,1,0,1,1
4,11.0,300.818182,3.0,3.0,0.0,300.818182,0.0,3.0,0.0,1,0,0,1


In [71]:
def compute_combo(cols):
    retval = []
    def recurse(i, curr):
        for j in range(i, len(cols)):
            opt1 = curr.copy()
            opt1.append((cols[j], 0))

            opt2 = curr.copy()
            opt2.append((cols[j], 1))
            retval.append(opt1)
            retval.append(opt2)
            if j < len(cols) - 1:
                recurse(j+1, opt1.copy())
            if j < len(cols) - 1:
                recurse(j+1, opt2.copy())


    recurse(0, [])

    return retval




In [111]:
combos = compute_combo(['country_cd_US', 'reported_or_inferred_gender_male', 'bachelor_obtained'])

In [112]:
for c in combos:
    if len(c) == 2:
        print(c)

[('country_cd_US', 0), ('reported_or_inferred_gender_male', 0)]
[('country_cd_US', 0), ('reported_or_inferred_gender_male', 1)]
[('country_cd_US', 0), ('bachelor_obtained', 0)]
[('country_cd_US', 0), ('bachelor_obtained', 1)]
[('country_cd_US', 1), ('reported_or_inferred_gender_male', 0)]
[('country_cd_US', 1), ('reported_or_inferred_gender_male', 1)]
[('country_cd_US', 1), ('bachelor_obtained', 0)]
[('country_cd_US', 1), ('bachelor_obtained', 1)]
[('reported_or_inferred_gender_male', 0), ('bachelor_obtained', 0)]
[('reported_or_inferred_gender_male', 0), ('bachelor_obtained', 1)]
[('reported_or_inferred_gender_male', 1), ('bachelor_obtained', 0)]
[('reported_or_inferred_gender_male', 1), ('bachelor_obtained', 1)]


In [113]:
combos

[[('country_cd_US', 0)],
 [('country_cd_US', 1)],
 [('country_cd_US', 0), ('reported_or_inferred_gender_male', 0)],
 [('country_cd_US', 0), ('reported_or_inferred_gender_male', 1)],
 [('country_cd_US', 0),
  ('reported_or_inferred_gender_male', 0),
  ('bachelor_obtained', 0)],
 [('country_cd_US', 0),
  ('reported_or_inferred_gender_male', 0),
  ('bachelor_obtained', 1)],
 [('country_cd_US', 0),
  ('reported_or_inferred_gender_male', 1),
  ('bachelor_obtained', 0)],
 [('country_cd_US', 0),
  ('reported_or_inferred_gender_male', 1),
  ('bachelor_obtained', 1)],
 [('country_cd_US', 0), ('bachelor_obtained', 0)],
 [('country_cd_US', 0), ('bachelor_obtained', 1)],
 [('country_cd_US', 1), ('reported_or_inferred_gender_male', 0)],
 [('country_cd_US', 1), ('reported_or_inferred_gender_male', 1)],
 [('country_cd_US', 1),
  ('reported_or_inferred_gender_male', 0),
  ('bachelor_obtained', 0)],
 [('country_cd_US', 1),
  ('reported_or_inferred_gender_male', 0),
  ('bachelor_obtained', 1)],
 [('coun

In [140]:
for c in combos: 
    calculate_fp_rates(X, y, c)
    
    

AUC: 0.8459686147186147 +/- 0.0866911464709904
FPR: 0.06734848484848485 +/- 0.08432226568563089
AUC: 0.7762953140879285 +/- 0.04149390938579361
FPR: 0.16284505038476774 +/- 0.06004671828468712
AUC: 0.8020238095238096 +/- 0.13749876313605727
FPR: 0.07928571428571429 +/- 0.10002295654865784
(1, 12) (1,)
(1, 12) (1,)
(0, 12) (0,)
AUC: 0.875 +/- 0.1767766952966369
FPR: 0.0 +/- 0.0
(4, 12) (4,)
(1, 12) (1,)
AUC: 0.8489583333333333 +/- 0.15857643070456592
FPR: 0.03125 +/- 0.08267972847076846
(2, 12) (2,)
(3, 12) (3,)
(0, 12) (0,)
(5, 12) (5,)
AUC: 0.701388888888889 +/- 0.21302585625247178
FPR: 0.09722222222222221 +/- 0.13958160584890125
(1, 12) (1,)
(0, 12) (0,)
(1, 12) (1,)
(0, 12) (0,)
(1, 12) (1,)
(1, 12) (1,)
(1, 12) (1,)
(0, 12) (0,)
AUC: 0.75 +/- 0.25
FPR: 0.0 +/- 0.0
(1, 12) (1,)
(1, 12) (1,)
(4, 12) (4,)
(0, 12) (0,)
(1, 12) (1,)
(1, 12) (1,)
(1, 12) (1,)
(0, 12) (0,)
AUC: 0.75 +/- 0.25
FPR: 0.0 +/- 0.0
(5, 12) (5,)
(1, 12) (1,)
AUC: 0.8385416666666666 +/- 0.16196158130693972
FPR: 0.