https://www.kaggle.com/coolcoder22/simple-logisticregression

In [None]:
from IPython.display import HTML
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy import sparse
import re
import string
import lightgbm as lgb

In [None]:
# Load the competition training data into a single DataFrame.
TRAIN_CSV_PATH = '../input/train.csv'
train_df = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Build a validation set from two equally sized random samples (48660 rows
# each, fixed seed): rows with at least one identity annotator and rows
# with none.
has_identity_annotations = train_df['identity_annotator_count'] > 0
lacks_identity_annotations = train_df['identity_annotator_count'] == 0

annot_idx = train_df[has_identity_annotations].sample(n=48660, random_state=13).index
not_annot_idx = train_df[lacks_identity_annotations].sample(n=48660, random_state=13).index
x_val_idx = list(set(annot_idx).union(set(not_annot_idx)))

# Everything that was not sampled for validation becomes training data.
train_idx = list(set(train_df.index) - set(x_val_idx))

X_val = train_df.loc[x_val_idx]
X_train = train_df.loc[train_idx]

In [None]:
# Report (rows, columns) for the training split, then the validation split.
for split_frame in (X_train, X_val):
    print(split_frame.shape)

In [None]:
# Characters that get split off as separate tokens: ASCII punctuation plus a
# handful of typographic symbols.  ``re.escape`` is required because
# ``string.punctuation`` contains ``\``, ``]``, ``^`` and ``-``, all of which
# are special inside a character class.  Unescaped, ``\]`` is read as a
# literal ``]`` so the backslash itself is never matched, and the bare ``[``
# triggers a "possible nested set" FutureWarning.
text = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Return ``s`` with every punctuation character padded by spaces.

    Each character matched by ``text`` is replaced by itself surrounded by a
    single space on both sides, so a later whitespace split yields the
    punctuation marks as separate tokens.

    Args:
        s: The input string.

    Returns:
        A new string; ``s`` unchanged if it contains no punctuation.
    """
    return text.sub(r' \1 ', s)

In [None]:
# Number of rows in the full training frame (not referenced again in the
# visible cells; kept in case a later cell relies on it).
length = train_df.shape[0]

# Word-level TF-IDF over unigrams and bigrams, capped at 50000 features.
# The on/off switches are passed as real booleans: recent scikit-learn
# releases validate parameter types and do not accept the int ``1`` here.
word_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.9,
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    max_features=50000,
)

In [None]:
# Learn the vocabulary and IDF weights from the training comments only, so
# nothing from the validation split leaks into the features.
train_comments = X_train['comment_text']
word_vectorizer.fit(train_comments)

In [None]:
# Vectorize both splits with the already-fitted vectorizer (train first,
# then validation).
train_tfidf, val_tfidf = (
    word_vectorizer.transform(split_frame['comment_text'])
    for split_frame in (X_train, X_val)
)

In [None]:
# Report (documents, features) for the training and validation matrices.
for tfidf_matrix in (train_tfidf, val_tfidf):
    print(tfidf_matrix.shape)

In [None]:
import pickle

# Persist the fitted vectorizer so the same vocabulary can be reloaded later.
with open('word_vectorizer.pickle', 'wb') as vectorizer_file:
    pickle.dump(
        word_vectorizer,
        vectorizer_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [None]:
class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    """Logistic regression on naive-Bayes log-count-ratio scaled features.

    ``fit`` computes, for every feature, the log of the ratio between its
    smoothed average value in class 1 and in class 0, multiplies the
    features by that ratio and trains a ``LogisticRegression`` on the
    scaled matrix.  ``predict`` / ``predict_proba`` apply the same scaling
    before delegating to the fitted model.

    Args:
        C: Inverse regularization strength passed to ``LogisticRegression``.
        dual: Whether to solve the dual formulation.
        n_jobs: Forwarded to ``LogisticRegression``.
    """

    def __init__(self, C=1.0, dual=False, n_jobs=1):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs

    def _scale(self, x):
        """Return ``x`` with each feature multiplied by its log-count ratio."""
        # csr_matrix() accepts dense arrays and any sparse format, so
        # ``.multiply`` is always available regardless of what was passed.
        return sparse.csr_matrix(x).multiply(self._r)

    def predict(self, x):
        """Return predicted class labels for the rows of ``x``."""
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(self._scale(x))

    def predict_proba(self, x):
        """Return class probabilities for the rows of ``x``."""
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(self._scale(x))

    def fit(self, x, y):
        """Fit the log-count ratios and the logistic regression.

        Args:
            x: Feature matrix (sparse or dense), one row per sample.
            y: Binary labels; the ratio is computed between label 1 and
                label 0.

        Returns:
            self
        """
        x, y = check_X_y(x, y, accept_sparse=True)
        # Row-mask indexing below needs a format that supports it.
        x = sparse.csr_matrix(x)

        def pr(y_i):
            # Smoothed per-feature sum over the rows of class ``y_i``,
            # divided by the smoothed number of such rows.
            p = x[y == y_i].sum(0)
            return (p + 1) / ((y == y_i).sum() + 1)

        self._r = sparse.csr_matrix(np.log(pr(1) / pr(0)))
        # Pin liblinear: it is the solver that supports ``dual=True``, and it
        # was the implicit default when this code was written.  Newer
        # scikit-learn defaults to lbfgs, which rejects ``dual=True``.
        self._clf = LogisticRegression(
            C=self.C,
            dual=self.dual,
            solver='liblinear',
            n_jobs=self.n_jobs,
        ).fit(self._scale(x), y)
        # Expose the label set the way scikit-learn classifiers do.
        self.classes_ = self._clf.classes_
        return self

In [None]:
# Binarize the continuous toxicity score: 1 when target >= 0.5, else 0.
y_train = (X_train['target'].values >= 0.5).astype(int)
y_val = (X_val['target'].values >= 0.5).astype(int)

In [None]:
# Train the NB-scaled logistic regression on the training TF-IDF matrix
# (``fit`` returns the estimator itself).
NbSvm = NbSvmClassifier(C=1.5, dual=True, n_jobs=-1).fit(train_tfidf, y_train)

In [None]:
# Plain logistic-regression baseline on the raw TF-IDF features
# (``fit`` returns the estimator itself).
lr = LogisticRegression(solver='lbfgs', random_state=13).fit(train_tfidf, y_train)

In [None]:
# Persist the fitted logistic-regression baseline.
with open('lr_model.pickle', 'wb') as lr_file:
    pickle.dump(
        lr,
        lr_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [None]:
lgb_train = lgb.Dataset(train_tfidf, y_train)
lgb_eval = lgb.Dataset(val_tfidf, y_val, reference=lgb_train)

# specify your configurations as a dict
# NOTE: the number of boosting rounds is deliberately NOT set here.  The
# original dict carried 'num_iterations': 500 next to num_boost_round=200 in
# the train() call; LightGBM lets the params entry win, so 500 was the value
# actually used.  It is now passed once, via num_boost_round below.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'auc'},
    'nthread': -1,
    'feature_fraction': 0.4,
    'num_leaves': 50,
    'verbose': 1,
}

print('Start training...')
# train
# Early stopping goes through the callback API: the ``early_stopping_rounds``
# keyword of lgb.train() was removed in LightGBM 4, while the callback is
# available in older releases as well.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=500,
    valid_sets=[lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=20)],
)

In [None]:
# Persist the trained LightGBM booster.
with open('gbm_model.pickle', 'wb') as gbm_file:
    pickle.dump(
        gbm,
        gbm_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

**VALIDATION PART**

In [None]:
# Score the validation split with every model and store the scores in
# ``model_*`` columns (picked up by prefix in the evaluation cell).  The two
# scikit-learn style models use the second ``predict_proba`` column; the
# LightGBM booster's ``predict`` output is stored as-is.
val_scorers = {
    'model_nbsvm': lambda features: NbSvm.predict_proba(features)[:, 1],
    'model_lr': lambda features: lr.predict_proba(features)[:, 1],
    'model_gbm': lambda features: gbm.predict(features),
}
for score_column, score_fn in val_scorers.items():
    X_val[score_column] = score_fn(val_tfidf)

In [None]:
# Identity subgroups for which the bias metrics are computed.
identity_columns = [
    'male',
    'female',
    'homosexual_gay_or_lesbian',
    'christian',
    'jewish',
    'muslim',
    'black',
    'white',
    'psychiatric_or_mental_illness',
]

# Convert target and identity columns to booleans
def convert_to_bool(df, col_name):
    """Overwrite ``df[col_name]`` in place with ``value >= 0.5``.

    Values for which the comparison is false (including NaN) become False.

    Args:
        df: DataFrame to modify.
        col_name: Name of the numeric column to binarize.
    """
    is_positive = df[col_name] >= 0.5
    df[col_name] = is_positive.values
    
def convert_dataframe_to_bool(df):
    """Return a copy of ``df`` with the label and identity columns as booleans.

    The ``'target'`` column and every column listed in ``identity_columns``
    are binarized with ``convert_to_bool``; ``df`` itself is left untouched.

    Args:
        df: DataFrame containing ``'target'`` and all identity columns.

    Returns:
        A new DataFrame with those columns converted.
    """
    converted = df.copy()
    columns_to_binarize = ['target', *identity_columns]
    for column in columns_to_binarize:
        convert_to_bool(converted, column)
    return converted

# Boolean-labelled copy of the validation frame used by the bias metrics;
# X_val itself keeps its original numeric columns.
val_df = convert_dataframe_to_bool(X_val)

In [None]:
# Names of the three per-subgroup AUC metrics; used as record keys / column
# names in the bias-metrics table.
SUBGROUP_AUC = 'subgroup_auc'
BPSN_AUC = 'bpsn_auc'  # stands for background positive, subgroup negative
BNSP_AUC = 'bnsp_auc'  # stands for background negative, subgroup positive

def compute_auc(y_true, y_pred):
    """Return the ROC AUC of ``y_pred`` against ``y_true``, or NaN on failure.

    A ``ValueError`` from ``roc_auc_score`` is turned into ``np.nan`` so that
    one degenerate slice does not abort a whole metrics run.

    Args:
        y_true: True binary labels.
        y_pred: Predicted scores.

    Returns:
        The AUC, or ``np.nan`` if ``roc_auc_score`` raised ``ValueError``.
    """
    try:
        auc = roc_auc_score(y_true, y_pred)
    except ValueError:
        auc = np.nan
    return auc

def compute_subgroup_auc(df, subgroup, label, model_name):
    """Compute the AUC restricted to rows that belong to ``subgroup``.

    Args:
        df: DataFrame with boolean ``subgroup`` and ``label`` columns.
        subgroup: Name of the boolean subgroup-membership column.
        label: Name of the true-label column.
        model_name: Name of the column holding the model's scores.

    Returns:
        The value returned by ``compute_auc`` for the subgroup rows.
    """
    in_subgroup = df[subgroup]
    members = df[in_subgroup]
    labels = members[label]
    scores = members[model_name]
    return compute_auc(labels, scores)

def compute_bpsn_auc(df, subgroup, label, model_name):
    """Computes the AUC of the within-subgroup negative examples and the background positive examples.

    Args:
        df: DataFrame with boolean ``subgroup`` and ``label`` columns.
        subgroup: Name of the boolean subgroup-membership column.
        label: Name of the boolean true-label column.
        model_name: Name of the column holding the model's scores.

    Returns:
        The value returned by ``compute_auc`` on the combined rows.
    """
    subgroup_negative_examples = df[df[subgroup] & ~df[label]]
    non_subgroup_positive_examples = df[~df[subgroup] & df[label]]
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
    # the row order of the result is the same.
    examples = pd.concat([subgroup_negative_examples, non_subgroup_positive_examples])
    return compute_auc(examples[label], examples[model_name])

def compute_bnsp_auc(df, subgroup, label, model_name):
    """Computes the AUC of the within-subgroup positive examples and the background negative examples.

    Args:
        df: DataFrame with boolean ``subgroup`` and ``label`` columns.
        subgroup: Name of the boolean subgroup-membership column.
        label: Name of the boolean true-label column.
        model_name: Name of the column holding the model's scores.

    Returns:
        The value returned by ``compute_auc`` on the combined rows.
    """
    subgroup_positive_examples = df[df[subgroup] & df[label]]
    non_subgroup_negative_examples = df[~df[subgroup] & ~df[label]]
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
    # the row order of the result is the same.
    examples = pd.concat([subgroup_positive_examples, non_subgroup_negative_examples])
    return compute_auc(examples[label], examples[model_name])

def compute_bias_metrics_for_model(dataset,
                                   subgroups,
                                   model,
                                   label_col,
                                   include_asegs=False):
    """Computes per-subgroup metrics for all subgroups and one model.

    Args:
        dataset: DataFrame with boolean subgroup and label columns and a
            score column for ``model``.
        subgroups: Iterable of subgroup column names.
        model: Name of the column holding the model's scores.
        label_col: Name of the boolean true-label column.
        include_asegs: Unused; kept for interface compatibility.

    Returns:
        A DataFrame with one row per subgroup (its name, its size and the
        three bias AUCs), sorted by ascending subgroup AUC.  With no
        subgroups the result is an empty frame with the same columns.
    """
    records = []
    for subgroup in subgroups:
        record = {
            'subgroup': subgroup,
            'subgroup_size': len(dataset[dataset[subgroup]])
        }
        record[SUBGROUP_AUC] = compute_subgroup_auc(dataset, subgroup, label_col, model)
        record[BPSN_AUC] = compute_bpsn_auc(dataset, subgroup, label_col, model)
        record[BNSP_AUC] = compute_bnsp_auc(dataset, subgroup, label_col, model)
        records.append(record)
    # Explicit columns keep the sort key present even when ``records`` is
    # empty, so an empty subgroup list no longer raises KeyError.
    columns = ['subgroup', 'subgroup_size', SUBGROUP_AUC, BPSN_AUC, BNSP_AUC]
    # Sort by the shared constant rather than a repeated string literal.
    return pd.DataFrame(records, columns=columns).sort_values(SUBGROUP_AUC, ascending=True)


def calculate_overall_auc(df, model_name):
    """Return the ROC AUC of the ``model_name`` scores over every row of ``df``.

    The ``'target'`` column supplies the true labels.  Unlike
    ``compute_auc``, errors from ``roc_auc_score`` are not caught here.
    """
    return roc_auc_score(df['target'], df[model_name])

def power_mean(series, p):
    """Return the generalized (power) mean of ``series`` with exponent ``p``.

    Computed as ``(sum(x ** p) / n) ** (1 / p)``.  A NaN anywhere in
    ``series`` makes the result NaN.

    Args:
        series: Sequence of numbers (e.g. a pandas Series of AUCs).
        p: Non-zero exponent.

    Returns:
        The power mean as a float.
    """
    powered = np.power(series, p)
    mean_of_powers = sum(powered) / len(series)
    return np.power(mean_of_powers, 1 / p)

def get_final_metric(bias_df, overall_auc, POWER=-5, OVERALL_MODEL_WEIGHT=0.25):
    """Combine the overall AUC with the per-subgroup bias AUCs into one score.

    Each of the three bias-AUC columns of ``bias_df`` is reduced with a
    power mean of exponent ``POWER``; the three results are averaged into a
    bias score, which is then blended with ``overall_auc``.

    Args:
        bias_df: Table holding the three bias-AUC columns.
        overall_auc: AUC over the full dataset.
        POWER: Exponent of the power mean.
        OVERALL_MODEL_WEIGHT: Weight of ``overall_auc``; the bias score
            gets the remaining ``1 - OVERALL_MODEL_WEIGHT``.

    Returns:
        The weighted final metric.
    """
    per_metric_means = [
        power_mean(bias_df[metric_name], POWER)
        for metric_name in (SUBGROUP_AUC, BPSN_AUC, BNSP_AUC)
    ]
    bias_score = np.average(per_metric_means)
    overall_part = OVERALL_MODEL_WEIGHT * overall_auc
    bias_part = (1 - OVERALL_MODEL_WEIGHT) * bias_score
    return overall_part + bias_part

In [None]:
# Evaluate every prediction column (prefix ``model_``) on the validation set:
# print the column name followed by its final bias-aware metric.
model_cols = [name for name in val_df.columns if name.startswith('model_')]
for m_col in model_cols:
    bias_metrics_df = compute_bias_metrics_for_model(
        val_df, identity_columns, m_col, 'target')
    print(m_col)
    overall_auc = calculate_overall_auc(val_df, m_col)
    print(get_final_metric(bias_metrics_df, overall_auc))