In [None]:
import sys; sys.path.append('..')
from osp import *
# Let DataFrame displays show up to 200 characters per cell before truncating.
pd.set_option('display.max_colwidth', 200)

In [None]:
def classify_data(data, target_col='_target', cv=5, verbose=True, balance=False, normalize=False, random_state=None, **kwargs):
    """Cross-validate a logistic-regression classifier on a table of features.

    Every column of ``data`` except ``target_col`` is treated as a feature: it
    is coerced to numeric (unparseable values become NaN) and NaNs are then
    filled with 0. Class labels come from ``target_col``; when that column is
    absent it is derived from the index as the text before the first ``'/'``
    of each index label.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per item. It is copied, never modified.
    target_col : str
        Name of the label column.
    cv : int or cross-validation splitter
        Forwarded to ``sklearn.model_selection.cross_val_predict``.
    verbose : bool
        Print progress, accuracy and the classification report.
    balance : bool
        Downsample every class to the size of the smallest class first.
    normalize : bool
        Accepted for interface compatibility only; this function uses the
        feature values as given and does not rescale them.
    random_state : int or None
        Seed for the balancing sample, so a run can be reproduced.
    **kwargs
        Ignored.

    Returns
    -------
    results_df : pandas.DataFrame
        Indexed by ``id`` (the row labels of the data actually used) with
        columns ``true_label``, ``pred_label``, ``test_label``, ``confidence``,
        ``correct``, ``accuracy`` and ``support``. Predictions are the
        out-of-fold ones from cross-validation.
    weights_df : pandas.DataFrame
        Coefficients of the model refitted on all rows. Two columns
        (``feature``, ``weight``, sorted by descending weight) for at most two
        classes, otherwise ``feature`` plus one column per class.
    model : sklearn.linear_model.LogisticRegression
        The model fitted on all rows.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import classification_report, accuracy_score
    from sklearn.model_selection import cross_val_predict
    import numpy as np

    _data = data.copy()
    if target_col not in _data.columns:
        # No explicit label column: use the index prefix before the first '/'.
        _data[target_col] = _data.index.str.split('/').str[0]

    if balance:
        min_target_size = _data[target_col].value_counts().min()
        _data = _data.groupby(target_col).sample(n=min_target_size, random_state=random_state)
        if verbose:
            print(f'Balanced data: {min_target_size} samples per target')

    # Clean the features: non-numeric cells become NaN, then NaN becomes 0.
    feature_df = _data.drop(columns=[target_col]).apply(pd.to_numeric, errors='coerce')
    feature_names = feature_df.columns
    X_data = feature_df.fillna(0).values
    y_data = _data[target_col].fillna('').values

    model = LogisticRegression(class_weight='balanced', max_iter=1000)

    if verbose:
        print(f"Running {cv}-fold Cross-Validation on {len(X_data)} samples...")

    # One cross-validation pass is enough: the predicted label is the class
    # with the highest out-of-fold probability. cross_val_predict orders the
    # probability columns by sorted class label, which is what np.unique gives.
    y_probas = cross_val_predict(model, X_data, y_data, cv=cv, n_jobs=1, method='predict_proba')
    cv_classes = np.unique(y_data)
    y_pred = cv_classes[np.argmax(y_probas, axis=1)]

    # Confidence is the maximum probability across classes
    confidence_scores = np.max(y_probas, axis=1)

    accuracy = accuracy_score(y_data, y_pred)
    if verbose:
        print(f"\nClassifier Results ({cv}-fold CV):")
        print(f"Accuracy: {accuracy:.4f}")
        print(classification_report(y_data, y_pred))

    # Fit on all data to get final feature weights
    model.fit(X_data, y_data)

    if len(model.classes_) <= 2:
        # Binary case: coef_ is (1, n_features)
        weights_df = pd.DataFrame({
            'feature': feature_names,
            'weight': model.coef_[0]
        }).sort_values('weight', ascending=False)
    else:
        # Multi-class case: coef_ is (n_classes, n_features)
        weights_df = pd.DataFrame(model.coef_.T, columns=model.classes_, index=feature_names)
        weights_df.index.name = 'feature'
        weights_df = weights_df.reset_index()

    # str() so that non-string class labels (e.g. integers) can be joined too.
    test_label = ' / '.join(map(str, model.classes_))
    results_df = pd.DataFrame({
        'id': _data.index,
        'true_label': y_data,
        'pred_label': y_pred,
        'test_label': test_label,
        'confidence': confidence_scores,
        'correct': (y_pred == y_data),
        'accuracy': accuracy,
        'support': _data.shape[0],
    }).set_index('id')
    return results_df, weights_df, model

In [None]:
# Corpus metadata requested with min_year=1925 and max_year=2025,
# followed by one subset per discipline.
df_meta = get_corpus_metadata(min_year=1925, max_year=2025)
df_phil = df_meta.loc[df_meta['discipline'].eq('Philosophy')]
df_lit = df_meta.loc[df_meta['discipline'].eq('Literature')]

In [None]:
def get_balanced_cv_data(groups_train, target_col='discipline', balance=True, normalize=True, random_state=None):
    """Build one feature table holding a training ("CV") set and all other rows.

    Parameters
    ----------
    groups_train : sequence of (name, query) pairs
        Each ``query`` is a ``DataFrame.query`` expression evaluated against
        the corpus metadata; the matching ids form one training group. The
        ``name`` is not used here. Any number of groups is accepted.
    target_col : str
        Metadata field used as the class label (read via
        ``get_text_metadata(id).get(target_col)``).
    balance : bool
        Downsample the training rows so every label has as many rows as the
        rarest label.
    normalize : bool
        Forwarded to both ``get_feat_counts`` and ``get_all_feats`` so that
        training and remaining rows are computed the same way.
    random_state : int or None
        Seed for the balancing sample.

    Returns
    -------
    pandas.DataFrame
        Feature columns plus ``_target`` (label) and ``_type`` (``'CV'`` for
        training rows, ``'Unseen'`` for every other labelled row). Rows whose
        label is missing are dropped.

    Raises
    ------
    ValueError
        If ``groups_train`` is empty.
    """
    if len(groups_train) == 0:
        raise ValueError('groups_train must contain at least one (name, query) pair')

    df_meta = get_corpus_metadata()

    def _lookup_targets(ids):
        # One metadata lookup per row id; a missing field yields None.
        return [get_text_metadata(i).get(target_col) for i in tqdm(ids)]

    # Features for the ids selected by each group's metadata query.
    group_frames = [
        get_feat_counts(df_meta.query(query).index.tolist(), normalize=normalize)
        for _name, query in groups_train
    ]

    df_scores_cv = pd.concat(group_frames).assign(_type='CV')
    df_scores_cv['_target'] = _lookup_targets(df_scores_cv.index)
    df_scores_cv = df_scores_cv.dropna(subset=['_target'])
    if balance and not df_scores_cv.empty:
        n_per_target = df_scores_cv['_target'].value_counts().min()
        df_scores_cv = df_scores_cv.groupby('_target').sample(n=n_per_target, random_state=random_state)

    # Same normalize setting as the training rows, so both parts are comparable.
    df_scores_all = get_all_feats(normalize=normalize).fillna(0)
    df_scores_all['_target'] = _lookup_targets(df_scores_all.index)
    df_scores_all = df_scores_all.dropna(subset=['_target'])

    # Everything not in the (possibly downsampled) training set counts as unseen.
    # A mask, unlike drop(), does not fail when a training id is absent here.
    in_cv = df_scores_all.index.isin(df_scores_cv.index)
    df_scores_rest = df_scores_all.loc[~in_cv].assign(_type='Unseen')

    df_scores = pd.concat([df_scores_cv, df_scores_rest])
    return df_scores

In [None]:
# Training groups as (name, metadata query) pairs: one per discipline,
# each limited to 2000 <= year < 2025.
groups_train = [
    (f'C21 {discipline}', f'discipline=="{discipline}" & 2000<=year<2025')
    for discipline in ('Philosophy', 'Literature')
]

# df_scores = get_balanced_cv_data(groups_train)
# df_scores.groupby(['_type','_target']).size()

In [None]:
# cv_preds, cv_feats, cv_model = classify_data(
#     df_scores.query('_type=="CV"').drop(columns=['_type']),
#     target_col='_target',
#     cv=5,
#     verbose=True,
#     balance=True
# )
# cv_preds

In [42]:
def classify_then_predict(groups_train, target_col='discipline', balance=True, num_runs=1):
    """Train on the groups' rows, then score every labelled row in the corpus.

    For each run, the feature table from ``get_balanced_cv_data`` is split by
    its ``_type`` column: the ``'CV'`` rows are cross-validated and used to fit
    a model with ``classify_data``, and that model then predicts class
    probabilities for all rows of the table.

    Parameters
    ----------
    groups_train : sequence of (name, query) pairs
        Forwarded to ``get_balanced_cv_data`` and ``get_df_feats_with_pos_mdw``.
    target_col : str
        Metadata field used as the class label.
    balance : bool
        Whether classes are downsampled to equal size, both when building the
        training set and inside ``classify_data``.
    num_runs : int
        Number of independent repetitions; results are concatenated.

    Returns
    -------
    df_preds : pandas.DataFrame
        Indexed by ``id`` with one ``prob<k>`` column per class (in
        ``model.classes_`` order), ``pred_label``, ``true_label``,
        ``test_label`` and ``predict_type`` (``'cv'`` for rows the model was
        trained on, ``'unseen'`` otherwise).
    df_feats
        The result of ``get_df_feats_with_pos_mdw`` applied to the
        concatenated feature weights of all runs.

    Raises
    ------
    ValueError
        If ``num_runs`` is smaller than 1.
    """
    if num_runs < 1:
        raise ValueError('num_runs must be at least 1')

    l_preds = []
    l_feats = []
    for _ in range(num_runs):
        df_scores = get_balanced_cv_data(groups_train, target_col=target_col, balance=balance)

        _cv_preds, cv_feats, cv_model = classify_data(
            df_scores.query('_type=="CV"').drop(columns=['_type']),
            target_col='_target',
            cv=5,
            verbose=True,
            balance=balance,
        )

        # Same cleaning as in classify_data (numeric coercion, NaN -> 0), and a
        # plain array because the model was fitted on one.
        X_all = (
            df_scores.drop(columns=['_type', '_target'])
            .apply(pd.to_numeric, errors='coerce')
            .fillna(0)
            .values
        )
        new_probs = cv_model.predict_proba(X_all)

        classes = list(cv_model.classes_)
        prob_cols = [f'prob{k}' for k in range(1, len(classes) + 1)]
        df_new_probs = pd.DataFrame(
            new_probs,
            columns=prob_cols,
            index=pd.Index(df_scores.index, name='id'),
        )
        # Most probable class per row (first class wins ties).
        df_new_probs['pred_label'] = [classes[k] for k in new_probs.argmax(axis=1)]
        df_new_probs['true_label'] = df_scores['_target'].tolist()
        df_new_probs['test_label'] = ' / '.join(map(str, classes))
        # Keep training rows distinguishable from unseen ones.
        df_new_probs['predict_type'] = df_scores['_type'].str.lower().tolist()

        l_preds.append(df_new_probs)
        l_feats.append(cv_feats)

    df_preds = pd.concat(l_preds)
    df_feats = pd.concat(l_feats)
    return df_preds, get_df_feats_with_pos_mdw(df_feats, groups_train)

In [43]:
# Single run with the default target column and balancing.
df_preds, df_feats = classify_then_predict(groups_train=groups_train)

100%|██████████| 12789/12789 [00:00<00:00, 40805.35it/s]
100%|██████████| 33665/33665 [00:00<00:00, 42564.84it/s]


Balanced data: 3110 samples per target
Running 5-fold Cross-Validation on 6220 samples...

Classifier Results (5-fold CV):
Accuracy: 0.6471
              precision    recall  f1-score   support

  Literature       0.64      0.66      0.65      3110
  Philosophy       0.65      0.63      0.64      3110

    accuracy                           0.65      6220
   macro avg       0.65      0.65      0.65      6220
weighted avg       0.65      0.65      0.65      6220



In [None]:
# Mean of every numeric column, grouped by predicted label.
(
    df_preds
    .groupby('pred_label')
    .mean(numeric_only=True)
)

In [None]:
# Display the feature table returned by classify_then_predict.
df_feats

In [None]:
# The text id is the part of each prediction id before the first '__'.
df_preds['text_id'] = [slice_id.split('__')[0] for slice_id in df_preds.index]

# Attach corpus metadata to every prediction row, then export to Excel.
odf = (
    df_preds
    .rename_axis('slice_id')
    .reset_index()
    .merge(get_corpus_metadata(), left_on='text_id', right_on='id', how='left')
)
odf.to_excel('../data/preds2.xlsx')

In [None]:
# Display the merged predictions + metadata table that was exported above.
odf