## Classification Notes for Future Meeting (9/04/2024)
Notes for a future meeting. This notebook is an early exploration of classifying texts as human-written or AI-generated.

## Setup

In [5]:
import pathlib 
import pandas as pd 
from xgboost import XGBClassifier
from sklearn import metrics

import sys
# prepare_data is imported from the "src/classify" folder one level above the
# current working directory; add that folder to sys.path so the import below resolves.
sys.path.append(str(pathlib.Path.cwd().parents[0]/ "src" / "classify"))
from prepare_data import load_metrics, create_split

In [6]:
# Current working directory at run time; later cells locate the metrics folder
# relative to its parent, so the notebook must be started from its own directory.
path = pathlib.Path.cwd()

## Stories (Temp 1) Classification

In [15]:
# The metric files are read from the "metrics" folder one level above the working directory
datapath = path.parents[0].joinpath("metrics")

# Stories dataset, temp=1
stories1_df = load_metrics(
    datapath,
    dataset="stories",
    temp=1,
)

[INFO:] Loading data from human_metrics ...
[INFO:] Loading data from ai_metrics ...


In [16]:
# Split the stories data; no feature_cols is passed, i.e. the "all features" run
splits = create_split(
    stories1_df,
    random_state=129,
    val_test_size=0.15,
    outcome_col="is_human",
    verbose=False,
)

# Classifier with a fixed seed so repeated runs are comparable
clf = XGBClassifier(
    enable_categorical=True,
    use_label_encoder=False,
    random_state=129,
)
clf.fit(splits["X_train"], splits["y_train"])

# Evaluate on the validation split only
y_pred = clf.predict(splits["X_val"])
print(metrics.classification_report(splits["y_val"], y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3000
           1       1.00      1.00      1.00      1500

    accuracy                           1.00      4500
   macro avg       1.00      1.00      1.00      4500
weighted avg       1.00      1.00      1.00      4500



In [17]:
# Same experiment, restricted to a single feature column
features = ["doc_length"]
splits = create_split(
    stories1_df,
    feature_cols=features,
    outcome_col="is_human",
    val_test_size=0.15,
    random_state=129,
    verbose=False,
)

# `clf` is the classifier object created in the previous cell; it is fit again
# here on the reduced feature set.
clf.fit(splits["X_train"], splits["y_train"])

# Evaluate on the validation split only
y_pred = clf.predict(splits["X_val"])
print(metrics.classification_report(splits["y_val"], y_pred))

              precision    recall  f1-score   support

           0       0.85      0.94      0.89      3000
           1       0.86      0.67      0.75      1500

    accuracy                           0.85      4500
   macro avg       0.85      0.81      0.82      4500
weighted avg       0.85      0.85      0.85      4500



## Imbalanced Dataset? 

In [50]:
def check_imbalance(
    datapath,
    datasets=("stories", "mrpc", "dailydialog", "dailymail_cnn"),
    temp=1,
    random_state=129,
    val_test_size=0.15,
    outcome_col="is_human",
):
    """Tabulate the label counts of the train/val/test splits of each dataset.

    Every dataset is loaded with ``load_metrics``, split with ``create_split``,
    and the value counts of ``y_train``, ``y_val`` and ``y_test`` are collected.

    Args:
        datapath: Location of the metrics; passed unchanged to ``load_metrics``.
        datasets: Iterable of dataset names to inspect. Defaults to the four
            datasets examined in this notebook.
        temp: ``temp`` argument forwarded to ``load_metrics``
            (presumably the generation temperature -- confirm in prepare_data).
        random_state: Seed forwarded to ``create_split``.
        val_test_size: ``val_test_size`` argument forwarded to ``create_split``.
        outcome_col: Name of the label column, forwarded to ``create_split``.

    Returns:
        pd.DataFrame with one row per (dataset, split). Columns are ``Dataset``,
        ``Split`` and one integer column per label value holding its count.
        A label that does not occur in a split is reported as 0. If
        ``datasets`` is empty, an empty frame with only the ``Dataset`` and
        ``Split`` columns is returned.
    """
    split_names = ("train", "val", "test")

    # (dataset, split) -> label counts of that split
    counts = {}
    for dataset in datasets:
        df = load_metrics(datapath, dataset=dataset, temp=temp)
        splits = create_split(
            df,
            random_state=random_state,
            val_test_size=val_test_size,
            outcome_col=outcome_col,
            verbose=False,
        )
        for split in split_names:
            counts[(dataset, split)] = splits[f"y_{split}"].value_counts()

    # Nothing to tabulate: return an empty frame instead of failing further down
    if not counts:
        return pd.DataFrame(columns=["Dataset", "Split"])

    # Tuple keys become a two-level row index; label values become the columns
    imbalance_df = pd.DataFrame.from_dict(counts, orient="index")

    # A label missing from a split shows up as NaN; its count is really 0
    imbalance_df = imbalance_df.fillna(0).astype("int64")

    imbalance_df.index.names = ["Dataset", "Split"]
    return imbalance_df.reset_index()

In [51]:
# Label counts per dataset and split; label 0 is AI-generated, label 1 is human
imbalance_df = check_imbalance(datapath=datapath)
print(imbalance_df)

[INFO:] Loading data from human_metrics ...
[INFO:] Loading data from ai_metrics ...
[INFO:] Loading data from human_metrics ...
[INFO:] Loading data from ai_metrics ...
[INFO:] Loading data from human_metrics ...
[INFO:] Loading data from ai_metrics ...
[INFO:] Loading data from human_metrics ...
[INFO:] Loading data from ai_metrics ...
          Dataset  Split      0     1
0         stories  train  14000  7000
1         stories    val   3000  1500
2         stories   test   3000  1500
3            mrpc  train  10920  5460
4            mrpc    val   2340  1170
5            mrpc   test   2340  1170
6     dailydialog  train  14000  7000
7     dailydialog    val   3000  1500
8     dailydialog   test   3000  1500
9   dailymail_cnn  train   8400  4200
10  dailymail_cnn    val   1800   900
11  dailymail_cnn   test   1800   900
