In [46]:
import pandas as pd 

In [238]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

category = 'humor' 

root_df = pd.read_csv(f'/Users/rt853/UoB-HICCS-2025/data/datasets/binary_datasets/brown/{category}_binary.csv').dropna()
augmented_df = pd.read_csv(f'/Users/rt853/UoB-HICCS-2025/data/datasets/binary_datasets/augmented/{category}_binary.csv')

def preprocess_aug_df(augmented_df):
    augmented_df['binary'] = 1
    augmented_df['source_text'] = augmented_df['text']
    augmented_df = augmented_df.drop(columns='text')
    augmented_df = augmented_df.rename(columns={'augmented_text': 'text'})
    augmented_df = augmented_df[['text', 'source_text', 'binary', 'category', 'line_id', 'document_id', 'new_document_id']]
    return augmented_df

def preprocess_text(text):
    # Initialize stemmer and stopwords
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))

    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenize and remove stopwords
    words = text.split()
    words = [word for word in words if word not in stop_words]
    # Apply stemming
    words = [stemmer.stem(word) for word in words]
    # Rejoin words into a single string
    text = ' '.join(words)
    return text

augmented_df = preprocess_aug_df(augmented_df)

root_df['text'] = root_df['text'].apply(preprocess_text)
augmented_df['text'] = augmented_df['text'].apply(preprocess_text)

In [239]:
from sklearn.model_selection import train_test_split



In [240]:
train_df, test_df = train_test_split(root_df, test_size=0.2, random_state=42)
train_w_aug = pd.concat([train_df, augmented_df], ignore_index=True)

print(f'Training set size: \n{train_df.binary.value_counts()}')
print(f'\n\nTraining with augmented set size: \n{train_w_aug.binary.value_counts()}')

Training set size: 
binary
0    42569
1      729
Name: count, dtype: int64


Training with augmented set size: 
binary
0    42569
1     1069
Name: count, dtype: int64


In [241]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

In [247]:
training_data = train_w_aug 

In [248]:
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2), 
    max_features=10000,
    stop_words='english').fit(training_data['text'])

In [249]:
clf = XGBClassifier(eval_metric='logloss', n_jobs=-1)

pipeline = Pipeline([
    ('tfidf', vectorizer),
    ('clf', clf)
])

pipeline.fit(training_data['text'], training_data['binary'])
predictions = pipeline.predict(test_df['text'])

In [250]:
from sklearn.metrics import classification_report, accuracy_score

In [251]:
report = classification_report(test_df['binary'], predictions, output_dict=False)
accuracy = accuracy_score(test_df['binary'], predictions)

print(f'Accuracy: {accuracy}')
print(f'Classification Report: \n{report}')

Accuracy: 0.9828175519630485
Classification Report: 
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     10623
           1       0.90      0.09      0.16       202

    accuracy                           0.98     10825
   macro avg       0.94      0.54      0.58     10825
weighted avg       0.98      0.98      0.98     10825



In [187]:
train_w_aug.binary.value_counts()

binary
0    40994
1     3699
Name: count, dtype: int64