In [1]:
import pandas as pd
import re
import emoji
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_data = pd.read_csv("data/train_data.csv")
test_data = pd.read_csv("data/test_data.csv")
val_data = pd.read_csv("data/val_data.csv")

In [3]:


def regex_clean(text):
    """Clean social media text while preserving important context.

    Steps, in order: @mentions become ``@USER``; hashtags lose their ``#``;
    URLs become ``URL``; runs of three or more identical non-digit characters
    are squeezed to two; emojis are converted to a text description; runs of
    whitespace are collapsed to a single space and the ends are trimmed.

    Args:
        text: Raw tweet text. ``None`` and float NaN (what pandas yields for
            an empty CSV cell) are treated as an empty tweet; any other
            non-string value is converted with ``str()`` first.

    Returns:
        str: The cleaned text (``''`` for missing input).
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    # Handle @mentions - replace with generic token
    text = re.sub(r'@\w+', '@USER', text)

    # Handle hashtags - keep the text, remove #
    text = re.sub(r'#(\w+)', r'\1', text)

    # Handle URLs. The match must start at a word boundary and contain
    # "://" or "www." so that elongated words ("awwww") or identifiers
    # ("httpclient") are not mistaken for links.
    text = re.sub(r'\b(?:https?://|www\.)\S+', 'URL', text, flags=re.IGNORECASE)

    # Handle repeated characters (sooooo -> soo). Digits are excluded so
    # that numbers such as 1000000 keep their value.
    text = re.sub(r'(\D)\1{2,}', r'\1\1', text)

    # Handle emojis - convert to text description
    text = emoji.demojize(text, delimiters=(" ", " "))

    # Clean extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [4]:
def anonymize_with_ner(ner_pipeline, text):
    """Use NER to identify and anonymize personal information.

    Person entities are replaced with ``[PERSON]`` and location entities with
    ``[LOCATION]``. Other entity types (e.g. organizations) are left as-is.

    A non-aggregating pipeline reports one entry per token / word-piece, so
    neighbouring pieces of the same entity type are merged first; otherwise a
    single name would produce several placeholders in a row.

    Args:
        ner_pipeline: Callable taking the text and returning an iterable of
            dicts with ``'start'`` and ``'end'`` character offsets plus either
            an ``'entity'`` label (such as ``'B-PER'`` / ``'I-PER'``) or an
            ``'entity_group'`` label from an aggregating pipeline.
        text: Text to anonymize.

    Returns:
        The text with person/location spans replaced. Blank or non-string
        input is returned unchanged without calling the pipeline.
    """
    # Nothing to anonymize; also avoids a model call on blank/missing input
    if not isinstance(text, str) or not text.strip():
        return text

    # Entity-type prefix (after stripping a B-/I- tag) -> replacement token
    placeholders = (('PER', '[PERSON]'), ('LOC', '[LOCATION]'))

    merged = []  # [start, end, placeholder] spans, ordered by start offset
    for entity in sorted(ner_pipeline(text), key=lambda x: x['start']):
        aggregated = 'entity_group' in entity
        label = entity['entity_group'] if aggregated else entity['entity']
        kind = label[2:] if label[:2] in ('B-', 'I-') else label
        placeholder = next(
            (token for prefix, token in placeholders if kind.startswith(prefix)),
            None,
        )
        if placeholder is None:
            continue

        start, end = entity['start'], entity['end']
        previous = merged[-1] if merged else None
        # A piece extends the previous span when it has the same type, does
        # not explicitly begin a new entity (B- tag / already-aggregated
        # group), and only whitespace (or nothing) separates the two.
        continues_previous = (
            previous is not None
            and previous[2] == placeholder
            and not aggregated
            and not label.startswith('B-')
            and not text[previous[1]:start].strip()
        )
        if continues_previous:
            previous[1] = max(previous[1], end)
        else:
            merged.append([start, end, placeholder])

    # Replace from the end so earlier offsets remain valid
    for start, end, placeholder in reversed(merged):
        text = text[:start] + placeholder + text[end:]

    return text

In [None]:
# Apply to your dataset
def preprocess_dataset(example):
    """Clean and anonymize the ``'tweet'`` field of a single record.

    Relies on the module-level ``ner_pipeline``, which must have been created
    before this function is first called.

    Args:
        example: Mapping with a ``'tweet'`` entry; it is updated in place.

    Returns:
        The same ``example`` object with ``'tweet'`` cleaned and anonymized.
    """
    example['tweet'] = regex_clean(example['tweet'])
    # anonymize_with_ner takes the pipeline as its first argument
    example['tweet'] = anonymize_with_ner(ner_pipeline, example['tweet'])
    return example


# Load NER model. Use the first GPU when one is available: running this
# model over every tweet on CPU is very slow.
ner_pipeline = pipeline("ner",
                       model="dbmdz/bert-large-cased-finetuned-conll03-english",
                       tokenizer="dbmdz/bert-large-cased-finetuned-conll03-english",
                       device=0 if torch.cuda.is_available() else -1)


# Process each dataset: regex cleaning first, then NER-based anonymization.
# The 'tweet' column of each frame is overwritten in place.
for split_name, split_frame in (("training", train_data),
                                ("validation", val_data),
                                ("test", test_data)):
    print(f"Processing {split_name} data...")
    split_frame['tweet'] = split_frame['tweet'].apply(regex_clean)
    split_frame['tweet'] = split_frame['tweet'].apply(
        lambda tweet: anonymize_with_ner(ner_pipeline, tweet))

# Save processed datasets
train_data.to_csv("data/processed_train_data.csv", index=False)
val_data.to_csv("data/processed_val_data.csv", index=False)
test_data.to_csv("data/processed_test_data.csv", index=False)

print("Processing complete! Saved to processed_*.csv files")

# NOTE: without a GPU this cell may not finish in reasonable time; run it on
# a GPU runtime such as Google Colab.

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForToke

Processing training data...
