Data exploration notebook


In [None]:
# Import packages
import os
import pandas as pd

# Locate project paths
current_dir = os.path.dirname(os.path.abspath('1.0-data-exploration.ipynb')) # hadr/notebooks
project_root = os.path.dirname(current_dir) # hadr
raw_path = os.path.join(project_root, 'data', 'raw') # hadr/data/raw
metadata_path = os.path.join(project_root, 'data', 'metadata') # hadr/data/metadata
train_raw_path = os.path.join(raw_path, 'train.csv') # hadr/data/raw/train.csv
test_raw_path = os.path.join(raw_path, 'test.csv') # hadr/data/raw/test.csv
validation_raw_path = os.path.join(raw_path, 'validation.csv') # hadr/data/raw/validation.csv

In [62]:
# Load data csvs
train_raw_data = pd.read_csv(train_raw_path)
test_raw_data = pd.read_csv(test_raw_path)
validation_raw_data = pd.read_csv(validation_raw_path)

In [63]:
# Optional: Inspect head of raw datasets
train_raw_data.head()
test_raw_data.head()
validation_raw_data.head()

Unnamed: 0,text,target,SOURCE_FILE,tweet_id,filename,event_type,event_type_detail,label
0,"RT @violetposie: all of the above, in that ord...",0,150k_archiveteam,1.15883e+18,150k_archiveteam,unknown,unknown,target_zero
1,"""After China warns India, Baloch and Sindhi le...",1,200k_crisis_datasets_benchmarks_v1.0_informati...,7.702281e+17,crisis_consolidated_informativeness_filtered_l...,unknown,unknown,informative
2,@Sollygc very - but I managed to get a stubby ...,0,22k_ACL_ICWSM_2018_datasets_acl_icwsm_clean.csv,2.965785e+17,2013_Queensland_Floods_train.tsv,flood,flood,not_relevant
3,"Sneak attack from coast to do, I see, the dead...",0,12k_tweets.csv_kaggle2_clean.csv,,12k_tweets.csv_kaggle2_clean.csv,unknown,unknown,unknown
4,Eastern and Western Attica declared in a state...,1,76k_HumAID_humaid_clean.csv,1.021746e+18,greece_wildfires_2018_train.tsv,fire,wild_fire,caution_and_advice


In [None]:
import re
import emoji as ej

# Handling tags and encoding emojis
def encode_hashtags(text):
    return re.sub(r'#(\w+)', lambda m: f" hashtag_{m.group(1)} ", text)

def encode_mentions(text):
    return re.sub(r'@(\w+)', lambda m: f" mention_{m.group(1)} ", text)

def encode_emojis(text):
    demojized = ej.demojize(text)
    return re.sub(r':([^:]+):', lambda m: f" emoji_{m.group(1)} ", demojized)

def expand_abbreviations(text):
    abbreviations = {
        r'\bimo\b': 'in my opinion',
        r'\brt\b': 'retweet',
        r'\bu\b': 'you',
        r'\bur\b': 'your',
        r'\bomg\b': 'oh my god',
        r'\blmao\b': 'laughing my ass off',
        r'\bidk\b': 'i do not know',
        r'\bsmh\b': 'shaking my head',
        r'\basap\b': 'as soon as possible',
        r'\bdm\b': 'direct message',
        r'\bbtw\b': 'by the way',
        r'\btbh\b': 'to be honest',
        r'\bfyi\b': 'for your information'
    }

    for pattern, replacement in abbreviations.items():
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
    
    return text

def expand_contractions(text):
    contractions = {
        r"\bcan't\b": "cannot",
        r"\bwon't\b": "will not",
        r"\bi'm\b": "i am",
        r"\bthey're\b": "they are",
        r"\bwe're\b": "we are",
        r"\byou're\b": "you are",
        r"\bit's\b": "it is",
        r"\bdon't\b": "do not",
        r"\bdoesn't\b": "does not",
        r"\bdidn't\b": "did not",
        r"\bhasn't\b": "has not",
        r"\bhaven't\b": "have not",
        r"\bhadn't\b": "had not",
        r"\bwouldn't\b": "would not",
        r"\bcouldn't\b": "could not",
        r"\bshouldn't\b": "should not",
        r"\bi've\b": "i have",
        r"\byou've\b": "you have",
        r"\bthey've\b": "they have",
        r"\bwho's\b": "who is",
        r"\bwhat's\b": "what is",
        r"\bthat's\b": "that is",
        r"\blet's\b": "let us",
        r"\bi'll\b": "i will",
        r"\byou'll\b": "you will",
        r"\bthey'll\b": "they will",
        r"\bit'll\b": "it will",
        r"\bthere's\b": "there is"
    }

    for pattern, replacement in contractions.items():
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

    return text

# Final data cleaner function
def clean_text(text):
    text = encode_emojis(text) # Encode all emojis

    text = encode_hashtags(text) # Encode all hashtags

    text = encode_mentions(text) # Encode all mentions (@username)

    text = expand_abbreviations(text) # Expand all abbreviations

    text = expand_contractions(text) # Expand all contractions
    
    text = re.sub(r'http\S+|www\S+|https\S+', '', text) # Remove all URLs

    text = re.sub(r'\S+@\S+', '', text)  # Remove emails

    text = re.sub(r"[^a-zA-Z0-9\s\.\,\!\?_'’]", '', text) # Keep apostrophes

    text = re.sub(r'([.,!?_])', r' \1 ', text) # Space out punctuation for clear tokenization:

    text = text.lower() # Convert to lowercase

    text = ' '.join(text.split()) # Remove extra whitespaces

    return text

In [94]:
import re
import emoji as ej

class TextCleaner:
    def __init__(self, remove_profanity=False, preserve_quotes=True):
        self.remove_profanity = remove_profanity
        self.preserve_quotes = preserve_quotes

        self.abbreviations = {
            r'\bimo\b': 'in my opinion',
            r'\brt\b': 'retweet',
            r'\bu\b': 'you',
            r'\bur\b': 'your',
            r'\bomg\b': 'oh my god',
            r'\blmao\b': 'laughing my ass off',
            r'\bidk\b': 'i do not know',
            r'\bsmh\b': 'shaking my head',
            r'\basap\b': 'as soon as possible',
            r'\bdm\b': 'direct message',
            r'\bbtw\b': 'by the way',
            r'\btbh\b': 'to be honest',
            r'\bfyi\b': 'for your information'
        }

        self.contractions = {
            r"\bcan't\b": "cannot",
            r"\bwon't\b": "will not",
            r"\bi'm\b": "i am",
            r"\bthey're\b": "they are",
            r"\bwe're\b": "we are",
            r"\byou're\b": "you are",
            r"\bit's\b": "it is",
            r"\bdon't\b": "do not",
            r"\bdoesn't\b": "does not",
            r"\bdidn't\b": "did not",
            r"\bhasn't\b": "has not",
            r"\bhaven't\b": "have not",
            r"\bhadn't\b": "had not",
            r"\bwouldn't\b": "would not",
            r"\bcouldn't\b": "could not",
            r"\bshouldn't\b": "should not",
            r"\bi've\b": "i have",
            r"\byou've\b": "you have",
            r"\bthey've\b": "they have",
            r"\bwho's\b": "who is",
            r"\bwhat's\b": "what is",
            r"\bthat's\b": "that is",
            r"\blet's\b": "let us",
            r"\bi'll\b": "i will",
            r"\byou'll\b": "you will",
            r"\bthey'll\b": "they will",
            r"\bit'll\b": "it will",
            r"\bthere's\b": "there is"
        }

        self.profanity_list = [r'\bshit\b', r'\bfuck\b', r'\bass\b', r'\bdamn\b']  # Extend as needed

    def encode_hashtags(self, text):
        return re.sub(r'#(\w+)', lambda m: f" hashtag_{m.group(1)} ", text)

    def encode_mentions(self, text):
        return re.sub(r'@(\w+)', lambda m: f" mention_{m.group(1)} ", text)

    def encode_emojis(self, text):
        demojized = ej.demojize(text)
        return re.sub(r':([^:]+):', lambda m: f" emoji_{m.group(1)} ", demojized)

    def expand_abbreviations(self, text):
        for pattern, replacement in self.abbreviations.items():
            text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
        return text

    def expand_contractions(self, text):
        for pattern, replacement in self.contractions.items():
            text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
        return text

    def remove_noise(self, text):
        text = re.sub(r'\b(?:https?://|www\.)\S+\b', '', text)
        text = re.sub(r'\S+@\S+', '', text)  # Remove emails
        return text

    def remove_profanity_words(self, text):
        for word in self.profanity_list:
            text = re.sub(word, '[censored]', text, flags=re.IGNORECASE)
        return text

    def clean(self, text):
        if not isinstance(text, str):
            return ''

        text = self.encode_emojis(text)
        text = self.encode_hashtags(text)
        text = self.encode_mentions(text)
        text = self.expand_abbreviations(text)
        text = self.expand_contractions(text)
        text = self.remove_noise(text)

        # Base pattern
        if self.preserve_quotes:
            pattern = r"[^a-zA-Z0-9\s.,!?_'’\"]"  # Preserve quotes
        else:
            pattern = r"[^a-zA-Z0-9\s.,!?_'’]"    # Remove quotes

        text = re.sub(pattern, '', text)

        # Space out punctuation
        if self.preserve_quotes:
            text = re.sub(r'([.,!?_"\'])', r' \1 ', text)
        else:
            text = re.sub(r'([.,!?_])', r' \1 ', text)

        if self.remove_profanity:
            text = self.remove_profanity_words(text)

        text = text.lower()
        text = ' '.join(text.split())

        return text

In [95]:
# Keep important columns and information
columns_to_keep = ['clean_text', 'sentiment', 'event_type', 'event_type_detail', 'label'] 
datasets = [train_raw_data, test_raw_data, validation_raw_data]

for i in range(len(datasets)):
    datasets[i].dropna(inplace=True) # Remove missing values
    datasets[i].rename(columns={'target': 'sentiment'}, inplace=True) # Rename column

    # Clean text
    cleaner = TextCleaner()
    datasets[i]['clean_text'] = datasets[i]['text'].apply(cleaner.clean)

    # Only keep columns that exist in both the dataframe and our columns_to_keep list
    available_columns = [col for col in columns_to_keep if col in datasets[i].columns]
    datasets[i] = datasets[i][available_columns]

train_cleaned_data = datasets[0]
test_cleaned_data = datasets[1]
validation_cleaned_data = datasets[2]

In [92]:
# Inspect head of cleaned data
train_cleaned_data.head()
test_cleaned_data.head()
validation_cleaned_data.head()

Unnamed: 0,clean_text,sentiment,event_type,event_type_detail,label
0,retweet mention _ violetposie emoji _ all of t...,0,unknown,unknown,target_zero
1,"after china warns india , baloch and sindhi le...",1,unknown,unknown,informative
2,mention _ sollygc very but i managed to get a ...,0,flood,flood,not_relevant
4,eastern and western attica declared in a state...,1,fire,wild_fire,caution_and_advice
5,let us all help those poor people in nepal . i...,1,earthquake,earthquake,relevant


In [96]:
# Export cleaned data to hadr/data/processed
datasets = {
    'train.csv': train_cleaned_data,
    'test.csv': test_cleaned_data,
    'validation.csv': validation_cleaned_data,
}

for data in datasets:
    file_path = os.path.join(project_root, 'data', 'processed', data)
    datasets[data].to_csv(file_path, index=False)