Data exploration notebook


In [20]:
# Import packages
from pathlib import Path
import pandas as pd
import sys

# Locate the project root, taking the current working directory to be `notebooks/`
current_dir = Path.cwd()
project_root = current_dir.parent

# Put the project root at the front of sys.path (only once) so that
# project-local packages can be imported from this notebook
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

# Main data directories, both living under <project_root>/data
data_root = project_root / 'data'
raw_path = data_root / 'raw'
metadata_path = data_root / 'metadata'

# One raw CSV per split, all stored in the raw data directory
train_raw_path, test_raw_path, validation_raw_path = (
    raw_path / file_name
    for file_name in ('train.csv', 'test.csv', 'validation.csv')
)

In [21]:
# Read each raw split (train, test, validation) into its own DataFrame
train_raw_data, test_raw_data, validation_raw_data = (
    pd.read_csv(csv_path)
    for csv_path in (train_raw_path, test_raw_path, validation_raw_path)
)

In [22]:
# Optional: Inspect head of raw datasets
# A cell only renders its last bare expression, so the three heads are passed to
# display() (available by default in a Jupyter/IPython kernel) to show all of them.
display(
    train_raw_data.head(),
    test_raw_data.head(),
    validation_raw_data.head(),
)

Unnamed: 0,text,target,SOURCE_FILE,tweet_id,filename,event_type,event_type_detail,label
0,"RT @violetposie: all of the above, in that ord...",0,150k_archiveteam,1.15883e+18,150k_archiveteam,unknown,unknown,target_zero
1,"""After China warns India, Baloch and Sindhi le...",1,200k_crisis_datasets_benchmarks_v1.0_informati...,7.702281e+17,crisis_consolidated_informativeness_filtered_l...,unknown,unknown,informative
2,@Sollygc very - but I managed to get a stubby ...,0,22k_ACL_ICWSM_2018_datasets_acl_icwsm_clean.csv,2.965785e+17,2013_Queensland_Floods_train.tsv,flood,flood,not_relevant
3,"Sneak attack from coast to do, I see, the dead...",0,12k_tweets.csv_kaggle2_clean.csv,,12k_tweets.csv_kaggle2_clean.csv,unknown,unknown,unknown
4,Eastern and Western Attica declared in a state...,1,76k_HumAID_humaid_clean.csv,1.021746e+18,greece_wildfires_2018_train.tsv,fire,wild_fire,caution_and_advice


In [23]:
# Keep important columns and information
columns_to_keep = ['clean_text', 'sentiment', 'event_type', 'event_type_detail', 'label']

# Import the text_utils module, which provides the TextCleaner class.
# This import relies on the project root having been added to sys.path.
from src.data.preprocessing import text_utils


def clean_dataset(raw_data: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of one raw split without modifying `raw_data`.

    Steps, in order:
      1. Drop every row that contains a missing value.
      2. Rename the `target` column to `sentiment`.
      3. Add a `clean_text` column by applying `TextCleaner.clean` to `text`.
      4. Keep only the columns listed in `columns_to_keep` that are present.

    Parameters
    ----------
    raw_data : pd.DataFrame
        A raw split as loaded from CSV; must contain a `text` column.

    Returns
    -------
    pd.DataFrame
        A new frame holding the available `columns_to_keep` columns, in the
        order given by `columns_to_keep`.

    Raises
    ------
    KeyError
        If `raw_data` has no `text` column.
    """
    cleaner = text_utils.TextCleaner()

    # NOTE(review): dropna() without `subset` removes rows with a missing value
    # in ANY column, including columns that are discarded below - confirm this
    # is the intended filter.
    cleaned = (
        raw_data
        .dropna()  # Remove missing values (returns a new frame)
        .rename(columns={'target': 'sentiment'})  # Rename column
        .assign(clean_text=lambda frame: frame['text'].apply(cleaner.clean))  # Clean text
    )

    # Only keep columns that exist in both the dataframe and our columns_to_keep list
    available_columns = [col for col in columns_to_keep if col in cleaned.columns]
    return cleaned[available_columns]


# Build the cleaned splits from the raw ones; the raw frames are left untouched,
# so this cell can be re-run safely and earlier cells keep showing true raw data.
datasets = [
    clean_dataset(raw_data)
    for raw_data in (train_raw_data, test_raw_data, validation_raw_data)
]

train_cleaned_data, test_cleaned_data, validation_cleaned_data = datasets

In [24]:
# Inspect head of cleaned data
# A cell only renders its last bare expression, so the three heads are passed to
# display() (available by default in a Jupyter/IPython kernel) to show all of them.
display(
    train_cleaned_data.head(),
    test_cleaned_data.head(),
    validation_cleaned_data.head(),
)

Unnamed: 0,clean_text,sentiment,event_type,event_type_detail,label
0,retweet mention _ violetposie emoji _ all of t...,0,unknown,unknown,target_zero
1,""" after china warns india , baloch and sindhi ...",1,unknown,unknown,informative
2,mention _ sollygc very but i managed to get a ...,0,flood,flood,not_relevant
4,eastern and western attica declared in a state...,1,fire,wild_fire,caution_and_advice
5,let us all help those poor people in nepal . i...,1,earthquake,earthquake,relevant


In [25]:
# Export cleaned data to hadr/data/processed
# Maps each output file name to the cleaned split written under that name.
datasets = {
    'train.csv': train_cleaned_data,
    'test.csv': test_cleaned_data,
    'validation.csv': validation_cleaned_data,
}

# to_csv does not create missing parent directories, so make sure the output
# directory exists first (e.g. on a fresh checkout).
processed_path = project_root / 'data' / 'processed'
processed_path.mkdir(parents=True, exist_ok=True)

for file_name, cleaned_data in datasets.items():
    # index=False: do not write the DataFrame row index as a CSV column
    cleaned_data.to_csv(processed_path / file_name, index=False)