<a href="https://colab.research.google.com/github/sellerstx1982/multimodal_damage_identification/blob/Vargas/project_3_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Imported Libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from pathlib import Path
from sklearn.metrics import classification_report, balanced_accuracy_score, accuracy_score, classification_report
from PIL import Image, ImageFile
import os
import matplotlib.pyplot as plt
import zipfile
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, GlobalAveragePooling2D, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adam
import pickle

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Loaded in Dataset and Created Lists for Data Storage

In [5]:
ImageFile.LOAD_TRUNCATED_IMAGES = True
main_folder = "/content/drive/MyDrive/Project_3/multimodal+damage+identification+for+humanitarian+computing/multimodal"

In [6]:
# Lists to store the data
images = []
texts = []
labels = []

# Preprocessing function for images
def preprocess_image(image_path):
    try:
        image = Image.open(image_path)
        image = image.resize((250, 250))  # Example size, adjust accordingly
        image = np.array(image) / 255.0  # Normalize pixel values
        return np.expand_dims(image, axis=0)  # Add batch dimension
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        return None

# Function to load text from a file
def load_text_file(text_path):
    try:
        with open(text_path, 'r', encoding='utf-8') as file:
            return file.read()
    except Exception as e:
        print(f"Error reading text file {text_path}: {e}")
        return None

In [7]:
# Iterate over each category subfolder in the main folder
for subfolder in os.listdir(main_folder):
    subfolder_path = os.path.join(main_folder, subfolder)
    if os.path.isdir(subfolder_path):
        # The image and text folders inside each category folder
        image_folder = os.path.join(subfolder_path, 'images')
        text_folder = os.path.join(subfolder_path, 'text')

        # Check if both the image and text folders exist
        if os.path.exists(image_folder) and os.path.exists(text_folder):
            for image_file in os.listdir(image_folder):
                if image_file.endswith(('jpg', 'jpeg', 'png')):
                    image_path = os.path.join(image_folder, image_file)
                    # Construct the corresponding text file name
                    text_file = image_file.rsplit('.', 1)[0] + '.txt'
                    text_path = os.path.join(text_folder, text_file)

                    # Check if the text file exists and load both image and text
                    if os.path.exists(text_path):
                        image = preprocess_image(image_path)
                        if image is not None:
                            text = load_text_file(text_path)
                            if text is not None:
                                # Extract the label by splitting at the underscore and taking the first part
                                label = image_file.split('_', 1)[0]
                                images.append(image)
                                texts.append(text)
                                labels.append(label)  # Append the prefix before the first underscore as label

In [8]:
type(images)

list

In [9]:
print(len(images))
print(len(texts))
print(len(labels))

5831
5831
5831


In [10]:
df = pd.DataFrame({'image': images, 'text': texts, 'label': labels})
df.head()

Unnamed: 0,image,text,label
0,"[[[[0.82352941 0.90588235 0.97254902], [0.8235...",#isiscrimes \nIsis dogs crucified a men accuse...,isiscrimes
1,"[[[[0.99607843 0.99607843 0.99607843], [0.9960...",Part 2:\n\nIn January of this year Abu Rayyan ...,isiscrimes
2,"[[[[0.99607843 0.99607843 0.99607843], [0.9960...",Wake up.ppl see the reality to what ISIS is al...,isiscrimes
3,"[[[[0.58823529 0.44705882 0.34509804], [0.5803...",New image from akhtarin in aleppo #Syria yeste...,isiscrimes
4,"[[[[0.4627451 0.38039216 0.2745098 ], [0.4627...",Syria #Aleppo ISIS continue on with their behe...,isiscrimes


In [11]:
df.tail()

Unnamed: 0,image,text,label
5826,"[[[[0.68235294 0.80392157 0.98431373], [0.6784...",Happy Thanksgiving Peeps! Much to be thankful ...,hurricaneirma
5827,"[[[[0.74901961 0.80784314 0.82745098], [0.7490...",🌥 #hurricaneirma #vistrong 🇻🇮,hurricaneirma
5828,"[[[[0.62745098 0.69019608 0.79215686], [0.6470...",Hurricane Irma was here,hurricaneirma
5829,"[[[[0.26666667 0.26666667 0.16862745], [0.5137...",#goodbyehouse #childhoomhome #hurricaneirma #h...,hurricaneirma
5830,"[[[[0.14117647 0.38823529 0.41960784], [0.1686...",Arabella’s Trying To Get On The Roof Courtesy ...,hurricaneirma


In [12]:
df['label_2'] = df['label'].replace({'destruction': 'destruction',
                                   'drought': 'drought',
                                   'buildingcollapse': 'earthquake',
                                   'destroyedbuilding': 'earthquake',
                                   'earthquake': 'earthquake',
                                   'earthquakenepal': 'earthquake',
                                   'buildingfire': 'fire',
                                   'forestfire': 'fire',
                                   'forrestfire': 'fire',
                                   'wildfires': 'fire',
                                   'accrafloods': 'flood',
                                   'floodwater': 'flood',
                                   'hurricaneharvey': 'hurricane',
                                   'hurricaneirma': 'hurricane',
                                   'hurricanematthew': 'hurricane',
                                   'hurricanesandy': 'hurricane',
                                   'sandydamage': 'hurricane',
                                   'treefalling': 'hurricane',
                                   'treesfalling': 'hurricane',
                                   'windstorm': 'hurricane',
                                   'landslide': 'landslide',
                                   'disaster': 'naturaldisaster',
                                   'disasters': 'naturaldisaster',
                                   'naturaldamage': 'naturaldisaster',
                                   'naturaldisaster': 'naturaldisaster',
                                   'ad': 'non-damage',
                                   'building': 'non-damage',
                                   'cars': 'non-damage',
                                   'food': 'non-damage',
                                   'nature': 'non-damage',
                                   'terrorattack': 'war',
                                   'isiscrimes': 'war',
                                   'yemencrisis': 'war',
                                   'suicidebombing': 'war',
                                   'victimsofwar': 'war',
                                   'war': 'war',
                                   'syriawarcrimes': 'war',
                                   'warsyria': 'war',
                                   'yemencrisis': 'war',
                                   'wreckedcar': 'wreckedcar',
                                   })

In [13]:
df['label_3'] = df['label'].replace({'destruction': 'damage',
                                   'drought': 'damage',
                                   'buildingcollapse': 'damage',
                                   'destroyedbuilding': 'damage',
                                   'earthquake': 'damage',
                                   'earthquakenepal': 'damage',
                                   'buildingfire': 'damage',
                                   'forestfire': 'damage',
                                   'forrestfire': 'damage',
                                   'wildfires': 'damage',
                                   'accrafloods': 'damage',
                                   'floodwater': 'damage',
                                   'hurricaneharvey': 'damage',
                                   'hurricaneirma': 'damage',
                                   'hurricanematthew': 'damage',
                                   'hurricanesandy': 'damage',
                                   'sandydamage': 'damage',
                                   'treefalling': 'damage',
                                   'treesfalling': 'damage',
                                   'windstorm': 'damage',
                                   'landslide': 'damage',
                                   'disaster': 'damage',
                                   'disasters': 'damage',
                                   'naturaldamage': 'damage',
                                   'naturaldisaster': 'damage',
                                   'ad': 'non-damage',
                                   'building': 'non-damage',
                                   'cars': 'non-damage',
                                   'food': 'non-damage',
                                   'nature': 'non-damage',
                                   'terrorattack': 'damage',
                                   'isiscrimes': 'damage',
                                   'yemencrisis': 'damage',
                                   'suicidebombing': 'damage',
                                   'victimsofwar': 'damage',
                                   'war': 'damage',
                                   'syriawarcrimes': 'damage',
                                   'warsyria': 'damage',
                                   'yemencrisis': 'damage',
                                   'wreckedcar': 'damage',

                                   })

In [14]:
df['label_4'] = df['label'].replace({'destruction': 'non-fire',
                                   'drought': 'non-fire',
                                   'buildingcollapse': 'non-fire',
                                   'destroyedbuilding': 'non-fire',
                                   'earthquake': 'non-fire',
                                   'earthquakenepal': 'non-fire',
                                   'buildingfire': 'fire',
                                   'forestfire': 'fire',
                                   'forrestfire': 'fire',
                                   'wildfires': 'fire',
                                   'accrafloods': 'non-fire',
                                   'floodwater': 'non-fire',
                                   'hurricaneharvey': 'non-fire',
                                   'hurricaneirma': 'non-fire',
                                   'hurricanematthew': 'non-fire',
                                   'hurricanesandy': 'non-fire',
                                   'sandydamage': 'non-fire',
                                   'treefalling': 'non-fire',
                                   'treesfalling': 'non-fire',
                                   'windstorm': 'non-fire',
                                   'landslide': 'non-fire',
                                   'disaster': 'non-fire',
                                   'disasters': 'non-fire',
                                   'naturaldamage': 'non-fire',
                                   'naturaldisaster': 'non-fire',
                                   'ad': 'non-fire',
                                   'building': 'non-fire',
                                   'cars': 'non-fire',
                                   'food': 'non-fire',
                                   'nature': 'non-fire',
                                   'terrorattack': 'non-fire',
                                   'isiscrimes': 'non-fire',
                                   'yemencrisis': 'non-fire',
                                   'suicidebombing': 'non-fire',
                                   'victimsofwar': 'non-fire',
                                   'war': 'non-fire',
                                   'syriawarcrimes': 'non-fire',
                                   'warsyria': 'non-fire',
                                   'yemencrisis': 'non-fire',
                                   'wreckedcar': 'non-fire',

                                   })

In [15]:
df.head()

Unnamed: 0,image,text,label,label_2,label_3,label_4
0,"[[[[0.82352941 0.90588235 0.97254902], [0.8235...",#isiscrimes \nIsis dogs crucified a men accuse...,isiscrimes,war,damage,non-fire
1,"[[[[0.99607843 0.99607843 0.99607843], [0.9960...",Part 2:\n\nIn January of this year Abu Rayyan ...,isiscrimes,war,damage,non-fire
2,"[[[[0.99607843 0.99607843 0.99607843], [0.9960...",Wake up.ppl see the reality to what ISIS is al...,isiscrimes,war,damage,non-fire
3,"[[[[0.58823529 0.44705882 0.34509804], [0.5803...",New image from akhtarin in aleppo #Syria yeste...,isiscrimes,war,damage,non-fire
4,"[[[[0.4627451 0.38039216 0.2745098 ], [0.4627...",Syria #Aleppo ISIS continue on with their behe...,isiscrimes,war,damage,non-fire


In [16]:
df.tail()

Unnamed: 0,image,text,label,label_2,label_3,label_4
5826,"[[[[0.68235294 0.80392157 0.98431373], [0.6784...",Happy Thanksgiving Peeps! Much to be thankful ...,hurricaneirma,hurricane,damage,non-fire
5827,"[[[[0.74901961 0.80784314 0.82745098], [0.7490...",🌥 #hurricaneirma #vistrong 🇻🇮,hurricaneirma,hurricane,damage,non-fire
5828,"[[[[0.62745098 0.69019608 0.79215686], [0.6470...",Hurricane Irma was here,hurricaneirma,hurricane,damage,non-fire
5829,"[[[[0.26666667 0.26666667 0.16862745], [0.5137...",#goodbyehouse #childhoomhome #hurricaneirma #h...,hurricaneirma,hurricane,damage,non-fire
5830,"[[[[0.14117647 0.38823529 0.41960784], [0.1686...",Arabella’s Trying To Get On The Roof Courtesy ...,hurricaneirma,hurricane,damage,non-fire


In [17]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
ad,1935
building,641
wreckedcar,400
nature,253
earthquake,226
sandydamage,225
floodwater,188
buildingcollapse,173
isiscrimes,163
forrestfire,131


Created Pickle File for Testing

In [18]:
pkl_df = df.sample(n=1000, random_state=42)
pkl_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 4436 to 4841
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   image    1000 non-null   object
 1   text     1000 non-null   object
 2   label    1000 non-null   object
 3   label_2  1000 non-null   object
 4   label_3  1000 non-null   object
 5   label_4  1000 non-null   object
dtypes: object(6)
memory usage: 54.7+ KB


In [19]:
# Save the data to pkl files
with open('disaster_data.pkl', 'wb') as f:
    pickle.dump(pkl_df, f)

In [20]:
def load_data_from_pkl(pkl_filename):
    with open(pkl_filename, 'rb') as f:
        pkl_df = pickle.load(f)
    return pkl_df

In [21]:
pkl_df = load_data_from_pkl('/content/drive/MyDrive/Project_3/disaster_data.pkl')

In [22]:
pkl_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 4436 to 4841
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   image    1000 non-null   object
 1   text     1000 non-null   object
 2   label    1000 non-null   object
 3   label_2  1000 non-null   object
 4   label_3  1000 non-null   object
 5   label_4  1000 non-null   object
dtypes: object(6)
memory usage: 54.7+ KB
