<a href="https://colab.research.google.com/github/roseandgold/HatefulMemesProject/blob/main/Data%20Preprocessing%20and%20EDA/Preprocess_Meme_Image_and_Text_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Step 1: Import Libraries

In [1]:
%%capture
!pip install sentence-transformers

In [2]:
# Import libraries
import pandas as pd
import torch
import torchvision
from PIL import Image
import numpy as np
import pickle
from tqdm import tqdm
from matplotlib import pyplot as plt
from sentence_transformers import SentenceTransformer
import random
%matplotlib inline

## Step 2: Read in the Data

In [3]:
# Attach Google Drive to this Colab runtime so the dataset files are reachable
from google.colab import drive

mount_point = '/content/drive'
# force_remount=True asks Colab to mount again even if a previous mount exists
drive.mount(mount_point, force_remount=True)

Mounted at /content/drive


In [4]:
# Load the training annotations; the file is JSON Lines (one record per line)
train_file = '/content/drive/MyDrive/SIADS 697 - Capstone/SIADS 697 - Capstone/hateful_memes/train.jsonl'
train_data = pd.read_json(path_or_buf=train_file, lines=True)

# Preview the first five rows
train_data.head(n=5)

Unnamed: 0,id,img,label,text
0,42953,img/42953.png,0,its their character not their color that matters
1,23058,img/23058.png,0,don't be afraid to love again everyone is not ...
2,13894,img/13894.png,0,putting bows on your pet
3,37408,img/37408.png,0,i love everything and everybody! except for sq...
4,82403,img/82403.png,0,"everybody loves chocolate chip cookies, even h..."


## Step 3: Get a random sample of both positive (label 1) and negative (label 0) memes

In [5]:
# Split the annotations by class: label 1 -> positive, label 0 -> negative
is_positive = train_data['label'] == 1
is_negative = train_data['label'] == 0
positive = train_data.loc[is_positive]
negative = train_data.loc[is_negative]

# Draw 60 rows from each class; the fixed seed keeps the draw repeatable
positive_sample, negative_sample = (
    frame.sample(n=60, random_state=42) for frame in (positive, negative)
)

# Stack the two samples (positives first) under a fresh 0..119 index.
# NOTE: this rebinds train_data, so from here on it holds only the sample.
train_data = pd.concat([positive_sample, negative_sample], ignore_index=True)
train_data.head()

Unnamed: 0,id,img,label,text
0,52479,img/52479.png,1,best thing about an ethiopian blowjob?... you...
1,37615,img/37615.png,1,i thought of something funnier than 9/11 the h...
2,81975,img/81975.png,1,best thing about an ethiopian blowjob?... you ...
3,78610,img/78610.png,1,the difference between moderate muslim and rad...
4,38625,img/38625.png,1,a german ss soldier punishing a small jewish b...


In [8]:
# Map each sampled training image path to its (label, text) pair.
# The keys keep their 'img/' prefix because they are later appended to the
# dataset directory to open the image files.
training_img = {
    img_path: (label, text)
    for img_path, label, text in zip(
        train_data['img'], train_data['label'], train_data['text']
    )
}

## Step 3b: Transform the Images and Text

In [6]:
# Build the image preprocessing pipeline and the pretrained ResNet-50 encoder
dim = 224  # side length (in pixels) every image is resized to
image_transform = torchvision.transforms.Compose(
    [
        # Force a fixed dim x dim size so every image yields the same tensor shape
        torchvision.transforms.Resize(
            size = (dim, dim)
        ),
        # PIL image -> float tensor
        torchvision.transforms.ToTensor(),
        # Channel-wise normalization with the mean/std that torchvision
        # documents for its ImageNet-pretrained models
        torchvision.transforms.Normalize(
            mean = (0.485, 0.456, 0.406),
            std = (0.229, 0.224, 0.225)
        ),
    ]
)

# The network is used only as a fixed feature extractor, so put it in eval
# mode: otherwise BatchNorm normalizes with per-batch statistics and keeps
# updating its running averages on every forward pass, which makes the
# extracted features depend on the order the images are processed in.
# (.eval() returns the module itself, so the assignment also keeps the model
# repr from being echoed as the cell output.)
vision_module = torchvision.models.resnet50(pretrained=True).eval()

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


  0%|          | 0.00/97.8M [00:00<?, ?B/s]

In [7]:
# Sentence encoder used to turn each meme's text into an embedding vector
text_model_name = 'paraphrase-mpnet-base-v2'
text_transformer = SentenceTransformer(text_model_name)

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.70k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/594 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
# Encode every sampled meme: resize/normalize the image, run it through the
# ResNet, and embed its text with the sentence transformer.
directory = '/content/drive/MyDrive/SIADS 697 - Capstone/SIADS 697 - Capstone/hateful_memes/'
train_images = []  # one ResNet output tensor (with a leading batch axis of 1) per meme
train_labels = []  # label of each meme, in the same order
train_text = []    # sentence embedding of each meme's text, in the same order

# Inference only: eval mode makes BatchNorm use its stored statistics instead
# of per-image ones, and no_grad stops autograd from recording (and keeping
# alive) a computation graph for each of the forward passes.
vision_module.eval()
with torch.no_grad():
  for file, (label, text) in tqdm(training_img.items()):
    # Close the file handle as soon as the pixels are decoded
    with Image.open(directory + file) as raw_img:
      img = raw_img.convert('RGB')
    img = image_transform(img).unsqueeze(dim=0) # resize + add batch axis
    img = vision_module(img) # resnet
    train_images.append(img)
    train_labels.append(label)
    train_text.append(text_transformer.encode(text))

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
100%|██████████| 120/120 [01:09<00:00,  1.72it/s]


## Step 4: Combine the image and text information

In [11]:
# One feature vector per meme: the ResNet output (singleton axes squeezed away,
# detached from autograd, converted to NumPy) followed by the text embedding
# stored at the same position.
combined = [
  np.concatenate((image_out.squeeze().detach().numpy(), train_text[pos]))
  for pos, image_out in enumerate(train_images)
]

len(combined)

120

## Step 5: Pickle the data

In [None]:
def save_pickle(filepath, obj):
    """Serialize ``obj`` to ``filepath`` with pickle and print a confirmation.

    Args:
        filepath: Destination path of the pickle file. Parent directories
            that do not exist yet are created.
        obj: Any picklable Python object.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            opened for writing.
        pickle.PicklingError: If ``obj`` cannot be pickled.
    """
    import os  # local import so the helper does not rely on the import cell

    # A fresh Drive folder may not contain the target directory yet; create it
    # instead of failing after the expensive feature extraction has finished.
    parent = os.path.dirname(filepath)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(filepath, 'wb') as filehandler:
        pickle.dump(obj, filehandler)
    print('{} saved'.format(filepath))

# Output locations on Drive for the combined feature vectors and their labels
directory = '/content/drive/MyDrive/SIADS 697 - Capstone/SIADS 697 - Capstone/hateful_memes/pickles/'
train_file = directory + 'training_combo.p'
train_labels_file = directory + 'training_labels_combo.p'

# Write the features first, then the labels
for target_path, payload in (
    (train_file, combined),
    (train_labels_file, train_labels),
):
    save_pickle(target_path, payload)

/content/drive/MyDrive/SIADS 697 - Capstone/SIADS 697 - Capstone/hateful_memes/pickles/training_combo.p saved
/content/drive/MyDrive/SIADS 697 - Capstone/SIADS 697 - Capstone/hateful_memes/pickles/training_labels_combo.p saved
