In [None]:
!pip install -r req.txt
from google.colab import drive
drive.mount('/content/drive')
!apt install tesseract-ocr
!apt install libtesseract-dev
!pip install pytesseract
!pip install --upgrade urllib3
!pip install --upgrade pyopenssl
!pip install transformers

Collecting astor (from -r req.txt (line 2))
  Downloading astor-0.8.1-py2.py3-none-any.whl (27 kB)
Collecting boto3 (from -r req.txt (line 6))
  Downloading boto3-1.28.64-py3-none-any.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.8/135.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting botocore (from -r req.txt (line 7))
  Downloading botocore-1.31.64-py3-none-any.whl (11.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m77.5 MB/s[0m eta [36m0:00:00[0m
Collecting google-images-download (from -r req.txt (line 18))
  Downloading google_images_download-2.8.0.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting imageai (from -r req.txt (line 23))
  Downloading imageai-3.0.3-py3-none-any.whl (69 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.8/69.8 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting jedi (from -r req.txt (line 28))
  Downloadin

In [None]:
import zipfile
# Unpack the training archive stored on the mounted Drive into /content/data.
with zipfile.ZipFile('/content/drive/MyDrive/train_data.zip', mode='r') as archive:
    archive.extractall(path='/content/data')

In [None]:
# import sys
# sys.path.append('/content/drive/MyDrive/testing/hate-speech-detection/')
import torch
import torchvision
from torch import nn
from torchvision import transforms
from torch.utils.data import DataLoader

from pytorch_transformers import *

import time

from tensorboardX import SummaryWriter

from tqdm import tqdm
import numpy as np

In [None]:
class MultimodalClassifier(nn.Module):
    """Single-output classifier over concatenated image, text and keyword features.

    Each enabled modality contributes one 2-D feature tensor; the tensors are
    concatenated along dim 1 and passed through a small MLP that produces one
    raw (un-squashed) value per sample.

    Args:
        image_feat_model: module called on the image batch to obtain image features.
        text_feat_model: module called on the token batch; element ``[0]`` of its
            return value is averaged over dim 1 (assumed to be the sequence
            axis -- TODO confirm against the text model in use).
        TOTAL_FEATURES: width of the concatenated feature vector, i.e. the input
            size of the first linear layer.
        USE_IMAGE, USE_TEXT, USE_HATE_WORDS: a modality is used when its flag equals 1.
        hidden_size: width of the two hidden layers of the MLP.
        device: stored on the instance as ``self.device``; ``forward`` does not
            need it because features stay on the device of their inputs.
    """

    def __init__(self, image_feat_model, text_feat_model, TOTAL_FEATURES,
                 USE_IMAGE, USE_TEXT, USE_HATE_WORDS, hidden_size, device):

        super(MultimodalClassifier, self).__init__()
        self.im_feat_model = image_feat_model
        self.text_feat_model = text_feat_model

        self.USE_IMAGE = USE_IMAGE
        self.USE_TEXT = USE_TEXT
        self.USE_HATE_WORDS = USE_HATE_WORDS

        self.TOTAL_FEATURES = TOTAL_FEATURES

        # No final activation: the network emits a raw value per sample.
        self.classifier = nn.Sequential(
            nn.Linear(TOTAL_FEATURES, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        )

        self.device = device

    def forward(self, image, text, hate_words):
        """Return the classifier output for one batch.

        Inputs belonging to a disabled modality are ignored and may be ``None``.

        Raises:
            ValueError: if none of the three modalities is enabled.
        """
        feature_parts = []

        if self.USE_IMAGE == 1:
            feature_parts.append(self.im_feat_model(image))

        if self.USE_TEXT == 1:
            last_hidden_states = self.text_feat_model(text)[0]
            # Average over dim 1 (same quantity as sum / size(1)).
            feature_parts.append(last_hidden_states.mean(dim=1))

        if self.USE_HATE_WORDS == 1:
            # Cast so integer keyword indicators can feed the float linear layer.
            feature_parts.append(hate_words.float())

        if not feature_parts:
            raise ValueError(
                "MultimodalClassifier needs at least one of USE_IMAGE, "
                "USE_TEXT or USE_HATE_WORDS set to 1")

        features = torch.cat(feature_parts, dim=1)

        return self.classifier(features)


In [None]:
def accuracy(output, target):
    """Count the positions where prediction and target agree after thresholding.

    Both tensors are binarised at 0.5 (values >= 0.5 count as positive); the
    result is a scalar tensor holding the number of matching positions.
    """
    threshold = 0.5
    agreement = torch.eq(output >= threshold, target >= threshold)
    return agreement.sum()
def validAccuracy(output, target):
    """Return an element-wise boolean tensor marking correct binary predictions.

    Prediction and target are each binarised at 0.5; an element is True where
    the two binarised values coincide. Unlike ``accuracy`` nothing is summed.
    """
    predicted_positive = output.ge(0.5)
    actual_positive = target.ge(0.5)
    return predicted_positive == actual_positive

# Maximum number of [path, squared-error] entries kept in each ranked list below.
TOP_SIZE = 20
# Rebuilt by validate() on every call: the entries with the largest (top_losses)
# and the smallest (fewer_losses) squared error seen during that validation pass.
top_losses = []
fewer_losses = []

def getLossFromTuple(item):
    """Sort key for [path, loss] entries: return the loss stored at index 1."""
    loss_index = 1
    return item[loss_index]

def validate(dataloader_valid, criterion, device, model=None):
    """Evaluate a model on a validation dataloader.

    Side effect: rebuilds the module-level ``top_losses`` / ``fewer_losses``
    lists with up to ``TOP_SIZE`` ``[image_path, squared_error]`` entries each
    (largest and smallest squared errors respectively).

    Args:
        dataloader_valid: iterable of batches; each batch is indexed with the
            keys "image", "bert_tokens", "hate_words", "image_paths" and "class".
        criterion: loss called as ``criterion(pred, target)``; its value is
            weighted by the number of target elements in the batch.
        device: device the batch tensors are moved to.
        model: model to evaluate. When omitted, the notebook-level
            ``full_model`` is used so existing three-argument calls keep working.

    Returns:
        ``(valid_acc, valid_loss)``: fraction of correctly thresholded
        predictions and the element-weighted mean of ``criterion``.

    Raises:
        ValueError: if the dataloader yields no samples.
    """
    global top_losses
    global fewer_losses

    if model is None:
        # Backward-compatible fallback to the model defined by the training cell.
        model = full_model

    top_losses = []
    fewer_losses = []

    loss = 0
    acc = 0
    i = 0  # number of target elements seen so far

    with torch.no_grad():
        for batch in dataloader_valid:

            image_batch = batch["image"].to(device)
            text_batch = batch["bert_tokens"].to(device)
            hate_words_batch = batch["hate_words"].to(device)
            paths_batch = batch["image_paths"]

            # Add a trailing dimension so targets line up with the (N, 1) predictions.
            target_batch = batch["class"].to(device).unsqueeze(1)

            pred = model(image_batch, text_batch, hate_words_batch)

            distances = (pred - target_batch) ** 2

            acc += validAccuracy(pred, target_batch).sum()

            for j, x in enumerate(distances):
                top_losses.append([paths_batch[j], x])
                fewer_losses.append([paths_batch[j], x])

            # Re-rank and truncate after every batch so the lists never hold
            # more than TOP_SIZE plus one batch of entries.
            top_losses.sort(key=getLossFromTuple, reverse=True)
            fewer_losses.sort(key=getLossFromTuple, reverse=False)

            top_losses = top_losses[:TOP_SIZE]
            fewer_losses = fewer_losses[:TOP_SIZE]

            size = target_batch.numel()
            loss += criterion(pred, target_batch) * size

            i += size

    if i == 0:
        # Without this guard the code below fails on `int.float()`.
        raise ValueError("validate() received a dataloader with no samples")

    valid_acc = acc.float()/i
    valid_mse = loss/i
    print('acc', acc)
    print('i', i)
    return valid_acc, valid_mse



In [None]:
import test
if __name__ == '__main__':
    # Training entry point: builds the feature extractors and the multimodal
    # classifier, trains for N_EPOCHS and, after every epoch, validates and
    # keeps the checkpoint with the best validation accuracy.

    # Local import: only this cell needs `os` (to create the output directories).
    import os

    HIDDEN_SIZE = 50
    N_EPOCHS = 100
    BATCH_SIZE = 25

    # Epoch index from which the VGG16 classifier layers are also optimised.
    # 999 is larger than N_EPOCHS, so they are never updated in this configuration.
    UNFREEZE_FEATURES = 999

    # Modality switches (1 = use, 0 = skip); they also size the classifier input.
    USE_IMAGE = 1
    USE_TEXT = 1
    USE_HATE_WORDS = 0

    TRAIN_METADATA_HATE = "hateMemesList.txt.train"
    TRAIN_METADATA_GOOD = "redditMemesList.txt.train"
    VALID_METADATA_HATE = "hateMemesList.txt.valid"
    VALID_METADATA_GOOD = "redditMemesList.txt.valid"
    BASE_PATH = "/content/data/train_data"

    MODEL_SAVE = "models/classifier.pt"
    RESULTS_FILE = "results/accuracys"

    logname = "logs_final_BS25/multimodal3"

    # checkpoint = "models/unsupervised_pretrain.pt"
    checkpoint = None

    # torch.save() and open() do not create missing parent directories.
    for output_dir in (os.path.dirname(MODEL_SAVE), os.path.dirname(RESULTS_FILE)):
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    start_time = time.time()
    writer = SummaryWriter("logs/" + logname)

    # Configuring CUDA / CPU execution
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print('device: ', device)

    # Keywords (deprecated)
    hate_list = [
            'ali baba',
            'allah',
            'abbo',
            'black',
            'bomb',
            'dynamite',
            'jew',
            'nazi',
            'niglet',
            'nigger',
            'nigga',
            'paki',
        ]


    # Get image descriptor: VGG16 with the last three layers of its classifier
    # removed, so the network returns features instead of class scores.
    VGG16_features = torchvision.models.vgg16(pretrained=True)
    VGG16_features.classifier = VGG16_features.classifier[:-3]

    VGG16_features.to(device)

    # To embed text, we use a PyTorch implementation of BERT from
    # https://github.com/huggingface/pytorch-pretrained-BERT
    # Get Textual Tokenizer
    tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
    # Get Textual Embedding.
    bert_model = BertModel.from_pretrained("bert-base-multilingual-cased")
    bert_model.eval()
    bert_model.to(device)

    # Width of each feature group; the classifier input is the sum of the enabled ones.
    IMAGE_FEATURES = 4096
    TEXT_FEATURES = 768
    HATE_WORDS = len(hate_list)

    IMAGE_AND_TEXT_FEATURES = IMAGE_FEATURES * USE_IMAGE + TEXT_FEATURES * USE_TEXT\
                              + HATE_WORDS * USE_HATE_WORDS

    full_model = MultimodalClassifier(VGG16_features, bert_model,
                                      IMAGE_AND_TEXT_FEATURES, USE_IMAGE,
                                      USE_TEXT, USE_HATE_WORDS, HIDDEN_SIZE,
                                      device)

    if checkpoint is not None:
        full_model.load_state_dict(torch.load(checkpoint))

    full_model.to(device)

    # Training and validation currently share the same preprocessing pipeline.
    transform = transforms.Compose([test.Rescale((224, 224)),
                                    test.HateWordsVector(hate_list),
                                    test.Tokenize(tokenizer),
                                    test.ToTensor()])

    transformValid = transforms.Compose([test.Rescale((224, 224)),
                                    test.HateWordsVector(hate_list),
                                    test.Tokenize(tokenizer),
                                    test.ToTensor()])

    train_dataset = test.ImagesDataLoader(TRAIN_METADATA_GOOD, TRAIN_METADATA_HATE, BASE_PATH, transform)
    valid_dataset = test.ImagesDataLoader(VALID_METADATA_GOOD, VALID_METADATA_HATE, BASE_PATH, transformValid)

    DATASET_LEN = len(train_dataset)

    dataloader_train = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=test.custom_collate)
    dataloader_valid = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=test.custom_collate)


    criterion = nn.MSELoss()

    # Two optimizers: one for the MLP head (stepped every iteration) and one for
    # the VGG16 classifier layers (stepped only from epoch UNFREEZE_FEATURES on).
    optimizer = torch.optim.Adam(full_model.classifier.parameters())
    features_optimizer = torch.optim.Adam(VGG16_features.classifier.parameters())


    iteration = 0

    # Any real accuracy is >= 0, so the first validated epoch is always saved.
    best_acc = torch.tensor(-1.0, dtype=torch.float32)
    full_model.text_feat_model.train()
    full_model.im_feat_model.train()

    for i in range(N_EPOCHS):

        epoch_init = time.time()
        pbar = tqdm(total=DATASET_LEN)
        for batch in dataloader_train:
            image_batch = batch["image"].to(device)
            text_batch = batch["bert_tokens"].to(device)
            hate_words_batch = batch["hate_words"].to(device)

            target_batch = batch["class"].to(device)

            # Add a trailing dimension so targets line up with the (N, 1) predictions.
            target_batch = target_batch.unsqueeze(1)

            optimizer.zero_grad()
            features_optimizer.zero_grad()

            pred = full_model(image_batch, text_batch, hate_words_batch)

            loss = criterion(pred, target_batch)

            loss.backward()

            optimizer.step()

            if i >= UNFREEZE_FEATURES:
                features_optimizer.step()

            writer.add_scalar('train/mse', loss, iteration*BATCH_SIZE)
            iteration += 1

            # Advance by the real batch size so the final, smaller batch does
            # not push the bar past DATASET_LEN.
            pbar.update(image_batch.size(0))

        # Close the bar; otherwise one unfinished bar per epoch is left behind.
        pbar.close()

        epoch_end = time.time()

        print("Epoch time elapsed:", epoch_end - epoch_init)

        print("Starting Validation")

        valid_init = time.time()

        # eval()/train() recurse into the image and text sub-models.
        full_model.eval()
        valid_acc, valid_loss = validate(dataloader_valid, criterion, device)
        full_model.train()

        # CPU copy used for best-model bookkeeping and logging.
        valid_acc_tensor = valid_acc.detach().cpu().to(torch.float32)

        if valid_acc_tensor > best_acc:
            print("Saving full model to " + MODEL_SAVE + ".best")
            torch.save(full_model.state_dict(), MODEL_SAVE + '.best')
            best_acc = valid_acc_tensor

            with open(MODEL_SAVE + ".best.log", "w") as logfile:
                logfile.write("best_acc:" + str(valid_acc_tensor.item())+"\n" + "best epoch: " + str(i) + "\n")

            # validate() has just refreshed fewer_losses / top_losses for this epoch.
            with open(RESULTS_FILE, "w") as accs:
                accs.write("Smaller losses from best acc (epoch : " + str(i) + ")\n")

                for x in fewer_losses:
                    accs.write(str(x[0]) + "\t" + str(x[1]) + "\n")

                accs.write("Top Loss:\n")
                for x in top_losses:
                    accs.write(str(x[0]) + "\t" + str(x[1]) + "\n")

        valid_end = time.time()
        print("Time in validation:", valid_end - valid_init)
        writer.add_scalar('validation/valid_accuracy', valid_acc, i+1)
        writer.add_scalar('validation/valid_mse', valid_loss, i+1)

    end_time = time.time()
    print("Elapsed Time:", end_time - start_time)

    writer.close()



device:  cpu


Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth
100%|██████████| 528M/528M [00:07<00:00, 74.5MB/s]
100%|██████████| 995526/995526 [00:00<00:00, 8137262.65B/s]
100%|██████████| 625/625 [00:00<00:00, 1119794.96B/s]
100%|██████████| 714314041/714314041 [00:31<00:00, 22423515.85B/s]


FileNotFoundError: ignored

In [None]:
import torch
import cv2
import pytesseract
from torchvision import transforms
from PIL import Image
from transformers import BertTokenizer
# from train import MultimodalClassifier  # Import the MultimodalClassifier class from train.py

# Single-image inference with the trained checkpoint.
# Relies on names created by the training cell (VGG16_features, bert_model,
# IMAGE_AND_TEXT_FEATURES, USE_* flags, HIDDEN_SIZE) and on MultimodalClassifier.

# Define the path to your trained model
model_path = "/content/drive/MyDrive/classifier.pt.best"

# Load the trained model
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = MultimodalClassifier(VGG16_features, bert_model,
                                      IMAGE_AND_TEXT_FEATURES, USE_IMAGE,
                                      USE_TEXT, USE_HATE_WORDS, HIDDEN_SIZE,
                                      device)  # Provide the same arguments as used during training
model.load_state_dict(torch.load(model_path, map_location=device))
# The freshly constructed classifier head lives on the CPU until moved explicitly.
model.to(device)
model.eval()  # Set the model to evaluation mode

# Define the image preprocessing transformations
# NOTE(review): training preprocesses with test.Rescale / test.ToTensor; confirm
# that those apply the same normalisation as the pipeline below.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load and preprocess the input image
image_path = "/content/26514.jpg"
input_image = cv2.imread(image_path)
if input_image is None:
    # cv2.imread reports an unreadable or missing file by returning None.
    raise FileNotFoundError("Could not read image: " + image_path)
# OpenCV loads BGR; convert once so both OCR and the model receive RGB.
input_image = cv2.cvtColor(input_image, cv2.COLOR_BGR2RGB)
text = pytesseract.image_to_string(input_image, config='--oem 1')
input_image = Image.fromarray(input_image)  # Convert to PIL Image
input_image = transform(input_image)
input_image = input_image.unsqueeze(0).to(device)  # Add batch dimension and move to device

# Preprocess the text and convert it into a tensor
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
tokens = tokenizer.tokenize(text)
tokens = tokens[:48]  # leave room for the two special tokens added below
tokens = ["[CLS]"] + tokens + ["[SEP]"]
input_ids = tokenizer.convert_tokens_to_ids(tokens)
text_tensor = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to(device)
# Perform inference
with torch.no_grad():
    output = model(input_image, text=text_tensor, hate_words=None)

# The network is trained with MSELoss on its raw output and validated by
# thresholding that raw output at 0.5 (see accuracy / validAccuracy), so the
# same rule is applied here. Passing the output through a sigmoid and rounding
# it first would move the decision boundary away from the validated one.
predicted_score = output.item()
# Raw score clamped to [0, 1] for display; it is not a calibrated probability.
predicted_probability = min(max(predicted_score, 0.0), 1.0)
# Classify as non-offensive or offensive
# NOTE(review): assumes the positive class (target >= 0.5) is the offensive one.
if predicted_score >= 0.5:
    classification = "Offensive"
else:
    classification = "Non-offensive"

# Print the result
# print("Predicted Probability:", predicted_probability)
print("Classification:", classification)


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Classification: Non-offensive


In [13]:
import torch
import cv2
import pytesseract
from torchvision import transforms
from PIL import Image
from transformers import BertTokenizer

# Components that are expensive to build (checkpoint load, tokenizer download)
# are created on first use and then reused; re-running this cell resets them.
_INFERENCE_CACHE = {}


def _load_inference_components():
    """Build once, then return, the device, model, image transform and tokenizer."""
    if not _INFERENCE_CACHE:
        # Define the path to your trained model
        model_path = "/content/drive/MyDrive/classifier.pt.best"

        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        model = MultimodalClassifier(VGG16_features, bert_model,
                                     IMAGE_AND_TEXT_FEATURES, USE_IMAGE,
                                     USE_TEXT, USE_HATE_WORDS, HIDDEN_SIZE,
                                     device)  # Provide the same arguments as used during training
        model.load_state_dict(torch.load(model_path, map_location=device))
        # The freshly constructed classifier head lives on the CPU until moved.
        model.to(device)

        # NOTE(review): training preprocesses with test.Rescale / test.ToTensor;
        # confirm those apply the same normalisation as this pipeline.
        transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

        tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

        _INFERENCE_CACHE.update(device=device, model=model,
                                transform=transform, tokenizer=tokenizer)
    return _INFERENCE_CACHE


def classify_image_from_image(image):
    """Classify one image as "Offensive" or "Non-offensive".

    Text is read from the image with Tesseract OCR and fed, together with the
    image itself, to the trained MultimodalClassifier.

    Args:
        image: image array in OpenCV's BGR channel order, e.g. the value
            returned by ``cv2.imread``.

    Returns:
        The string "Offensive" or "Non-offensive".

    Raises:
        ValueError: if ``image`` is None (for instance when ``cv2.imread``
            could not read the file).
    """
    if image is None:
        raise ValueError("image is None; nothing to classify")

    components = _load_inference_components()
    device = components["device"]
    model = components["model"]
    # The feature sub-models are shared with the training cell, which may have
    # switched them back to train mode, so evaluation mode is set on every call.
    model.eval()

    # Convert once so both OCR and the model receive RGB data.
    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    text = pytesseract.image_to_string(rgb_image, config='--oem 1')
    image_tensor = components["transform"](Image.fromarray(rgb_image))
    image_tensor = image_tensor.unsqueeze(0).to(device)  # Add batch dimension and move to device

    # Preprocess the text and convert it into a tensor
    tokenizer = components["tokenizer"]
    tokens = tokenizer.tokenize(text)[:48]  # leave room for the two special tokens
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    text_tensor = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to(device)

    # Perform inference
    with torch.no_grad():
        output = model(image_tensor, text=text_tensor, hate_words=None)

    # The network is trained with MSELoss on its raw output and validated by
    # thresholding that raw output at 0.5 (see accuracy / validAccuracy), so the
    # same rule is applied here instead of sigmoid + rounding, which would move
    # the decision boundary away from the validated one.
    # NOTE(review): assumes the positive class (target >= 0.5) is the offensive one.
    predicted_score = output.item()
    if predicted_score >= 0.5:
        classification = "Offensive"
    else:
        classification = "Non-offensive"

    return classification

# Example usage: classify one image read from disk. cv2.imread yields the BGR
# array that classify_image_from_image converts with COLOR_BGR2RGB.
image_path = "/content/26514.jpg"
input_image = cv2.imread(image_path)
if input_image is None:
    # cv2.imread reports an unreadable or missing file by returning None.
    raise FileNotFoundError("Could not read image: " + image_path)
result = classify_image_from_image(input_image)
print("Classification:", result)

Classification: Non-offensive


In [None]:
!pip install gradio

Collecting gradio
  Downloading gradio-3.48.0-py3-none-any.whl (20.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.3/20.3 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.103.2-py3-none-any.whl (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.3/66.3 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.1.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==0.6.1 (from gradio)
  Downloading gradio_client-0.6.1-py3-none-any.whl (299 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m299.2/299.2 kB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx (from gradio)
  Downloading httpx-0.25.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [14]:
import gradio as gr
def _classify_gradio_image(image):
    """Adapter between Gradio's image input and classify_image_from_image.

    Gradio's "image" input delivers an RGB array, whereas
    classify_image_from_image converts its argument with COLOR_BGR2RGB, i.e. it
    expects BGR. The channels are therefore swapped here before the call.
    """
    if image is None:
        # Submitted without an uploaded image.
        return "No image provided"
    return classify_image_from_image(cv2.cvtColor(image, cv2.COLOR_RGB2BGR))


iface = gr.Interface(fn=_classify_gradio_image, inputs="image", outputs="text")
iface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://e342c506c486904568.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


