In [1]:
%run helpers.py

In [2]:
import open_clip

In [None]:
import numpy as np
from langchain_experimental.open_clip import OpenCLIPEmbeddings
from PIL import Image

# Smoke test: embed a single training image with OpenCLIP and display the result.
# meme_training_image is not defined in this notebook — presumably it comes
# from helpers.py and resolves a file name to an image URI/path; confirm there.
uri = meme_training_image('110005.jpeg')

# clip_embd is kept as a notebook-level name because the vector store cell
# further down passes it as its `embeddings` argument.
clip_embd = OpenCLIPEmbeddings(model_name="ViT-g-14", checkpoint="laion2b_s34b_b88k")
# embed_image takes a list of URIs; a one-element list is passed here.
emb = clip_embd.embed_image([uri])
emb

In [6]:
from langchain_core.documents import Document
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector

import os

# See docker command above to launch a postgres instance with pgvector enabled.
# The connection string embeds credentials, so it can be overridden through the
# PGVECTOR_CONNECTION environment variable instead of editing the notebook.
# When the variable is unset, the local development database is used, exactly
# as before. Uses psycopg3!
connection = os.environ.get(
    "PGVECTOR_CONNECTION",
    "postgresql+psycopg://postgres:postgres@127.0.0.1:54322/postgres",
)
collection_name = "memes"

# The store embeds with the OpenCLIP wrapper created earlier (clip_embd).
vectorstore = PGVector(
    embeddings=clip_embd,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
)

In [11]:
# Load the meme training set. meme_training is not defined in this notebook —
# presumably it is provided by helpers.py; confirm there.
# The result is indexed by item id below and iterated with .values() later on.
tr = meme_training()
# Display one record as a reference for the available fields.
tr['110001']

{'id_EXIST': '110001',
 'lang': 'es',
 'text': '2+2=5 MITO Albert Einstein tenía bajo rendimiento en la escuela. VERDAD 2+2=4 CAN is El feminismo de hoy en día defiende la estupidez humana y no los derechos de las mujeres quemo ellas afirman ',
 'meme': '110001.jpeg',
 'path_memes': 'memes/110001.jpeg',
 'number_annotators': 6,
 'annotators': ['Annotator_1',
  'Annotator_2',
  'Annotator_3',
  'Annotator_4',
  'Annotator_5',
  'Annotator_6'],
 'gender_annotators': ['F', 'F', 'F', 'M', 'M', 'M'],
 'age_annotators': ['18-22', '23-45', '46+', '46+', '18-22', '23-45'],
 'ethnicities_annotators': ['Hispano or Latino',
  'Hispano or Latino',
  'Hispano or Latino',
  'White or Caucasian',
  'Hispano or Latino',
  'Hispano or Latino'],
 'study_levels_annotators': ['High school degree or equivalent',
  'Master’s degree',
  'Master’s degree',
  'Bachelor’s degree',
  'Bachelor’s degree',
  'Bachelor’s degree'],
 'countries_annotators': ['Mexico',
  'Spain',
  'Argentina',
  'Spain',
  'Mexico',


In [8]:
import re
import uuid

def vote(item):
    """Collapse the annotators' task-4 labels of one item into a single label.

    Args:
        item: mapping whose "labels_task4" entry is a sequence of label strings.

    Returns:
        "YES" when "YES" occurs strictly more often than "NO" among the labels;
        "NO" otherwise (ties and empty label lists included).
    """
    labels = item["labels_task4"]
    yes_votes = labels.count("YES")
    no_votes = labels.count("NO")
    if yes_votes > no_votes:
        return "YES"
    return "NO"

def to_document(item):
    """Wrap one training item in a Document ready for the vector store.

    Args:
        item: mapping with at least the keys 'text', 'id_EXIST', 'lang' and
            whatever vote() reads from it.

    Returns:
        A Document whose page_content is the item's text and whose metadata
        holds a freshly generated random UUID ("id"), the voted task-4 label
        ("task4"), the dataset id ('document_id') and the language ('lang').
    """
    text = item['text']
    metadata = {
        "id": str(uuid.uuid4()),  # new random id on every call
        "task4": vote(item),
        'document_id': item['id_EXIST'],
        'lang': item['lang'],
    }
    return Document(page_content=text, metadata=metadata)

In [12]:
# Turn every training item into a Document and insert the whole batch into the
# vector store, using each document's metadata "id" as its store id.
# NOTE(review): to_document() generates a fresh random UUID per call, so
# re-running this cell submits the same texts again under new ids — presumably
# creating duplicate rows; confirm before re-executing against a filled store.
docs = [to_document(item) for item in tr.values()]
vectorstore.add_documents(docs, ids=[entry.metadata["id"] for entry in docs])

['7c1f860d-3468-4944-8cf3-42d8ec7f103c',
 '6112446e-7c38-47ac-88a0-e8978bdcedad',
 '14c23376-18af-4f89-aa09-8c17c2d030c0',
 'f06cbb09-aa14-44b3-be4c-cfc0c16c0266',
 '0b638f55-a6ab-4283-a00a-1bba3eb506ab',
 '5c6aba04-f1a6-4d5b-aa12-9ecb885cb55d',
 'bdb83a3f-d3cd-4fe7-9cbd-655991887c3e',
 '1eb4d4c1-dbf8-4d93-8895-a7864f1cbb94',
 '013db977-56fd-49de-8c18-50bec185f940',
 '01e99691-b166-492b-8637-41fc6cfe9b29',
 '2983bd93-6bc6-43ba-aa10-7c2cc62031d1',
 '6bd3e2f6-b2b1-4d28-ab5d-476936fef26b',
 '1955cf0c-8dd9-4513-94f0-7248b1e2c660',
 '9abe9b1e-3a2f-4cf8-9171-2fe230685e5f',
 'a134cf70-1c49-4d4d-9784-edbe7a2ec2a0',
 '5935c64b-768b-4301-a75f-295508ffb8c7',
 '6ca5e10f-bb20-4454-9557-31eab4202143',
 '3d42a43c-b9eb-4afd-a0eb-2c84d716af12',
 '7af05b12-f955-48c2-8ce1-a722762dee96',
 '5bf5437c-de9a-49bd-b4ba-b2304c639f03',
 '81a36094-907c-48d7-9f25-899204257acb',
 '450206c9-ff57-4556-8580-682baaac2f34',
 'daca2fba-c23f-469e-8c17-49d9c5059dd2',
 '430629d7-a733-4e84-989b-903bf7b2e9c7',
 '6c1937ab-c083-

In [16]:
# Sanity check: query the store with the text of a training item and show the
# single nearest stored document (k=1, first hit).
vectorstore.similarity_search(tr['110001']['text'], k=1)[0]

Document(page_content='2+2=5 MITO Albert Einstein tenía bajo rendimiento en la escuela. VERDAD 2+2=4 CAN is El feminismo de hoy en día defiende la estupidez humana y no los derechos de las mujeres quemo ellas afirman ', metadata={'id': '7c1f860d-3468-4944-8cf3-42d8ec7f103c', 'lang': 'es', 'task4': 'YES', 'document_id': '110001'})

In [13]:
def predict(query):
    """Label a query with the task-4 vote of its nearest stored document.

    Runs a k=1 similarity search on the notebook-level `vectorstore` and takes
    the first hit; indexing with [0] fails if the search returns nothing.

    Args:
        query: text passed to the vector store's similarity search.

    Returns:
        A dict with 'test_case' fixed to "EXIST2024", 'id' set to the matched
        document's 'document_id' metadata and 'value' set to its 'task4'
        metadata.

    NOTE(review): 'id' is the id of the matched stored document, not of the
    query item — confirm this is what the caller/evaluator expects.
    """
    hits = vectorstore.similarity_search(query, k=1)
    nearest = hits[0]
    return dict(
        test_case="EXIST2024",
        id=nearest.metadata['document_id'],
        value=nearest.metadata['task4'],
    )

In [39]:
# Run the nearest-neighbour predictor over the dev tweets, feeding each item's
# 'tweet' field to predict(). tweet_dev and apply_predict_to are not defined in
# this notebook — presumably they come from helpers.py.
# NOTE(review): the store was filled from meme_training() items while the
# queries here are tweets, and predict() reports the matched training
# document's id — confirm apply_predict_to produces the ids the evaluator
# expects.
dev = tweet_dev()
results = apply_predict_to(dev, lambda item: predict(item['tweet']))

In [40]:
# json is not imported anywhere else in the visible notebook; import it here so
# this cell does not depend on whatever names helpers.py happens to leave behind.
import json

# Persist the predictions for the evaluation cells below. The context manager
# closes the file even if serialisation fails part-way (the explicit
# open()/close() pair leaked the handle in that case).
with open("EXIST2024_dev_task1_majority_class_hard.json", "w") as file:
    json.dump(results, file, indent = 6)

# Task 1

In [41]:
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.utils.utils import PyEvALLUtils

# Evaluate the baseline run against the hard-label gold file with PyEvALL.
# tweet_baseline / tweet_golds are not defined in this notebook — presumably
# helpers.py resolves these names to the corresponding files; confirm there.
# NOTE(review): the baseline file name is identical to the predictions file
# written by this notebook — verify the two do not point at the same file.
baseline = tweet_baseline("EXIST2024_dev_task1_majority_class_hard.json")
gold = tweet_golds("EXIST2024_dev_task1_gold_hard.json")

# Metrics to compute and the report option handed to evaluate().
metrics = ["ICM", "ICMNorm", "FMeasure", "Accuracy", "Precision", "Recall"]
params = {PyEvALLUtils.PARAM_REPORT: PyEvALLUtils.PARAM_OPTION_REPORT_EMBEDDED}

test = PyEvALLEvaluation()
report = test.evaluate(baseline, gold, metrics, **params)
report.print_report()

2024-05-03 15:55:16,793 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM', 'ICMNorm', 'FMeasure', 'Accuracy', 'Precision', 'Recall']
2024-05-03 15:55:16,854 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2024-05-03 15:55:17,004 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Normalized evaluation method
2024-05-03 15:55:17,005 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2024-05-03 15:55:17,153 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2024-05-03 15:55:17,296 - pyevall.metrics.metrics - INFO -             evaluate() - Executing fmeasure evaluation method
2024-05-03 15:55:17,439 - pyevall.metrics.metrics - INFO -             evaluate() - Executing accuracy evaluation method
2024-05-03 15:55:17,439 - pyevall.metrics.metrics - INFO -             evaluate() - Executing precision e

In [42]:
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.utils.utils import PyEvALLUtils

# Evaluate this notebook's own predictions file against the hard-label gold
# file with PyEvALL. The predictions path is a bare file name (relative to the
# working directory), whereas the gold path goes through tweet_golds — a helper
# not defined in this notebook, presumably from helpers.py.
predictions = 'EXIST2024_dev_task1_majority_class_hard.json'
gold = tweet_golds("EXIST2024_dev_task1_gold_hard.json")

# Metrics to compute and the report option handed to evaluate().
metrics = ["ICM", "ICMNorm", "FMeasure", "Accuracy", "Precision", "Recall"]
params = {PyEvALLUtils.PARAM_REPORT: PyEvALLUtils.PARAM_OPTION_REPORT_EMBEDDED}

test = PyEvALLEvaluation()
report = test.evaluate(predictions, gold, metrics, **params)
report.print_report()

2024-05-03 15:55:21,825 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM', 'ICMNorm', 'FMeasure', 'Accuracy', 'Precision', 'Recall']
2024-05-03 15:55:21,888 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2024-05-03 15:55:22,009 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Normalized evaluation method
2024-05-03 15:55:22,010 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2024-05-03 15:55:22,138 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2024-05-03 15:55:22,290 - pyevall.metrics.metrics - INFO -             evaluate() - Executing fmeasure evaluation method
2024-05-03 15:55:22,415 - pyevall.metrics.metrics - INFO -             evaluate() - Executing accuracy evaluation method
2024-05-03 15:55:22,415 - pyevall.metrics.metrics - INFO -             evaluate() - Executing precision e