In [7]:
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.chunk import RegexpParser
import wandb
import requests
import json
from io import BytesIO
import svgling
# from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import tqdm
from nltk.corpus import wordnet as wn
import nltk
# Initialize NLTK: fetch the resources used by the functions below (the
# perceptron POS tagger, the punkt tokenizer, and the WordNet corpus).
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('wordnet')

# SECURITY: do not hard-code the W&B API key in the notebook. Called without
# `key`, wandb.login() uses credentials from the WANDB_API_KEY environment
# variable or the netrc file, and otherwise prompts for one.
# NOTE(review): the key that used to be written on this line should be treated
# as leaked and revoked/rotated in the W&B account settings.
wandb.login()

# WordNet lexicographer-file names (Synset.lexname()) that is_concrete_noun
# accepts as denoting a concrete noun.
concrete_lexnames = {
    'noun.animal', 'noun.artifact', 'noun.body', 'noun.food', 'noun.group',
    'noun.location', 'noun.object', 'noun.person', 'noun.plant', 'noun.substance',
}

# Chunk grammar for RegexpParser: an NP is an optional determiner, any number
# of adjectives, then one or more plural nouns (NNS / NNPS).
grammar = r"""
  NP: {<DT>?<JJ>*<NNS|NNPS>+}
"""

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/susu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /Users/susu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/susu/.netrc


In [8]:
def wandb_log(url, obj, caption, number="ten", dataset_name=None, org_id=None):
    """Download the image at ``url`` and log it with its metadata as one W&B run.

    Args:
        url: Address of the image to download.
        obj: Value stored under the "object" key of the logged record.
        caption: Caption text; logged both as a field and as the image caption.
        number: Value stored under "number". Defaults to "ten", the value this
            function previously hard-coded.
        dataset_name: Value stored under "dataset_name" (default None).
        org_id: Value stored under "org_id" (default None).

    Returns:
        None. If the download times out or fails (connection error, HTTP error
        status), a message is printed and nothing is logged.

    Raises:
        Errors from decoding the image or from W&B are not caught here.
    """
    try:
        img_response = requests.get(url, timeout=10)
        # Reject 4xx/5xx responses instead of trying to decode an error page.
        img_response.raise_for_status()
    except requests.exceptions.Timeout:
        print("The request timed out. Skipping this image.")
        return
    except requests.exceptions.RequestException as e:
        print(f"Failed to download image {url}: {e}")
        return
    img = Image.open(BytesIO(img_response.content))

    wandb.init(project="flicker_raw_images", entity="ruisu")
    try:
        wandb.log({
            "url": url,
            'caption': caption,
            'number': number,
            "image": wandb.Image(img, caption=caption),
            "object": obj,
            "dataset_name": dataset_name,
            "org_id": org_id
        })
    finally:
        # Always close the run, even if building or sending the record fails.
        wandb.finish()

In [1]:
def is_concrete_noun(word, verbose=False):
    """Return True if any WordNet noun sense of ``word`` is a concrete category.

    A sense counts as concrete when its lexicographer-file name
    (``synset.lexname()``) is a member of the module-level
    ``concrete_lexnames`` set. Senses are checked in the order WordNet
    returns them and the search stops at the first match.

    Args:
        word: The word to look up among WordNet noun synsets.
        verbose: When True, print ``word`` and the lexname of every sense
            inspected. Off by default so that bulk processing does not flood
            the cell output with one line per synset.

    Returns:
        True if a matching sense is found, otherwise False (including when
        WordNet has no noun synsets for ``word``).
    """
    for synset in wn.synsets(word, pos=wn.NOUN):
        lexname = synset.lexname()
        if verbose:
            print(word, lexname)
        if lexname in concrete_lexnames:
            return True
    return False


def is_concrete_noun_phrase(phrase):
    """Tell whether ``phrase`` contains at least one concrete noun.

    The phrase is tokenized and POS-tagged; every token tagged as a noun
    (NN, NNS, NNP, NNPS) is passed to ``is_concrete_noun`` in order, stopping
    at the first one that qualifies.

    Returns:
        True if some noun token is concrete, False otherwise (also when the
        phrase has no noun tokens at all).
    """
    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
    tagged_words = pos_tag(word_tokenize(phrase))
    # any() short-circuits, so lookups stop at the first concrete noun.
    return any(
        is_concrete_noun(word)
        for word, pos in tagged_words
        if pos in noun_tags
    )

def find_plural_noun_phrases(text, num_word, verbose=False):
    """Find concrete plural noun phrases that are counted by ``num_word``.

    ``text`` is tokenized, POS-tagged and chunked with the module-level
    ``grammar``. An NP chunk is reported when either

    * the token immediately before the chunk equals ``num_word``, in which
      case the whole chunk text is used, or
    * the first token of the chunk itself equals ``num_word``, in which case
      the chunk text without that first token is used,

    and the resulting text passes ``is_concrete_noun_phrase``.

    Args:
        text: The sentence or caption to analyse.
        num_word: The number word to look for (compared by exact equality
            with tokens).
        verbose: When True, print the chunk tree for debugging. Off by
            default to keep bulk runs quiet.

    Returns:
        A list of ``(num_word, noun_phrase_text)`` tuples in order of
        appearance; empty if nothing matches.
    """
    tagged = pos_tag(word_tokenize(text))
    tree = RegexpParser(grammar).parse(tagged)
    if verbose:
        print(tree)

    results = []
    previous_word = None  # token immediately preceding the current node

    for node in tree:
        if not isinstance(node, nltk.Tree):
            # Plain (token, tag) pair that is not part of any chunk.
            token, _tag = node
            previous_word = token
            continue

        leaves = node.leaves()
        if node.label() == 'NP':
            if previous_word and previous_word == num_word:
                np_text = ' '.join(token for token, _pos in leaves)
                if is_concrete_noun_phrase(np_text):
                    results.append((previous_word, np_text))
            elif leaves[0][0] == num_word:
                np_text = ' '.join(token for token, _pos in leaves[1:])
                if is_concrete_noun_phrase(np_text):
                    results.append((num_word, np_text))

        # The chunk's last token is what precedes the next node. Without this
        # update, a second chunk directly following this one would still see
        # the number word as its predecessor and be reported by mistake.
        if leaves:
            previous_word = leaves[-1][0]

    return results

def process_sample(photo,dataset_name,numbers):
    """Log one captioned image to W&B if its caption counts a concrete plural noun.

    Args:
        photo: Dict with keys "caption" (text), "url" (image address) and
            "org_id" (identifier stored with the record).
        dataset_name: Value stored under "dataset_name" in the logged record.
        numbers: Iterable of number words to look for in the caption. A word
            matches only as a whole space-separated token of the caption.

    Returns:
        "no number"     - none of ``numbers`` occurs in the caption.
        "sucess"        - (spelling kept for compatibility) the image was
                          downloaded and logged for the first number word
                          that yielded a matching noun phrase.
        "wrong grammar" - no number word led to a logged image; this covers
                          both "no matching noun phrase" and download or
                          logging failures.
    """
    caption = photo["caption"]

    # Split once instead of once per number word.
    caption_words = caption.split(" ")
    num_candidates = [number for number in numbers if number in caption_words]
    if not num_candidates:
        return "no number"

    # NOTE: a duplicate check against existing runs (wandb.Api().runs on
    # "ruisu/flicker_raw_images", comparing run.config["url"], returning
    # "exist") was sketched here and is currently disabled.

    for number in num_candidates:

        found_phrases = find_plural_noun_phrases(caption,number)
        print("found_phrases",found_phrases)
        print("caption",caption)
        if not found_phrases:
            continue
        try:

            try:
                # Set a timeout of 10 seconds
                img_response = requests.get(photo["url"], timeout=10)
            except requests.exceptions.Timeout:
                # Handle the timeout exception
                print("The request timed out. Skipping this image.")
                continue
            # Treat 4xx/5xx responses as failures rather than decoding an
            # error page; the resulting HTTPError is reported below.
            img_response.raise_for_status()
            img = Image.open(BytesIO(img_response.content))
            # NOTE: a CLIP image/caption similarity filter (skip when
            # logits_per_image < 30) was sketched here and is disabled.

            wandb.init(project="flicker_raw_images", entity="ruisu")
            try:
                wandb.log({
                    "url": photo["url"],
                    'caption': caption,
                    'number': number,
                    "image":wandb.Image(img, caption=caption),
                    "object":found_phrases[0][1],
                    "dataset_name":dataset_name,
                    "org_id":photo["org_id"]
                })
            finally:
                # Close the run even when building or sending the record
                # fails, so no run is left open for the next sample.
                wandb.finish()
            return "sucess"

        except Exception as e:
            # Best effort: report the failure and try the next number word.
            print(f"Failed to process image {photo['url']}: {e}")
            continue
    return "wrong grammar"

In [None]:
#@title conceptual_12m
from datasets import load_dataset
import tqdm
# Stream the Conceptual 12M training split and run every caption through
# process_sample, showing the latest return code on the progress bar.
dataset = load_dataset("conceptual_12m", split="train", streaming=True)

# Number words searched for in this pass.
numbers = ["six", "seven", "eight", "nine"]
# Earlier variant kept for reference: numbers = ["four"]
# check the lower bound of ten: 6292946

pbar = tqdm.tqdm(dataset)
for i, sample in enumerate(pbar):
    # Resume point: samples with index <= 7434944 are skipped. The original
    # note beside this threshold read
    #   numbers = ["six", "seven", "eight", "nine", "ten"]
    # presumably the list in use when the checkpoint was taken - TODO confirm.
    # (A disabled upper bound, `if i >= 315243: break`, also lived here.)
    if i > 7434944:
        photo = dict(
            caption=sample["caption"].lower(),
            url=sample["image_url"],
            org_id=i,
        )
        return_code = process_sample(photo, "conceptual_12m-train", numbers)

        pbar.set_description(f"Code: {return_code}")

In [None]:
# Hand-picked "ten <objects>" image records. Every record has number 10 and
# empty dataset_name / org_id; only the target, its context phrase and the
# image URL differ, so they are listed as (target, target_context, image_url).
current_json_list = [
    {
        "number": 10,
        "target": target,
        "target_context": target_context,
        "image_url": image_url,
        "dataset_name": "",
        "org_id": "",
    }
    for target, target_context, image_url in (
        (
            "apples",
            "ten apples",
            "https://media.baamboozle.com/uploads/images/189889/1636331500_50014_url.png",
        ),
        (
            "stars",
            "ten stars",
            "https://vt-vtwa-assets.varsitytutors.com/vt-vtwa/uploads/problem_question_image/image/28099/10.png",
        ),
        (
            "animals",
            "ten cartoon animals",
            "https://cdn.vectorstock.com/i/1000x1000/89/10/ten-cute-animals-vector-37188910.webp",
        ),
        (
            "animals",
            "ten cute animals",
            "https://img.freepik.com/premium-vector/set-10-cute-wild-animals_41303-28.jpg",
        ),
    )
]