Commit f0bcb1d: Separate download scripts, feedback session (#55)

paroma authored and ajratner committed Aug 13, 2019
1 parent 6aa2c28, commit f0bcb1d

Showing 14 changed files with 219 additions and 134 deletions.
13 changes: 4 additions & 9 deletions README.md
@@ -5,14 +5,10 @@ A collection of tutorials for [Snorkel](http://snorkel.org).
The Snorkel tutorials are grouped by application, with some applications having multiple associated notebooks in their directory.
* `spam`: Is this YouTube comment spam?
* `spouse`: Does this sentence imply that the two marked people are spouses?
-* `scene_graph`: Is object A riding object B in the image, carrying it, or neither?
+* `visual_relation`: Is object A riding object B in the image, carrying it, or neither?
* `crowdsourcing`: Is this tweet about the weather expressing a positive, negative or neutral sentiment?
-<<<<<<< HEAD
* `multitask` (Multi-Task Learning): A synthetic task demonstrating the native Snorkel multi-task classifier API
-=======
-* `mtl` (Multi-Task Learning): A synthetic task demonstrating the native Snorkel multi-task classifier API
* [`drybell`](https://ai.googleblog.com/2019/03/harnessing-organizational-knowledge-for.html): Is a celebrity mentioned in this news article?
->>>>>>> Add to tox and README

See the [Tutorials Index](#tutorials-index) for a listing of which tutorials demonstrate which task types, techniques, and integrations.

@@ -144,14 +140,13 @@ Here we provide an index pointing to different available tutorials by their task
* Task
* Text Classification (Text): `spam`, `crowdsourcing`, `drybell`
* Relation Extraction (Text): `spouse`
-* Visual Relationship Detection (Image): `scene_graph`
+* Visual Relationship Detection (Image): `visual_relation`
* Techniques
-* Labeling with Labeling Functions (LFs): `spam`, `spouse`, `scene_graph`, `crowdsourcing`, `drybell`
+* Labeling with Labeling Functions (LFs): `spam`, `spouse`, `visual_relation`, `crowdsourcing`
* Augmentation with Transformation Functions (TFs): `spam`
* Monitoring with Slicing Functions (SFs): `spam`
* Using Crowd Worker Labels: `crowdsourcing`
-* Multi-Task Learning (MTL): `multitask`, `scene_graph`, `spam`
-* Large-Scale Production Pipeline: `drybell`
+* Multi-Task Learning (MTL): `multitask`, `visual_relation`, `spam`
* Integrations
* TensorFlow/Keras: `spam`, `spouse`
* Scikit-Learn: `spam`, `crowdsourcing`
1 change: 0 additions & 1 deletion scene_graph/.notebooks

This file was deleted.

2 changes: 1 addition & 1 deletion spam/01_spam_tutorial.ipynb


2 changes: 1 addition & 1 deletion spam/01_spam_tutorial.py
@@ -1006,7 +1006,7 @@ def plot_probabilities_histogram(Y):
# If you enjoyed this tutorial and you've already checked out the Snorkel 101 Guide, check out the [`snorkel-tutorials` table of contents](https://github.com/snorkel-team/snorkel-tutorials#snorkel-tutorials) for other tutorials that you may find interesting, including demonstrations of how to use Snorkel
#
# * As part of a [hybrid crowdsourcing pipeline](https://github.com/snorkel-team/snorkel-tutorials/tree/master/crowdsourcing)
-# * For [scene-graph detection over images](https://github.com/snorkel-team/snorkel-tutorials/tree/master/scene_graph)
+# * For [scene-graph detection over images](https://github.com/snorkel-team/snorkel-tutorials/tree/master/visual_relation)
# * For [information extraction over text](https://github.com/snorkel-team/snorkel-tutorials/tree/master/spouse)
# * For [data augmentation](https://github.com/snorkel-team/snorkel-tutorials/tree/master/spam)
#
5 changes: 3 additions & 2 deletions tox.ini
@@ -3,6 +3,7 @@ skipsdist = true
envlist =
spouse,
intro,
+visual_relation,
spam,
scene_graph,
crowdsourcing,
@@ -19,7 +20,7 @@ deps =
intro: -rintro/requirements.txt
spam: -rspam/requirements.txt
multitask: -rmultitask/requirements.txt
-scene_graph: -rscene_graph/requirements.txt
+visual_relation: -rvisual_relation/requirements.txt
crowdsourcing: -rcrowdsourcing/requirements.txt
recsys: -rrecsys/requirements.txt
drybell: -rdrybell/requirements.txt
@@ -32,7 +33,7 @@ commands =
intro: python {toxinidir}/scripts/build.py {posargs:test} intro
spam: python {toxinidir}/scripts/build.py {posargs:test} spam
multitask: python {toxinidir}/scripts/build.py {posargs:test} multitask
-scene_graph: python {toxinidir}/scripts/build.py {posargs:test} scene_graph
+visual_relation: python {toxinidir}/scripts/build.py {posargs:test} visual_relation
crowdsourcing: python {toxinidir}/scripts/build.py {posargs:test} crowdsourcing
drybell: python {toxinidir}/scripts/build.py {posargs:test} drybell

1 change: 1 addition & 0 deletions visual_relation/.notebooks
@@ -0,0 +1 @@
+visual_relation_tutorial
File renamed without changes.
37 changes: 37 additions & 0 deletions visual_relation/download_full_data.sh
@@ -0,0 +1,37 @@
# Execute from snorkel-tutorials/
# Download data.

ANNOTATIONS_URL="https://www.dropbox.com/s/bnfhm6kt9xumik8/vrd.zip"
IMAGES_URL="http://imagenet.stanford.edu/internal/jcjohns/visual_relations/sg_dataset.zip"
SAMPLE_IMAGES_URL="https://github.com/Prof-Lu-Cewu/Visual-Relationship-Detection.git"
GLOVE_URL="http://nlp.stanford.edu/data/wordvecs/glove.6B.zip"

if [ ! -d "visual_relation/data" ]; then
mkdir -p visual_relation/data
cd visual_relation/data

# download and unzip metadata and annotations
wget $ANNOTATIONS_URL
unzip vrd.zip

# Delete the zip files.
rm vrd.zip
cd VRD

# Download and unzip all images
wget $IMAGES_URL
unzip sg_dataset.zip
rm sg_dataset.zip
cd ../../..

mkdir -p visual_relation/data/glove
cd visual_relation/data/glove

wget $GLOVE_URL
unzip glove.6B.zip

# Delete the zip files
rm glove.6B.zip
cd ../../..
fi
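The script's header comment says it must be run from the snorkel-tutorials/ root so the relative visual_relation/ paths resolve. As a rough illustration (not part of the commit), driving the same download from Python mirrors the subprocess pattern that visual_relation/utils.py uses later in this diff:

# Illustrative only: invoke the full-data download the same way utils.py does.
# Assumes the current working directory is the snorkel-tutorials/ repo root.
import subprocess

try:
    subprocess.run(
        ["bash", "visual_relation/download_full_data.sh"],
        check=True,
        stderr=subprocess.PIPE,
    )
except subprocess.CalledProcessError as e:
    print(e.stderr.decode())
    raise e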

@@ -12,20 +12,20 @@ DIRS=("glove" "VRD/sg_dataset/samples")
# Check if at least any file is missing. If so, reload all data.
for directory_name in "${DIRS[@]}"
do
if [ ! -d "scene_graph/data/$directory_name" ]; then
if [ ! -d "visual_relation/data/$directory_name" ]; then
RELOAD=true
fi
done

ANNOTATIONS_URL="https://www.dropbox.com/s/bnfhm6kt9xumik8/vrd.zip"
-IMAGES_URL="http://imagenet.stanford.edu/internal/jcjohns/scene_graphs/sg_dataset.zip"
+IMAGES_URL="http://imagenet.stanford.edu/internal/jcjohns/visual_relations/sg_dataset.zip"
SAMPLE_IMAGES_URL="https://github.com/Prof-Lu-Cewu/Visual-Relationship-Detection.git"
GLOVE_URL="http://nlp.stanford.edu/data/wordvecs/glove.6B.zip"

if [ ! -d "scene_graph/data" ]; then
if [ -d "scene_graph/data/" ]; then rm -Rf "scene_graph/data/"; fi
mkdir -p scene_graph/data
cd scene_graph/data
if [ ! -d "visual_relation/data" ]; then
if [ -d "visual_relation/data/" ]; then rm -Rf "visual_relation/data/"; fi
mkdir -p visual_relation/data
cd visual_relation/data

# download and unzip metadata and annotations
wget $ANNOTATIONS_URL
@@ -35,32 +35,16 @@ if [ ! -d "scene_graph/data" ]; then
rm vrd.zip
cd VRD

# if [ "$TRAVIS" = "true" ]; then
# # Download and unzip sample images
# mkdir sg_dataset
# cd sg_dataset
# git clone $SAMPLE_IMAGES_URL
# mv Visual-Relationship-Detection/samples ./
# rm -r Visual-Relationship-Detection
# cd ../..
# else
# # Download and unzip all images
# wget $IMAGES_URL
# unzip sg_dataset.zip
# rm sg_dataset.zip
# cd ../../..
# fi

-# Download and unzip all images
+# Download and unzip sample images
mkdir sg_dataset
cd sg_dataset
git clone $SAMPLE_IMAGES_URL
mv Visual-Relationship-Detection/samples ./
rm -r Visual-Relationship-Detection
cd ../../../..

-mkdir -p scene_graph/data/glove
-cd scene_graph/data/glove
+mkdir -p visual_relation/data/glove
+cd visual_relation/data/glove

wget $GLOVE_URL
unzip glove.6B.zip
4 changes: 2 additions & 2 deletions scene_graph/model.py → visual_relation/model.py
@@ -75,7 +75,7 @@ def __init__(
"sub_category": df["subject_category"].tolist(),
}
Y_dict = {
"scene_graph_task": torch.LongTensor(df["label"].to_numpy())
"visual_relation_task": torch.LongTensor(df["label"].to_numpy())
} # change to take in the rounded train labels
super(SceneGraphDataset, self).__init__(name, split, X_dict, Y_dict)

@@ -123,7 +123,7 @@ def __len__(self):
class WordEmb(nn.Module):
"""Extract and concat word embeddings for obj and sub categories."""

def __init__(self, glove_fn="scene_graph/data/glove/glove.6B.100d.txt"):
def __init__(self, glove_fn="visual_relation/data/glove/glove.6B.100d.txt"):
super(WordEmb, self).__init__()

self.word_embs = pandas.read_csv(
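The substantive change in model.py is the Y_dict key, which doubles as the task name: anything that refers to the task must now use "visual_relation_task" rather than "scene_graph_task". A minimal, hypothetical sketch of the renamed key in isolation (the DataFrame is a stand-in, not tutorial data):

# Hypothetical stand-in data; only the "label" column used by model.py is mimicked.
import pandas as pd
import torch

df = pd.DataFrame({"label": [0, 1, 2]})
Y_dict = {"visual_relation_task": torch.LongTensor(df["label"].to_numpy())}
assert "visual_relation_task" in Y_dict  # the old "scene_graph_task" key is gone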
File renamed without changes.
50 changes: 34 additions & 16 deletions scene_graph/utils.py → visual_relation/utils.py
@@ -55,24 +55,42 @@ def vrd_to_pandas(


# %%
-def load_vrd_data():
+def load_vrd_data(sample=False, is_travis=False):
"""Download and load Pandas DataFrame of VRD relationships.
NOTE: Only loads semantic relationship examples.
"""
-try:
-subprocess.run(
-["bash", "scene_graph/download_data.sh"], check=True, stderr=subprocess.PIPE
-)
-except subprocess.CalledProcessError as e:
-print(e.stderr.decode())
-raise e

-relationships_train = json.load(open("scene_graph/data/VRD/annotations_train.json"))
-relationships_test = json.load(open("scene_graph/data/VRD/annotations_test.json"))

-objects = json.load(open("scene_graph/data/VRD/objects.json"))
-predicates = json.load(open("scene_graph/data/VRD/predicates.json"))
+if sample or is_travis:
+try:
+subprocess.run(
+["bash", "visual_relation/download_sample_data.sh"],
+check=True,
+stderr=subprocess.PIPE,
+)
+except subprocess.CalledProcessError as e:
+print(e.stderr.decode())
+raise e
+else:
+try:
+subprocess.run(
+["bash", "visual_relation/download_full_data.sh"],
+check=True,
+stderr=subprocess.PIPE,
+)
+except subprocess.CalledProcessError as e:
+print(e.stderr.decode())
+raise e

+relationships_train = json.load(
+open("visual_relation/data/VRD/annotations_train.json")
+)
+relationships_test = json.load(
+open("visual_relation/data/VRD/annotations_test.json")
+)

+objects = json.load(open("visual_relation/data/VRD/objects.json"))
+predicates = json.load(open("visual_relation/data/VRD/predicates.json"))
semantic_predicates = [
"carry",
"cover",
@@ -99,9 +117,9 @@
}

# TODO: hack to work with small sample of data for tox
if os.path.isdir("scene_graph/data/VRD/sg_dataset/samples"):
if os.path.isdir("visual_relation/data/VRD/sg_dataset/samples"):
# pass in list of images as keys_list
keys_list = os.listdir("scene_graph/data/VRD/sg_dataset/samples")
keys_list = os.listdir("visual_relation/data/VRD/sg_dataset/samples")
test_df = vrd_to_pandas(
relationships_test,
objects,
@@ -110,7 +128,7 @@
keys_list=keys_list,
)
return test_df, test_df, test_df
elif os.path.isdir("scene_graph/data/VRD/sg_dataset/sg_train_images"):
elif os.path.isdir("visual_relation/data/VRD/sg_dataset/sg_train_images"):
train_df = vrd_to_pandas(
relationships_train,
objects,
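With the new flags, callers choose explicitly between the sample and full downloads. A rough usage sketch follows; it is not from the commit, it assumes the snorkel-tutorials/ repository root as the working directory (the relative paths above resolve from there), and the variable names are illustrative.

# Illustrative usage of the reworked loader.
import sys

sys.path.append("visual_relation")
from utils import load_vrd_data  # visual_relation/utils.py

# Sample mode runs download_sample_data.sh; per the return statement above,
# all three returned DataFrames are built from the sample split.
train_df, valid_df, test_df = load_vrd_data(sample=True)

# Full mode runs download_full_data.sh and builds the full train/test splits
# (names here are illustrative, not taken from the tutorial).
# train_df, valid_df, test_df = load_vrd_data()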
