In [2]:
%%writefile real_embeddings.py

import numpy as np
import pickle
import os
import hashlib

try:
    from sentence_transformers import SentenceTransformer
    TRANSFORMER_AVAILABLE = True
except:
    TRANSFORMER_AVAILABLE = False
    print("Install with: !pip install sentence-transformers")


class RealEmbeddingGenerator:
    def __init__(self, model_name="onlplab/alephbert-base", cache_dir="cache"):
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)
        self.embedding_cache = {}


        if TRANSFORMER_AVAILABLE:
            self.model = SentenceTransformer(model_name)
            self.dimension = self.model.get_sentence_embedding_dimension()
        else:
            self.model = None
            self.dimension = 768

    def get_text_hash(self, text):
        return hashlib.md5(text.encode('utf-8')).hexdigest()

    def generate_embedding(self, text, use_cache=True):
        text_hash = self.get_text_hash(text)
        print("Fallback to random embeddings.")
        if use_cache and text_hash in self.embedding_cache:
            return self.embedding_cache[text_hash]

        if self.model:
            embedding = self.model.encode(text, convert_to_numpy=True)
        else:
            # Fallback to deterministic random
            np.random.seed(int(text_hash[:8], 16) % 10000)
            embedding = np.random.randn(self.dimension)

        if use_cache:
            self.embedding_cache[text_hash] = embedding



        return embedding

    def generate_batch(self, texts, show_progress=True):
        if self.model:
            return self.model.encode(texts,
                                    batch_size=32,
                                    show_progress_bar=show_progress,
                                    convert_to_numpy=True)
        else:
            return np.array([self.generate_embedding(t) for t in texts])

    def save_cache(self):
        cache_file = os.path.join(self.cache_dir, "embedding_cache.pkl")
        with open(cache_file, 'wb') as f:
            pickle.dump(self.embedding_cache, f)


Writing real_embeddings.py


In [None]:
# cosin similiarity for each photo with all chapter tables.
# and an API call to see if redundent photo with the max similarity.
# output 2 json files - 1. YES/NO (redundent or not) 2. YES/NO + explanation (reasoning)
# cost of about 7$ total  ###

!pip install sentence-transformers
!pip install anthropic

import os
import sys
sys.path.append("/content")
import json
import numpy as np
from real_embeddings import RealEmbeddingGenerator
import anthropic



# ---------- Utils ----------
def cosine_similarity(a, b):
    """Compute cosine similarity between two vectors."""
    a, b = np.array(a).flatten(), np.array(b).flatten()
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8)

def normalize_scores(scores):
    """Normalize a list of scalar scores to 0-1 range."""
    scores = [float(s) for s in scores]
    if not scores:
        return scores
    min_s, max_s = min(scores), max(scores)
    if max_s - min_s < 1e-8:
        return [1.0] * len(scores)
    return [(s - min_s) / (max_s - min_s) for s in scores]

json_file_id = "redundant_graphs.json"
json_file_info = "redundant_graphs_info.json"

def analyze_graph_redundancy(year, chapter_id, table_json_url, graph_json_url, threshold=0.25, api_threshold=0.55):
    """
    Match tables and graphs for a single chapter and decide if graph is just analysis.
    """

    # Load JSONs
    with open(table_json_url, "r", encoding="utf-8") as f:
        tables = json.load(f)
    with open(graph_json_url, "r", encoding="utf-8") as f:
        graphs = json.load(f)

    # Filter for one chapter
    tables = {k: v for k, v in tables.items()
              if int(k.split("_")[2]) == int(year) and int(k.split("_")[1]) == int(chapter_id)}
    graphs = {k: v for k, v in graphs.items()
              if int(k.split("_")[2]) == int(year) and int(k.split("_")[1]) == int(chapter_id)}

    print(f"✅ Loaded {len(tables)} tables and {len(graphs)} graphs for chapter {chapter_id}\n")

    embedder = RealEmbeddingGenerator()
    table_keys, table_texts = list(tables.keys()), list(tables.values())
    graph_keys, graph_texts = list(graphs.keys()), list(graphs.values())

    # Embed all at once
    table_embs = embedder.generate_batch(table_texts, show_progress=True)
    graph_embs = embedder.generate_batch(graph_texts, show_progress=True)

    # Cloud API client
    client = anthropic.Anthropic(
        api_key= "" # replace with your actual API key
    )

    results = []
    efficiency = {"checked": 0, "sent_to_api": 0, "high_sim": 0}

    for gi, g_emb in enumerate(graph_embs):
        g_text, g_key = graph_texts[gi], graph_keys[gi]

        sims = [cosine_similarity(g_emb, t_emb) for t_emb in table_embs]
        sims_norm = normalize_scores(sims)

        avg_sim = float(np.mean(sims_norm))
        best_idx = int(np.argmax(sims))
        best_sim, best_table = sims[best_idx], table_texts[best_idx]
        efficiency["checked"] += 1

        print(best_sim, best_table)

        if best_sim >= threshold:
            efficiency["high_sim"] += 1

            prompt = f"""
            You are an expert in data analysis and visualization.

            Task:
            Decide if the graph is a redundant visualization of the table — meaning that all the information in the graph
            can be derived or regenerated directly from the table data (possibly with filtering, aggregation, or converting counts to percentages).

            Rules:
            - Answer "YES" if:
              * The graph uses only a subset of the variables, time ranges, or measures already present in the table.
              * The graph is a simpler or filtered view of the table (e.g., table has numbers and the graph shows percentages, or the table has multiple breakdowns and the graph shows just one).
            - Answer "NO" if:
              * The graph introduces any variable, breakdown, or dimension not included in the table (e.g., family size, gender, region, etc.).
              * The graph includes years or time periods not covered by the table.
              * The graph uses measures not derivable from the table.

            Important:
            We want to mark graphs as "YES" whenever they are redundant and can be safely removed without losing unique information.

            Return !STRICTLY! a JSON object:
            {{
                "is_analysis": "YES" or "NO",
                "explanation": "<short reasoning>"
            }}

            Table header (heb): "{best_table}"
            Graph header/title (heb): "{g_text}"
            """
            print("starting the API call")
            response = client.messages.create(
                # model="claude-3-haiku-20240307",
                model = "claude-3-7-sonnet-20250219",
                max_tokens=500,
                messages=[{"role": "user", "content": prompt}]
            )
            efficiency["sent_to_api"] += 1

            print("got answer")
            try:
                import re
                import ast

                raw_text = response.content[0].text.strip()
                print(raw_text)

                # Extract JSON-like substring that contains "is_analysis"
                match = re.search(r'(\{.*?"is_analysis".*?\})', raw_text, re.DOTALL)
                if match:
                    json_str = match.group(1)
                    try:
                        api_json = json.loads(json_str)
                    except Exception:
                        try:
                            api_json = ast.literal_eval(json_str)
                        except Exception:
                            api_json = {"is_analysis": "NO", "explanation": "Fallback parse failed."}
                else:
                    api_json = {"is_analysis": "NO", "explanation": "No JSON found in response."}
                    print("fallback")


                is_analysis = api_json.get("is_analysis", "NO")
                explanation = api_json.get("explanation", "")
            except Exception:
                is_analysis = "NO"
                explanation = "Could not parse API output safely."
                print("fallback")

        else:
            is_analysis = "NO"
            explanation = f"Similarity {best_sim:.2f} below threshold."

        # --- Save ID → YES/NO ---
        if os.path.exists(json_file_id):
            with open(json_file_id, "r", encoding="utf-8") as f:
                id_data = json.load(f)
        else:
            id_data = {}
        id_data[g_key] = is_analysis

        with open(json_file_id, "w", encoding="utf-8") as f:
            json.dump(id_data, f, ensure_ascii=False, indent=2)

        # --- Save full info object ---
        info_obj = {
        "table": best_table,
        "graph": g_text,
        "similarity": float(best_sim),  # ensure JSON serializable
        "is_analysis": is_analysis,
        "explanation": explanation
        }

        # Load existing data safely, or start with empty dict
        if os.path.exists(json_file_info):
            with open(json_file_info, "r", encoding="utf-8") as f:
                try:
                    info_data = json.load(f)
                    if not isinstance(info_data, dict):
                        info_data = {}  # fallback if JSON is a list or corrupted
                        print("fallback")
                except Exception:
                    info_data = {}
        else:
            info_data = {}

        # Save/update entry
        info_data[g_key] = info_obj

        # Write back to JSON
        with open(json_file_info, "w", encoding="utf-8") as f:
            json.dump(info_data, f, ensure_ascii=False, indent=2)

        print(f"Saved graph {g_key}: {is_analysis}")

    print("✅ All graphs processed and saved.")


## Run
for year in range(2016,2017):
  for chap in range(1,15):
    analyze_graph_redundancy(
          year=str(year),
          chapter_id=f"{chap:02d}",
          # chapter_id= "3",
          table_json_url="/content/tables_summary_all_cleaned.json",
          graph_json_url="/content/graphs_summary.json",
          threshold=0.7
      )






✅ Loaded 0 tables and 0 graphs for chapter 01



Some weights of BertModel were not initialized from the model checkpoint at onlplab/alephbert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batches: 0it [00:00, ?it/s]

Batches: 0it [00:00, ?it/s]



✅ All graphs processed and saved.
✅ Loaded 16 tables and 11 graphs for chapter 02



Some weights of BertModel were not initialized from the model checkpoint at onlplab/alephbert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

0.8565646 לוח 2.7 גילם של ילדי עולים^ (0-17) (מספרים ואחוזים) 2005-2015
starting the API call
got answer
```json
{
    "is_analysis": "NO",
    "explanation": "The graph includes 'ילדים עולים' (immigrant children) as a separate category, which appears to be distinct from 'ילדי עולים' (children of immigrants) that the table focuses on. The graph also starts from 2006 while the table starts from 2005, suggesting they may contain different time periods or data points."
}
```
Saved graph 1_2_2016: NO
0.8774385 לוח  2.8 חלקם של ילדי עולים^ בכלל ילדי היישוב (יישובים בני 5,000 נפש ויותר)  (מספרים ואחוזים)  דצמבר 2015
starting the API call
got answer
```json
{
    "is_analysis": "NO",
    "explanation": "The graph shows data from 2006-2015, but the table only includes data from December 2015. The graph includes historical data over a 10-year period that is not present in the table, making it non-redundant."
}
```
Saved graph 2_2_2016: NO
0.8483849 לוח 2.2 ילדים עולים^ לפי תקופת העלייה לישראל, 



got answer
```json
{
    "is_analysis": "NO",
    "explanation": "The graph shows data specifically from 'קופת חולים מאוחדת' (Meuhedet Health Fund), which is not mentioned in the table. The table shows data from 'תחנות לבריאות המשפחה' (family health stations/Tipat Halav). These are different healthcare services, so the graph contains information not derivable from the table."
}
```
Saved graph 11_2_2016: NO
✅ All graphs processed and saved.
✅ Loaded 14 tables and 9 graphs for chapter 03



Some weights of BertModel were not initialized from the model checkpoint at onlplab/alephbert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

0.89893866 לוח 3.2 ילדים (0-17) במשפחות שבראשן הורה עצמאי^  לפי מצבו המשפחתי של ההורה (מספרים ואחוזים) 2015*
starting the API call
got answer
{
    "is_analysis": "NO",
    "explanation": "The graph shows a time series from 1995-2015, while the table only contains data for 2015. The graph provides historical data across multiple years that is not present in the table, so it contains unique information not derivable from the table."
}
Saved graph 1_3_2016: NO
0.8473906 לוח 3.3 ילדים במשפחות שבראשן הורה עצמאי^ וחלקם בכלל ילדי היישוב (יישובים בני 5,000 נפש ויותר) (מספרים ואחוזים) 2000-2015*
starting the API call
got answer
{
    "is_analysis": "NO",
    "explanation": "The graph shows a breakdown of children in single-parent families by immigrant status (new immigrants vs. established residents), which is not a variable present in the table. The table shows data about single-parent families across different localities, but does not segment this data by immigrant status. Therefore, the gra



got answer
```json
{
    "is_analysis": "NO",
    "explanation": "The graph shows recruitment rates to the IDF and reasons for non-recruitment by birth year and gender, while the table shows data about orphaned children of IDF casualties. These are completely different datasets measuring different phenomena."
}
```
Saved graph 9_3_2016: NO
✅ All graphs processed and saved.
✅ Loaded 10 tables and 6 graphs for chapter 04



Some weights of BertModel were not initialized from the model checkpoint at onlplab/alephbert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

0.9321071 לוח 4.1 גיל, מין וארץ מוצא של ילדים בפנימיות של המינהל לחינוך התיישבותי ועליית הנוער (מספרים ואחוזים) תש"ע (2009/10) - תשע"ה (2014/15)
starting the API call
got answer
```json
{
    "is_analysis": "NO",
    "explanation": "The graph is about 'family type' (סוג משפחה) of children, while the table contains data about age, gender, and country of origin. The graph introduces a new variable (family type) not present in the table data, making it non-redundant."
}
```
Saved graph 1_4_2016: NO
0.94673395 לוח 4.5 ילדים שהושמו בפנימיות לילדים בסיכון,  לפי מגזר חינוכי וסוג פנימייה (מספרים ואחוזים) תשע"ו (2015/16)
starting the API call
got answer
{
    "is_analysis": "YES",
    "explanation": "The graph shows children placed in residential facilities by welfare services, broken down by facility type. All of this information appears to be directly derivable from the table, which contains data on children in residential care facilities by educational sector and type of facility for the sam



got answer
```json
{
    "is_analysis": "YES",
    "explanation": "The graph appears to be visualizing the same data as the table - specifically showing children placed in foster care by the Ministry of Welfare by type of foster care (percentages) for 2015. Both the table and graph cover the same subject, same time period (2015), and use the same metric (percentages). The graph is simply a visual representation of the tabular data."
}
```
Saved graph 6_4_2016: YES
✅ All graphs processed and saved.
✅ Loaded 46 tables and 17 graphs for chapter 05



Some weights of BertModel were not initialized from the model checkpoint at onlplab/alephbert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

0.888209 לוח 5.6 מספר התלמידים לפי יישוב מגורים ושלב חינוך   (יישובים בני 10,000 נפש ויותר)  תשע"ה (2014/15)
starting the API call
got answer
```json
{
    "is_analysis": "NO",
    "explanation": "The graph shows percentages of students by age group and sector (as a percentage of population in that age group), while the table shows absolute numbers of students by locality and education level. The graph introduces different variables (age groups and sectors/demographics) not present in the table, and presents data as percentages of population rather than raw counts."
}
```
Saved graph 1_5_2016: NO
0.89513665 לוח  5.16 הסביבה הפיזית בבית הספר לפי מגזר וכיתה*** (אחוזים*) תשע"ו (2015/16)
starting the API call
got answer
```json
{
  "is_analysis": "NO",
  "explanation": "The graph appears to be showing 'positive feelings toward school' while the table shows data about 'physical environment in school'. These are different measures/variables. The table breaks down physical environment by sect



got answer
```json
{
    "is_analysis": "NO",
    "explanation": "The graph shows 'years of education' of military recruits by gender, which is not included in the table. The table only shows psychotechnical ratings of recruits, not education levels. These are different variables, and the gender breakdown is also not present in the table data."
}
```
Saved graph 17_5_2016: NO
✅ All graphs processed and saved.
✅ Loaded 16 tables and 20 graphs for chapter 06



Some weights of BertModel were not initialized from the model checkpoint at onlplab/alephbert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

0.9384608 לוח 6.3 אחוז התלמידים שבדרך כלל צופים בטלויזיה* ארבע שעות ביום או יותר, בזמנם הפנוי בימי השבוע לפי כיתה, מגזר ומין  2014
starting the API call
got answer
{
    "is_analysis": "NO",
    "explanation": "The graph shows data by year from 2002-2014, while the table only includes data for 2014. The graph includes historical time series information that is not available in the table, making it a non-redundant visualization."
}
Saved graph 1_6_2016: NO
0.90268844 לוח 6.5 השפעת השימוש באמצעים דיגיטליים* על בני נוער (12-17) והוריהם (אחוזים) יולי 2015
starting the API call
got answer
```json
{
    "is_analysis": "NO",
    "explanation": "The graph shows data about digital communication usage for interpersonal purposes, while the table appears to focus on the effects of digital media usage. While they share the same demographic groups (12-17 year olds and their parents) and time period (July/Summer 2015), they measure different aspects of digital usage - effects versus interpersonal com



got answer
```json
{
    "is_analysis": "NO",
    "explanation": "The graph shows the percentage of drivers under 18 compared to the total population of that age group (their share among their peers), which is not data present in or derivable from the table. The table only shows the absolute number and percentage distribution of licensed drivers under 18 by gender and vehicle type, but doesn't provide information about what proportion of the entire under-18 population these drivers represent."
}
```
Saved graph 20_6_2016: NO
✅ All graphs processed and saved.
✅ Loaded 22 tables and 11 graphs for chapter 07



Some weights of BertModel were not initialized from the model checkpoint at onlplab/alephbert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

0.829588 לוח 7.12 ילדים המקבלים קצבת ילד נכה,  לפי יישוב* (מספרים ושיעור לאלף) 1995-2015**
starting the API call
got answer
```json
{
    "is_analysis": "NO",
    "explanation": "The graph shows the percentage of children with disabilities in the total child population for 2015, which requires data about the total child population. The table only contains data about children receiving disability benefits and their rates per thousand, but doesn't include the total child population figures needed to calculate percentages of all children. The graph therefore contains information not derivable from the table alone."
}
```
Saved graph 1_7_2016: NO
0.84129 לוח 7.12 ילדים המקבלים קצבת ילד נכה,  לפי יישוב* (מספרים ושיעור לאלף) 1995-2015**
starting the API call
got answer
```json
{
    "is_analysis": "NO",
    "explanation": "The graph shows data broken down by severity of disability and gender, which are variables not present in the table. The table only shows children receiving disability all



got answer
```json
{
    "is_analysis": "YES",
    "explanation": "The graph shows percentages of mobility device requests by type for 2015, which can be directly calculated from the table that contains the raw numbers of requests by age group and device type for 2014-2015. The graph is simply a percentage transformation of the 2015 data from the table, aggregated across all age groups."
}
```
Saved graph 11_7_2016: YES
✅ All graphs processed and saved.
✅ Loaded 41 tables and 17 graphs for chapter 08



Some weights of BertModel were not initialized from the model checkpoint at onlplab/alephbert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

0.8568476 לוח 8.21 ילדים חולי סרטן* בגיל 0-14    לפי קבוצת אוכלוסייה ומין (מספרים ושיעור ל-100,000) 2009-2013 (ממוצע חמש-שנתי**)
starting the API call
got answer
```json
{
    "is_analysis": "NO",
    "explanation": "The graph shows life expectancy (תוחלת חיים) data from 1985-2015, while the table shows cancer rates among children aged 0-14 for 2009-2013. These are completely different measures (life expectancy vs. cancer rates) covering different time periods, making the graph not redundant."
}
```
Saved graph 1_8_2016: NO
0.9143027 לוח 8.18 תלמידים שאינם אוכלים ארוחת בוקר באמצע השבוע לפי מגזר וכיתה (אחוזים) 2014
starting the API call
got answer
{
    "is_analysis": "NO",
    "explanation": "The graph shows data by gender (מין) which is not included in the table, and also includes years from 2002-2014 while the table only shows data for 2014. The graph introduces variables and time periods not present in the table."
}
Saved graph 2_8_2016: NO
0.809012 לוח 8.13 תלמידים ששמרו על משקלם ב



got answer
```json
{
    "is_analysis": "NO",
    "explanation": "The graph shows infant mortality rates specifically for Muslim and Bedouin infants in the Negev, which is a demographic breakdown not included in the table. Additionally, the graph covers a wider time range (2001-2015) than the table (2005-2015). These differences make the graph contain unique information not derivable from the table."
}
```
Saved graph 17_8_2016: NO
✅ All graphs processed and saved.
✅ Loaded 17 tables and 13 graphs for chapter 09



Some weights of BertModel were not initialized from the model checkpoint at onlplab/alephbert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

0.8367841 לוח  9.1 גיל הילדים הפונים לחדר מיון  לפי סיבת פנייה שנרשמה בעת הפנייה (מספרים ואחוזים) 2014-2015
starting the API call
got answer
```json
{
    "is_analysis": "YES",
    "explanation": "The graph shows percentages of children's ER visits by reason for 2015, which can be directly derived from the table that contains data for 2014-2015 with both numbers and percentages of children's ER visits by age and reason. The graph simply presents a percentage-based visualization of a subset of the table data for the year 2015."
}
```
Saved graph 1_9_2016: YES
0.87228113 לוח 9.3 גיל הילדים (0-17) הפונים לחדרי מיון עקב סיבות חיצוניות, לפי יישוב המגורים (מספרים, אחוזים ושיעור לאלף בקבוצת הגיל ביישוב) 2013-2015 (נתון תלת-שנתי)
starting the API call
got answer
```json
{
    "is_analysis": "NO",
    "explanation": "The graph shows data from 2015 only, while the table contains three-year data from 2013-2015. The graph title specifically indicates it represents 2015 data, whereas the table comb



got answer
```json
{
    "is_analysis": "NO",
    "explanation": "The table shows accident data broken down by injured body part (numbers and percentages) while the graph shows accident data broken down by education level (percentages). The education level dimension is not present in the table, making the graph a unique visualization that provides information not derivable from the table."
}
```
Saved graph 13_9_2016: NO
✅ All graphs processed and saved.
✅ Loaded 0 tables and 0 graphs for chapter 10



Some weights of BertModel were not initialized from the model checkpoint at onlplab/alephbert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batches: 0it [00:00, ?it/s]

Batches: 0it [00:00, ?it/s]



✅ All graphs processed and saved.
✅ Loaded 5 tables and 2 graphs for chapter 11



Some weights of BertModel were not initialized from the model checkpoint at onlplab/alephbert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

0.7910645 לוח 11.3 מספר תיקי החזקת ילדים והסדרי ראייה שנפתחו בבתי דין רבניים,  לפי מקום בית הדין  1996-2015
starting the API call
got answer
```json
{
    "is_analysis": "NO",
    "explanation": "The table shows child custody and visitation cases in rabbinical courts from 1996-2015, while the graph shows decisions by juvenile courts regarding declaring minors as 'in need' from April 2016. These are different legal proceedings, different court systems, different time periods, and different measures."
}
```
Saved graph 1_11_2016: NO
0.8013374 לוח 11.3 מספר תיקי החזקת ילדים והסדרי ראייה שנפתחו בבתי דין רבניים,  לפי מקום בית הדין  1996-2015
starting the API call




got answer
```json
{
    "is_analysis": "NO",
    "explanation": "The graph shows data about requests for approving marriages of minors, while the table shows data about child custody and visitation arrangements cases in rabbinic courts. These are completely different topics with different measures, so the graph contains unique information not found in the table."
}
```
Saved graph 2_11_2016: NO
✅ All graphs processed and saved.
✅ Loaded 40 tables and 10 graphs for chapter 12



Some weights of BertModel were not initialized from the model checkpoint at onlplab/alephbert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

0.8482829 לוח 12.4 תיקים פליליים ותיקי ט.מ. של קטינים חשודים* לפי היישוב** שבו נפתח התיק (מספרים, אחוזים ושיעור לאלף בקבוצת הגיל 12-17 ביישוב) 2005-2015
starting the API call
got answer
```json
{
    "is_analysis": "NO",
    "explanation": "The table shows criminal cases by settlement for 2005-2015, while the graph shows the percentage of minor suspects out of all suspects from 2000-2015. The graph covers a longer time period (starting from 2000) than the table (starting from 2005) and appears to present a different metric (proportion of minors among all suspects) that is not directly derivable from the table data."
}
```
Saved graph 1_12_2016: NO
0.8083395 לוח  12.8 תיקים פליליים ותיקי ט.מ. שנפתחו לקטינים החשודים בעבירות אלימות גופנית נגד קטינים, לפי היישוב שבו בוצעה העבירה  (מספרים ושיעור לאלף בקבוצת הגיל 12-17 ביישוב)  2014-2015
starting the API call
got answer
{
    "is_analysis": "YES",
    "explanation": "The graph shows localities where the rate of criminal cases opened against 

Some weights of BertModel were not initialized from the model checkpoint at onlplab/alephbert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

0.9493308 לוח 13.1 מספר תיקים פליליים* ותיקי טיפול מותנה (ט.מ.) שנפתחו בגין עבירות כנגד קטינים במשפחה ומחוץ למשפחה, לפי סוג תיק  2006-2015
starting the API call
got answer
```json
{
    "is_analysis": "YES",
    "explanation": "The graph shows the number of criminal cases and conditional treatment cases opened against children within and outside families from 2006-2015, which is exactly the same information contained in the table. The graph title directly matches the table header, indicating it's visualizing the same dataset without introducing any new variables or time periods."
}
```
Saved graph 1_13_2016: YES
0.8796124 לוח  13.22 תיקים פליליים שנפתחו לבגירים בחשד לביצוע עבירות כנגד קטינים במוסדות חינוך, לפי סוג עבירה (מספרים ואחוזים) 2006-2015
starting the API call
got answer
```json
{
    "is_analysis": "NO",
    "explanation": "The graph shows criminal cases AND indictments filed for sexual offenses against minors, while the table only shows criminal cases opened against adults su



got answer
```json
{
  "is_analysis": "NO",
  "explanation": "The graph shows data about crimes related to marriage age laws, which is not mentioned in the table. The table focuses on criminal cases against minors inside and outside families, while the graph specifically analyzes marriage age law violations. The graph also covers a wider time range (2000-2015) than the table (2006-2015)."
}
```
Saved graph 11_13_2016: NO
✅ All graphs processed and saved.
✅ Loaded 41 tables and 25 graphs for chapter 14



Some weights of BertModel were not initialized from the model checkpoint at onlplab/alephbert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

0.79503995 לוח 14.4 יישוב המגורים של ילדים המוכרים למחלקות לשירותים חברתיים,   גילם וחלקם בכלל ילדי היישוב (מספרים ואחוזים) 2005-2016*
starting the API call
got answer
```json
{
    "is_analysis": "YES",
    "explanation": "The graph shows localities where 30% or more of children are known to social services departments in January 2016, which is a subset/filtered view of the table data. The table contains information about children known to social services across different localities from 2005-2016, including their percentages in each locality, so the graph simply highlights localities meeting the 30% threshold from the 2016 data in the table."
}
```
Saved graph 1_14_2016: YES
0.7733742 לוח 14.6 מספר הבקשות* לפטור מחובת דיווח** אודות פגיעה בילדים שטופלו, הוגשו ואושרו, לפי מחוז    1995-2015
starting the API call
got answer
```json
{
    "is_analysis": "NO",
    "explanation": "The table focuses on exemption requests from reporting child abuse across districts from 1995-2015, while the g

In [None]:
##cleaning up the table_summmaries.json from the tables first and second row +
# one row table name without /n or hebrew and english format mistakes
## localy running about 1s. ####
# (much needed for the redundancy check)

import os
import csv
import json

def update_summary_headers_second_row_multiple_louch(year_dir, summary_json_path):
    """
    Update summary_table.json headers using the second row of each CSV:
    - Keep all cells containing 'לוח' aside (in order)
    - Check duplicates among other cells (ignoring empty strings)
    - Skip if more than one distinct non-empty text exists outside 'לוח'
    - Build header: all 'לוח' cells concatenated + duplicate text
    - Remove newlines so header is a single line
    """
    # Load the JSON
    with open(summary_json_path, "r", encoding="utf-8") as f:
        summary_data = json.load(f)

    # Walk through all CSVs
    for root, _, files in os.walk(year_dir):
        for file in files:
            if not file.endswith(".csv"):
                continue
            csv_path = os.path.join(root, file)

            # Extract JSON key from filename (remove .csv)
            json_key = file[:-4]
            if json_key not in summary_data:
                print(f"JSON key not found for {file}, skipping.")
                continue

            # Read the second row
            with open(csv_path, "r", encoding="utf-8-sig") as f:
                reader = csv.reader(f)
                try:
                    next(reader)  # Skip first row
                    second_row = next(reader)  # Read second row
                except StopIteration:
                    print(f"CSV has less than 2 rows: {file}, skipping.")
                    continue

            # Separate 'לוח' cells and other cells (ignore empty strings)
            louch_cells = [cell for cell in second_row if "לוח" in cell]
            other_cells = [cell for cell in second_row if cell not in louch_cells and cell != ""]

            if not louch_cells:
                print(f"No 'לוח' cell found in second row, skipping: {file}")
                continue

            # Check duplicates among other_cells
            unique_texts = set(other_cells)
            if len(unique_texts) > 1:
                print(f"More than one distinct non-empty text in second row (excluding 'לוח'), skipping: {file}")
                continue

            # Determine duplicate text (empty if nothing else)
            duplicate_text = other_cells[0] if other_cells else ""
            louch_cell = louch_cells[0]  if louch_cells else ""

            # Build cleaned header: concatenate 'לוח' + duplicate, remove newlines
            cleaned_header = " ".join([louch_cell] + ([duplicate_text] if duplicate_text else []))
            cleaned_header = cleaned_header.replace("\n", " ").replace("\r", " ").strip()

            # Update JSON
            summary_data[json_key] = cleaned_header

    # Save updated JSON
    with open(summary_json_path, "w", encoding="utf-8") as f:
        json.dump(summary_data, f, ensure_ascii=False, indent=4)

    print("Summary JSON updated successfully.")


# # Example usage
for year in range(2001,2017):
   update_summary_headers_second_row_multiple_louch(f"/content/tables_2001-2016/{year}", "/content/tables_summary_all_cleaned.json")


In [1]:
## extracting releavant graphs into a new directory - convertion prep ###

import os
import json
import shutil

# Paths
graphs_dir = "/content/graphs"
redundant_json_path = "/content/redundant_graphs.json"
graphs_summary_path = "/content/graphs_summary.json"
relevant_dir = "/content/relevant_graphs"
relevant_summary_path = "/content/relevant_graphs_summary.json"

# Load JSONs
with open(redundant_json_path, "r", encoding="utf-8") as f:
    redundant_map = json.load(f)

with open(graphs_summary_path, "r", encoding="utf-8") as f:
    graphs_summary = json.load(f)

# Ensure relevant directory exists
os.makedirs(relevant_dir, exist_ok=True)

# ---------- Helper ----------
def filename_to_key(fname):
    """
    Convert filename '2023_03__04.png' -> '4_3_2023' to match JSON keys.
    Handles double underscores before the serial number and strips leading zeros.
    """
    base = os.path.splitext(fname)[0]
    parts = [p for p in base.split("_") if p != ""]  # remove empty strings
    if len(parts) >= 3:
        year, chapter, serial = parts[:3]
        chapter = str(int(chapter))  # remove leading zeros
        serial = str(int(serial))    # remove leading zeros
        return f"{serial}_{chapter}_{year}"
    return base  # fallback

# ---------- Filter relevant graphs ----------
relevant_summary = {}

for year in os.listdir(graphs_dir):
    year_path = os.path.join(graphs_dir, year)
    if not os.path.isdir(year_path):
        continue

    for chapter in os.listdir(year_path):
        chapter_path = os.path.join(year_path, chapter)
        if not os.path.isdir(chapter_path):
            continue

        for fname in os.listdir(chapter_path):
            if not fname.lower().endswith((".png", ".jpg", ".jpeg", ".pdf")):
                continue

            key = filename_to_key(fname)

            if redundant_map.get(key, "NO") == "NO":  # only keep relevant
                # Copy to relevant directory
                new_year_dir = os.path.join(relevant_dir, year)
                new_chap_dir = os.path.join(new_year_dir, chapter)
                os.makedirs(new_chap_dir, exist_ok=True)
                shutil.copy2(os.path.join(chapter_path, fname), os.path.join(new_chap_dir, fname))

                # Add to relevant summary if exists
                if key in graphs_summary:
                    relevant_summary[key] = graphs_summary[key]

# Save relevant summary JSON
with open(relevant_summary_path, "w", encoding="utf-8") as f:
    json.dump(relevant_summary, f, ensure_ascii=False, indent=2)

print("✅ Filtering complete")
print(f"Relevant graphs saved under: {relevant_dir}")
print(f"Relevant summary saved as: {relevant_summary_path}")


FileNotFoundError: [Errno 2] No such file or directory: '/content/redundant_graphs.json'

In [None]:
##validation to decesion - randomlly chosing cases (default 40) to check by hand ###

import json
import random

def sample_yes_cases(json_path, out_path="/content/yes_samples_validation.txt", n=40, seed=42):
    # Load JSON
    with open(json_path, "r") as f:
        data = json.load(f)

    # Collect only "YES" cases
    yes_items = [(k, v) for k, v in data.items() if v.get("is_analysis") == "YES"]

    # Sample randomly (uniform)
    random.seed(seed)  # reproducibility
    sample = random.sample(yes_items, min(n, len(yes_items)))

    # Write to file in readable format
    with open(out_path, "w", encoding="utf-8") as f:
        for key, entry in sample:
            f.write(f"Key: {key}\n")
            f.write(f"Table: {entry.get('table','')}\n")
            f.write(f"Graph: {entry.get('graph','')}\n")
            f.write(f"Similarity: {entry.get('similarity','')}\n")
            f.write(f"Explanation: {entry.get('explanation','')}\n")
            f.write("="*80 + "\n")

    print(f"✅ Saved {len(sample)} YES cases to {out_path}")

# Example usage
sample_yes_cases("/content/redundant_graphs.json")


In [None]:
## validation of extraction
## ran locally in my computer ###

#!/usr/bin/env python3
# generate_checks.py
import json
import random
from pathlib import Path
from collections import OrderedDict
import csv

# External libs
from docx import Document
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from reportlab.lib.units import mm

# ---------- CONFIG ----------
JSON_FILENAME = "graphs_summary.json"   # file expected in same folder
OUT_DIR = Path("checks_output")
OUT_DIR.mkdir(parents=True, exist_ok=True)
PERSONS = ["shira", "suf", "elisheva", "shaul"]
TABLES_PER_PERSON = 10
PAGES_PER_PERSON = 10
TOTAL_CHECKS_PER_PERSON = TABLES_PER_PERSON + PAGES_PER_PERSON
YEARS_ALLOWED = list(range(2001, 2017)) + [2019, 2021, 2022, 2023]
SEED = 42  # set to None for nondeterministic sampling
# ----------------------------

random.seed(SEED)

# ---------- load JSON ----------
json_path = Path(JSON_FILENAME)
if not json_path.exists():
    raise FileNotFoundError(f"JSON file not found: {json_path.resolve()}\nPut your graphs_summary.json next to this script")

with json_path.open("r", encoding="utf-8") as f:
    data = json.load(f)

# Normalize items -> list of (key, description)
all_items = []
for k, v in data.items():
    if isinstance(v, str):
        desc = v.replace("\n", " ").strip()  # <--- preprocessing here
    elif isinstance(v, dict):
        desc = (
            v.get("table")
            or v.get("graph")
            or v.get("explanation")
            or json.dumps(v, ensure_ascii=False)
        )
        desc = desc.replace("\n", " ").strip()  # <--- preprocessing here too
    else:
        desc = str(v).replace("\n", " ").strip()  # <--- also here
    all_items.append((k, desc))

if not all_items:
    raise RuntimeError("No items found in JSON file.")

# ---------- helper ----------
def parse_key(key):
    parts = key.split("_")
    try:
        serial = int(parts[0]); chapter = int(parts[1]); year = int(parts[2])
        return serial, chapter, year
    except Exception:
        return None, None, None

def sample_unique(n, pool):
    """Uniform sample without replacement if possible else with replacement."""
    if n <= len(pool):
        return random.sample(pool, n)
    else:
        # not enough unique items: sample all then fill with random choices
        out = pool.copy()
        while len(out) < n:
            out.append(random.choice(pool))
        return out

# ---------- sampling per person ----------
persons_data = {}
for person in PERSONS:
    tables = sample_unique(TABLES_PER_PERSON, all_items)
    page_checks = []
    for _ in range(PAGES_PER_PERSON):
        page = random.randint(5, 30)
        chapter = random.randint(1, 15)
        year = random.choice(YEARS_ALLOWED)
        page_checks.append({"page": page, "chapter": chapter, "year": year})
    persons_data[person] = {"tables": tables, "page_checks": page_checks}

# ---------- save CSV of sampled keys ----------
csv_path = OUT_DIR / "sampled_keys.csv"
with csv_path.open("w", encoding="utf-8", newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["person", "kind", "index", "key", "chapter", "year", "short_description"])
    for person, pdata in persons_data.items():
        for i, (key, desc) in enumerate(pdata["tables"], start=1):
            _, chapter, year = parse_key(key)
            short = desc.replace("\n", " ")[:250]
            writer.writerow([person, "table", i, key, chapter, year, short])
        for i, chk in enumerate(pdata["page_checks"], start=1):
            writer.writerow([person, "page_check", i, "", chk["chapter"], chk["year"], f"page {chk['page']}"])

print(f"Saved CSV of sampled keys to {csv_path}")

# ---------- create DOCX per person ----------
for person, pdata in persons_data.items():
    doc = Document()
    doc.add_heading(f"Checks for {person.capitalize()}", level=1)
    doc.add_paragraph(f"Total checks: {TOTAL_CHECKS_PER_PERSON} (10 table checks + 10 page checks)")
    doc.add_paragraph("Instructions: For each table entry, go to the indicated chapter/year and confirm whether the table exists. Mark Found / Not Found and add notes.\n")
    doc.add_heading("Part A — Table entries (from our summary)", level=2)
    for i, (key, desc) in enumerate(pdata["tables"], start=1):
        serial, chapter, year = parse_key(key)
        doc.add_paragraph(f"{i}. Key: {key}  (Chapter: {chapter}, Year: {year})")
        short_desc = desc if len(desc) < 600 else desc[:600] + " ..."
        doc.add_paragraph(short_desc)
        doc.add_paragraph("[ ] Found   [ ] Not Found\n")
    doc.add_heading("Part B — Page checks", level=2)
    for i, chk in enumerate(pdata["page_checks"], start=1):
        doc.add_paragraph(f"{i}. Page: {chk['page']}   Chapter: {chk['chapter']}   Year: {chk['year']}")
        doc.add_paragraph("Was there a table on this page? [ ] Yes  [ ] No    If yes: which table key? ______   Exists in our data? [ ] Yes [ ] No \n")
    doc_path = OUT_DIR / f"{person}_checks.docx"
    doc.save(doc_path)
    print(f"Wrote {doc_path}")

# ---------- create combined PDF ----------
pdf_path = OUT_DIR / "checks_all_people.pdf"
c = canvas.Canvas(str(pdf_path), pagesize=A4)
width, height = A4
margin = 18 * mm
x = margin
y_start = height - margin
line_h = 12
font_name = "Helvetica"

for person, pdata in persons_data.items():
    y = y_start
    c.setFont(font_name, 14)
    c.drawString(x, y, f"Checks for {person.capitalize()}")
    y -= 18
    c.setFont(font_name, 10)
    c.drawString(x, y, f"Total checks: {TOTAL_CHECKS_PER_PERSON} (10 table checks + 10 page checks)")
    y -= 14
    c.drawString(x, y, "Part A — Table entries (tick Found / Not Found).")
    y -= 14

    for i, (key, desc) in enumerate(pdata["tables"], start=1):
        serial, chapter, year = parse_key(key)
        c.setFont(font_name, 9)
        c.drawString(x, y, f"{i}. Key: {key}  (Ch {chapter}, Yr {year})")
        # checkboxes
        bx = width - margin - 120
        c.rect(bx, y-4, 8, 8)
        c.drawString(bx+12, y, "Found")
        c.rect(bx+56, y-4, 8, 8)
        c.drawString(bx+68, y, "Not Found")
        y -= line_h
        # wrapped description
        desc_short = desc.replace("\n", " ")
        max_chars = 120
        words = desc_short.split()
        cur = ""
        for w in words:
            if len(cur) + len(w) + 1 <= max_chars:
                cur += (" " + w) if cur else w
            else:
                c.drawString(x+6, y, cur)
                y -= 10
                cur = w
                if y < margin + 60:
                    c.showPage()
                    y = y_start
        if cur:
            c.drawString(x+6, y, cur)
            y -= 14
        if y < margin + 80:
            c.showPage()
            y = y_start

    c.setFont(font_name, 11)
    c.drawString(x, y, "Part B — Page checks")
    y -= 14
    c.setFont(font_name, 10)
    for i, chk in enumerate(pdata["page_checks"], start=1):
        c.drawString(x, y, f"{i}. Page {chk['page']}   Chapter {chk['chapter']}   Year {chk['year']}")
        bx = width - margin - 170
        c.rect(bx, y-4, 8, 8)
        c.drawString(bx+12, y, "Table present?")
        c.rect(bx+90, y-4, 8, 8)
        c.drawString(bx+102, y, "Exists in data?")
        y -= line_h
        c.drawString(x+10, y, "If yes: which table key? ___________ ")
        y -= 18
        if y < margin + 80:
            c.showPage()
            y = y_start

    c.showPage()

c.save()
print(f"Wrote combined PDF: {pdf_path}")

print("\nAll outputs written to:", OUT_DIR.resolve())


In [20]:
# statistics of extracted YES/NO (arround 20% redundent) ###

import json
from collections import defaultdict
import pandas as pd

def analyze_redundant_graphs(json_path="/content/redundant_graphs.json"):
    # Load JSON
    with open(json_path, "r") as f:
        data = json.load(f)

    # Counting structures
    yearly_counts = defaultdict(lambda: {"YES": 0, "NO": 0})
    chapter_counts = defaultdict(lambda: {"YES": 0, "NO": 0})
    overall_counts = {"YES": 0, "NO": 0}

    # Parse keys
    for key, value in data.items():
        try:
            serial, chapter, year = map(int, key.split("_"))
        except ValueError:
            continue  # skip malformed keys

        # Update counts
        yearly_counts[year][value] += 1
        chapter_counts[(year, chapter)][value] += 1
        overall_counts[value] += 1

    # Yearly stats
    year_df = pd.DataFrame.from_dict(yearly_counts, orient="index").sort_index()
    year_df["Total"] = year_df["YES"] + year_df["NO"]
    year_df["%YES"] = (year_df["YES"] / year_df["Total"] * 100).round(2)
    year_df["%NO"] = (year_df["NO"] / year_df["Total"] * 100).round(2)

    # Chapter stats
    chapter_df = pd.DataFrame.from_dict(chapter_counts, orient="index")
    chapter_df.index = pd.MultiIndex.from_tuples(chapter_df.index, names=["Year", "Chapter"])
    chapter_df = chapter_df.sort_index()
    chapter_df["Total"] = chapter_df["YES"] + chapter_df["NO"]
    chapter_df["%YES"] = (chapter_df["YES"] / chapter_df["Total"] * 100).round(2)
    chapter_df["%NO"] = (chapter_df["NO"] / chapter_df["Total"] * 100).round(2)

    # Overall stats with % too
    overall_total = overall_counts["YES"] + overall_counts["NO"]
    overall_percent = {
        "YES": round(overall_counts["YES"] / overall_total * 100, 2),
        "NO": round(overall_counts["NO"] / overall_total * 100, 2),
    }

    # Print summaries
    print("=== Overall Counts ===")
    print(overall_counts)
    print("=== Overall % ===")
    print(overall_percent)
    print("\n=== Yearly Stats ===")
    display(year_df)
    print("\n=== Chapter Stats (per year, per chapter) ===")
    display(chapter_df)

    return overall_counts, year_df, chapter_df

analyze_redundant_graphs()

=== Overall Counts ===
{'YES': 369, 'NO': 1421}
=== Overall % ===
{'YES': 20.61, 'NO': 79.39}

=== Yearly Stats ===


Unnamed: 0,YES,NO,Total,%YES,%NO
2001,24,73,97,24.74,75.26
2002,25,87,112,22.32,77.68
2003,29,78,107,27.1,72.9
2004,30,75,105,28.57,71.43
2005,21,72,93,22.58,77.42
2006,20,89,109,18.35,81.65
2007,18,92,110,16.36,83.64
2008,19,90,109,17.43,82.57
2009,19,116,135,14.07,85.93
2010,30,115,145,20.69,79.31



=== Chapter Stats (per year, per chapter) ===


Unnamed: 0_level_0,Unnamed: 1_level_0,YES,NO,Total,%YES,%NO
Year,Chapter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2001,1,0,2,2,0.00,100.00
2001,2,3,3,6,50.00,50.00
2001,3,1,11,12,8.33,91.67
2001,4,1,0,1,100.00,0.00
2001,5,3,6,9,33.33,66.67
...,...,...,...,...,...,...
2015,10,2,15,17,11.76,88.24
2015,11,0,2,2,0.00,100.00
2015,12,1,9,10,10.00,90.00
2015,13,1,15,16,6.25,93.75


({'YES': 369, 'NO': 1421},
       YES   NO  Total   %YES    %NO
 2001   24   73     97  24.74  75.26
 2002   25   87    112  22.32  77.68
 2003   29   78    107  27.10  72.90
 2004   30   75    105  28.57  71.43
 2005   21   72     93  22.58  77.42
 2006   20   89    109  18.35  81.65
 2007   18   92    110  16.36  83.64
 2008   19   90    109  17.43  82.57
 2009   19  116    135  14.07  85.93
 2010   30  115    145  20.69  79.31
 2011   26  120    146  17.81  82.19
 2012   21   97    118  17.80  82.20
 2013   18   96    114  15.79  84.21
 2014   31  101    132  23.48  76.52
 2015   38  120    158  24.05  75.95,
               YES  NO  Total    %YES     %NO
 Year Chapter                                
 2001 1          0   2      2    0.00  100.00
      2          3   3      6   50.00   50.00
      3          1  11     12    8.33   91.67
      4          1   0      1  100.00    0.00
      5          3   6      9   33.33   66.67
 ...           ...  ..    ...     ...     ...
 2015 10    