In [None]:
!{sys.executable} -m pip install nltk spacy lxml
!{sys.executable} -m spacy download en_core_web_sm

In [None]:
import sys

import os
import pandas as pd
from bs4 import BeautifulSoup
import random
import matplotlib as mpl
import matplotlib.pyplot as plt
import json
import locale
from collections import defaultdict
import datetime

locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
import datetime as dt

# NLTK imports
import nltk

nltk.data.path.append('../nltk_data/')
nltk.download('stopwords')
import string
from nltk import collocations
from nltk.text import Text
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import sentiwordnet as swn
from nltk import RegexpParser
from nltk.tree import *

# spaCy imports
import spacy
from spacy.symbols import nsubj, VERB

from time import sleep
from lxml import etree

In [None]:
def lemmatize_verb(verb):
    """Return the lemma of the first token of ``verb``.

    The text is run through the module-level spaCy pipeline ``nlp`` (created in
    a later cell, so this function must only be called after that cell ran).

    Parameters
    ----------
    verb : str
        Surface form to lemmatize.

    Returns
    -------
    The ``lemma_`` of the first token. If ``verb`` is empty/None, or the
    pipeline yields no tokens, ``verb`` is returned unchanged instead of
    raising on ``doc[0]``.
    """
    # Nothing to lemmatize: avoid feeding None/"" to the pipeline.
    if not verb:
        return verb

    doc = nlp(verb)
    # A document with no tokens has no doc[0]; fall back to the input.
    if len(doc) == 0:
        return verb
    return doc[0].lemma_

In [None]:
# Affiliation term lists used to decide which side a subject word belongs to.
# JSON is read as UTF-8 explicitly rather than relying on the locale default,
# so the lists load identically on every platform.
with open('pal_affiliations.json', 'r', encoding='utf-8') as file:
    PALESTINE_MEMBER_AFFILIATIONS = json.load(file)

with open('israel_affiliations.json', 'r', encoding='utf-8') as file:
    ISRAEL_MEMBER_AFFILIATIONS = json.load(file)

In [None]:
# --- NLP resources -----------------------------------------------------------
# Small English spaCy pipeline; lemmatize_verb() reads it through this global.
nlp = spacy.load('en_core_web_sm')

# NLTK helpers: WordNet lemmatizer and the English stop-word set.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# --- Dataset selection -------------------------------------------------------
sample_size = 1

# Directory holding the result files, and a listing of what is in it.
results_prefix = "./results"
input_files = os.listdir(f"{results_prefix}/")

In [None]:
def follow_compound(dep_idx, dependencies_by_governor):
    """Walk a chain of compound relations and collect the glosses along it.

    Starting at ``dep_idx``, the first "compound" / "compound:prt" dependency
    governed by the current token is taken, its ``dependentGloss`` is recorded,
    and the walk continues from its ``dependent``. The walk ends when the
    current token governs no compound relation or when it reaches a token that
    was already visited (cycle guard).

    Parameters
    ----------
    dep_idx
        Index of the token to start from.
    dependencies_by_governor
        Mapping from a governor index to the list of dependency dicts it
        governs; indexed with ``[]`` for every token on the chain.

    Returns
    -------
    set
        The glosses collected along the chain (empty if there is none).
    """
    compound_relations = ("compound:prt", "compound")
    seen = {dep_idx}
    glosses = set()
    node = dep_idx

    while True:
        # Only the first compound relation of each governor is followed.
        link = next(
            (edge for edge in dependencies_by_governor[node]
             if edge["dep"] in compound_relations),
            None,
        )
        if link is None:
            break

        node = link["dependent"]
        glosses.add(link["dependentGloss"])

        # The gloss is recorded even when the target closes a cycle.
        if node in seen:
            break
        seen.add(node)

    return glosses

In [None]:
def investigate_subject(subject, dependencies_by_governor, verbose=False):
    """Collect descriptors of a subject from the dependencies it governs.

    Descriptors are returned in tiers, ordered by "closeness" to the subject.

    Parameters
    ----------
    subject : sequence
        ``subject[1]`` is the subject's word and always goes into tier 0.
        ``subject[0]`` is used as the subject's token index.
        NOTE(review): the original body read an undefined name ``subj_idx``
        (it only worked if a global of that name leaked from the caller);
        taking it from ``subject[0]`` assumes an (index, gloss) pair — confirm
        against callers.
    dependencies_by_governor : mapping
        Governor index -> list of dependency dicts, each read through its
        "dep", "dependent" and "dependentGloss" keys.
    verbose : bool, optional
        Print the matched relations while scanning. Defaults to False, which
        matches the previous hard-coded setting.

    Returns
    -------
    list
        ``[tier0, tier1, tier2, quantity]`` where the tiers are sets of words:
        tier 0 holds the subject, its amod/appos dependents and the compound
        chain of an apposition; tier 1 holds acl, nmod and advmod dependents;
        tier 2 holds subjects/objects of a descriptive clause and adjectives of
        an apposition. ``quantity`` starts at 1 and is replaced by a nummod
        dependent (parsed with ``locale.atoi`` when possible, else the raw
        gloss).
    """
    clause_relations = ("acl", "acl:relcl")
    clause_argument_relations = (
        "nsubj", "nsubj:pass", "nsubj:outer",
        "csubj", "csubj:pass", "csubj:outer",
        "obj",
    )
    nominal_modifier_relations = ("nmod", "nmod:npmod", "nmod:tmod", "nmod:poss")

    subj_idx = subject[0]

    subject_descriptors = [set(), set(), set(), 1]
    subject_descriptors[0].add(subject[1])

    # AMOD takes precedence over NMOD? takes precedence over ACL
    for dep in dependencies_by_governor[subj_idx]:
        relation = dep["dep"]

        if relation == "amod":
            subject_descriptors[0].add(dep["dependentGloss"])
            if verbose:
                print(dep["dep"], dep["dependentGloss"], "\n")

        elif relation in clause_relations:
            subject_descriptors[1].add(dep["dependentGloss"])
            if verbose:
                print(dep["dep"], dep["dependentGloss"])
                print(dependencies_by_governor[dep["dependent"]])
                print()
            # Subjects and objects of the descriptive clause go one tier down.
            for double_dep in dependencies_by_governor[dep["dependent"]]:
                if double_dep["dep"] in clause_argument_relations:
                    subject_descriptors[2].add(double_dep["dependentGloss"])

        elif relation in nominal_modifier_relations:
            subject_descriptors[1].add(dep["dependentGloss"])
            if verbose:
                print(dep["dep"], dep["dependentGloss"], "\n")

        elif relation == "advmod":
            subject_descriptors[1].add(dep["dependentGloss"])  # TODO: CHECK THIS
            if verbose:
                print(dep["dep"], dep["dependentGloss"], "\n")

        elif relation == "appos":
            subject_descriptors[0].add(dep["dependentGloss"])  # TODO: CHECK THIS
            # The apposition's compound chain counts as tier 0, its adjectives
            # as tier 2.
            to_add = follow_compound(dep["dependent"], dependencies_by_governor)
            if verbose:
                print(dep["dep"], dep["dependentGloss"], "\n", to_add)
            subject_descriptors[0].update(to_add)
            for double_dep in dependencies_by_governor[dep["dependent"]]:
                if double_dep["dep"] == "amod":
                    subject_descriptors[2].add(double_dep["dependentGloss"])

        # "ccomp" dependents are not inspected (an earlier attempt was disabled).

        elif relation == "nummod":
            try:
                subject_descriptors[3] = locale.atoi(dep["dependentGloss"])
            except (ValueError, TypeError, AttributeError):
                # Not a parseable integer (e.g. "three"): keep the raw gloss.
                # Narrowed from a bare ``except`` so that KeyboardInterrupt and
                # similar are no longer swallowed.
                subject_descriptors[3] = dep["dependentGloss"]

    return subject_descriptors

In [None]:
def extract_sentences(sentences):
    """Rebuild the text of every sentence of an article from its tokens.

    Parameters
    ----------
    sentences : list of dict
        Each dict is read through "index" (position in the result list) and
        "tokens" (list of dicts with "before", "word" and "after").

    Returns
    -------
    list
        ``result[sentence["index"]]`` is the reconstructed text. Slots whose
        index never occurs stay ``None``.

    Notes
    -----
    The text is the first token's "before" followed by "word" + "after" of
    every token. The previous version concatenated before + word + after for
    each token; assuming the usual convention where a token's "after" is the
    same whitespace as the next token's "before" (NOTE(review): confirm for
    this data), that duplicated every gap ("Hello  world"), which made
    multi-word substring checks on the result fail.
    """
    sentences_text = [None] * len(sentences)
    for sentence in sentences:
        tokens = sentence["tokens"]
        if tokens:
            # Leading whitespace once, then each word with its trailing gap.
            pieces = [tokens[0]["before"]]
            for token in tokens:
                pieces.append(token["word"])
                pieces.append(token["after"])
            sentence_text = "".join(pieces)
        else:
            sentence_text = ""

        sentences_text[sentence["index"]] = sentence_text
    return sentences_text

In [None]:
from IPython.display import clear_output
def refresh_screen():
    """Clear the current cell output, then pause briefly.

    Calls IPython's ``clear_output()`` followed by a 0.02 s ``sleep``.
    """
    pause_seconds = 0.02
    clear_output()
    sleep(pause_seconds)

In [None]:
# Article index: one row per article; the 'date' column is parsed to
# datetimes, letting pandas infer the format of each value individually.
df = (
    pd.read_csv('./data/summary_20240421_articles.csv')
    .assign(date=lambda articles: pd.to_datetime(articles['date'], format='mixed'))
)

In [None]:
root_directory = './'

# Lower-case substrings that mark an article as mentioning each topic.
antisemitic_keywords = ("antisemitic", "antisemitism", "anti-jew", "anti-semitic", "anti-semitism")
islamophobic_keywords = ("islamophobic", "islamophobia", "anti-muslim", "anti-arab")

# Article-level tallies
both_present_count = 0
antisemitic_only_count = 0
islamophobic_only_count = 0

# Loop through each article
for index, row in df.iterrows():
    results_file = row['results_file']
    filename = root_directory + results_file

    try:
        with open(filename, encoding='utf-8') as d:
            data = json.load(d)
    except FileNotFoundError:
        print('FILE NOT FOUND')
        continue

    # The original article text was previously opened here as well, but never
    # used; that read was dropped so a missing article file cannot abort the run.

    sentences = data["sentences"]
    text_all_sentences = extract_sentences(sentences)

    # Lower-case every sentence once. extract_sentences() leaves None in slots
    # whose index never occurred, so those are skipped.
    lowered_sentences = [text.lower() for text in text_all_sentences if text is not None]

    antisemitic_present = any(
        keyword in text for text in lowered_sentences for keyword in antisemitic_keywords
    )
    islamophobic_present = any(
        keyword in text for text in lowered_sentences for keyword in islamophobic_keywords
    )

    # Update counts based on keyword presence
    if antisemitic_present and islamophobic_present:
        both_present_count += 1
    elif antisemitic_present:
        antisemitic_only_count += 1
    elif islamophobic_present:
        islamophobic_only_count += 1

# Print or store the counts as needed
print("Articles containing both 'antisemitic'/'antisemitism' and 'Islamophobic'/'Islamophobia':", both_present_count)
print("Articles containing 'antisemitic'/'antisemitism' but not 'Islamophobic'/'Islamophobia':", antisemitic_only_count)
print("Articles containing 'Islamophobic'/'Islamophobia' but not 'antisemitic'/'antisemitism':", islamophobic_only_count)

In [None]:
def count_passive_objects(sentences, counts):
    """Tally affiliated subjects by grammatical voice.

    Every subject dependency whose ``dependentGloss`` is listed in
    ``ISRAEL_MEMBER_AFFILIATIONS`` or ``PALESTINE_MEMBER_AFFILIATIONS``
    increments ``counts['israel'|'palestine'][voice]``:

    * ``'active'``              – relation "nsubj"
    * ``'passive_no_agent'``    – relation "nsubj:pass"/"csubj:pass" in a
      sentence without an agent relation
    * ``'passive_with_agent'``  – same, in a sentence with an agent relation

    The previous version looked for "nsubj" inside sentences flagged as
    passive, so subjects labelled "nsubj:pass" were never counted and active
    subjects that shared a sentence with a passive clause were recorded as
    passive. Each subject is now classified by its own relation.

    Parameters
    ----------
    sentences : list of dict
        Each dict provides "basicDependencies", a list of dependency dicts
        read through "dep" and "dependentGloss".
    counts : mapping
        Nested counter ``counts[side][voice]`` supporting ``+= 1``; mutated
        in place.

    Returns
    -------
    The same ``counts`` object.
    """
    passive_subject_relations = ("nsubj:pass", "csubj:pass")
    # NOTE(review): a plain "nmod" anywhere in the sentence is treated as an
    # agent marker, exactly as before; that looks broader than an explicit
    # agent relation — confirm this is intended.
    agent_relations = ("nmod", "obl:agent")

    for sentence in sentences:
        dependencies = sentence["basicDependencies"]

        # Agent presence is decided once per sentence.
        has_agent = any(dep["dep"] in agent_relations for dep in dependencies)
        passive_key = 'passive_with_agent' if has_agent else 'passive_no_agent'

        for dep in dependencies:
            relation = dep["dep"]
            if relation in passive_subject_relations:
                voice = passive_key
            elif relation == "nsubj":
                voice = 'active'
            else:
                continue

            subject_word = dep.get("dependentGloss")
            if subject_word in ISRAEL_MEMBER_AFFILIATIONS:
                counts['israel'][voice] += 1
            elif subject_word in PALESTINE_MEMBER_AFFILIATIONS:
                counts['palestine'][voice] += 1

    return counts

In [None]:
root_directory = './'

# counts[side][voice] -> number of affiliated subjects seen in that voice
counts = defaultdict(lambda: defaultdict(int))

for index, row in df.iterrows():
    results_file = row['results_file']

    # Open NLP-analyzed result
    filename = root_directory + results_file
    try:
        with open(filename, encoding='utf-8') as d:
            data = json.load(d)
    except FileNotFoundError:
        print('FILE NOT FOUND')
        continue

    # The article text, title and date were previously loaded here but never
    # used; they were dropped so a missing article file cannot abort the run
    # and no file handle is left unmanaged.

    # Extract NLP results
    sentences = data["sentences"]
    counts = count_passive_objects(sentences, counts)

with open('patient_agent_counts.json', 'w', encoding='utf-8') as json_file:
    json.dump(counts, json_file, indent=4)

In [None]:
# Display the nested tally filled by the count_passive_objects() loop.
counts

In [None]:
# Share of Israel-affiliated subjects that are passive with no agent.
# NaN (instead of ZeroDivisionError) when no such subject was counted at all.
israel_subject_total = (
    counts['israel']['passive_no_agent']
    + counts['israel']['passive_with_agent']
    + counts['israel']['active']
)
counts['israel']['passive_no_agent'] / israel_subject_total if israel_subject_total else float('nan')




In [None]:
# Share of Palestine-affiliated subjects that are passive with no agent.
# NaN (instead of ZeroDivisionError) when no such subject was counted at all.
palestine_subject_total = (
    counts['palestine']['passive_no_agent']
    + counts['palestine']['passive_with_agent']
    + counts['palestine']['active']
)
counts['palestine']['passive_no_agent'] / palestine_subject_total if palestine_subject_total else float('nan')


In [None]:
def get_active_verbs_with_affiliation_subjects(sentences, verb_counts):
    """Count root-verb lemmas of sentences with an affiliated active subject.

    For each sentence, the "nsubj"/"csubj" dependencies are scanned in order;
    the first one whose ``dependentGloss`` is in ``ISRAEL_MEMBER_AFFILIATIONS``
    or ``PALESTINE_MEMBER_AFFILIATIONS`` decides the category ("israel" is
    checked before "palestine"). The gloss of every "ROOT" dependency of that
    sentence is then lemmatized with ``lemmatize_verb`` and
    ``verb_counts[category][lemma]`` is incremented.

    Note that the verb counted is the sentence root, which is not necessarily
    the verb that governs the matched subject.

    Parameters
    ----------
    sentences : list of dict
        Each dict provides "basicDependencies".
    verb_counts : mapping
        ``verb_counts[category]`` must be a dict-like with ``.get``; mutated
        in place.

    Returns
    -------
    The same ``verb_counts`` object.
    """
    active_subject_relations = ("nsubj", "csubj")

    for sentence in sentences:
        relations = sentence["basicDependencies"]

        # Find the side of the first affiliated active subject, if any.
        category = None
        for relation in relations:
            if relation["dep"] not in active_subject_relations:
                continue
            gloss = relation.get("dependentGloss")
            if gloss in ISRAEL_MEMBER_AFFILIATIONS:
                category = "israel"
                break
            if gloss in PALESTINE_MEMBER_AFFILIATIONS:
                category = "palestine"
                break

        if category is None:
            continue

        # Credit the sentence's root verb to that side.
        for relation in relations:
            if relation["dep"] != "ROOT":
                continue
            lemma = lemmatize_verb(relation.get("dependentGloss"))
            tally = verb_counts[category]
            tally[lemma] = tally.get(lemma, 0) + 1

    return verb_counts


In [None]:
def get_passive_verbs_with_affiliation_subjects(sentences, verb_counts):
    """Count root-verb lemmas of sentences with an affiliated passive subject.

    For each sentence, the "nsubj:pass"/"csubj:pass" dependencies are scanned
    in order; the first one whose ``dependentGloss`` is in
    ``ISRAEL_MEMBER_AFFILIATIONS`` or ``PALESTINE_MEMBER_AFFILIATIONS`` decides
    the category ("israel" is checked before "palestine"). The gloss of every
    "ROOT" dependency of that sentence is then lemmatized with
    ``lemmatize_verb`` and ``verb_counts[category][lemma]`` is incremented.

    Note that the verb counted is the sentence root, which is not necessarily
    the verb that governs the matched subject.

    Parameters
    ----------
    sentences : list of dict
        Each dict provides "basicDependencies".
    verb_counts : mapping
        ``verb_counts[category]`` must be a dict-like with ``.get``; mutated
        in place.

    Returns
    -------
    The same ``verb_counts`` object.
    """
    passive_subject_relations = ("nsubj:pass", "csubj:pass")

    for sentence in sentences:
        relations = sentence["basicDependencies"]

        # Find the side of the first affiliated passive subject, if any.
        category = None
        for relation in relations:
            if relation["dep"] not in passive_subject_relations:
                continue
            gloss = relation.get("dependentGloss")
            if gloss in ISRAEL_MEMBER_AFFILIATIONS:
                category = "israel"
                break
            if gloss in PALESTINE_MEMBER_AFFILIATIONS:
                category = "palestine"
                break

        if category is None:
            continue

        # Credit the sentence's root verb to that side.
        for relation in relations:
            if relation["dep"] != "ROOT":
                continue
            lemma = lemmatize_verb(relation.get("dependentGloss"))
            tally = verb_counts[category]
            tally[lemma] = tally.get(lemma, 0) + 1

    return verb_counts

In [None]:
root_directory = './'

# verb_counts[side][lemma] -> number of sentences with an affiliated passive subject
verb_counts = defaultdict(lambda: defaultdict(int))

for index, row in df.iterrows():
    results_file = row['results_file']

    # Open NLP-analyzed result
    filename = root_directory + results_file
    try:
        with open(filename, encoding='utf-8') as d:
            data = json.load(d)
    except FileNotFoundError:
        print('FILE NOT FOUND')
        continue

    # The article text, title and date were previously loaded here but never
    # used; they were dropped so a missing article file cannot abort the run
    # and no file handle is left unmanaged.

    # Extract NLP results
    sentences = data["sentences"]
    verb_counts = get_passive_verbs_with_affiliation_subjects(sentences, verb_counts)

with open('passive_verb_counts.json', 'w', encoding='utf-8') as json_file:
    json.dump(verb_counts, json_file, indent=4)

In [None]:
root_directory = './'

# verb_counts[side][lemma] -> number of sentences with an affiliated active subject
verb_counts = defaultdict(lambda: defaultdict(int))

for index, row in df.iterrows():
    results_file = row['results_file']

    # Open NLP-analyzed result
    filename = root_directory + results_file
    try:
        with open(filename, encoding='utf-8') as d:
            data = json.load(d)
    except FileNotFoundError:
        print('FILE NOT FOUND')
        continue

    # The article text, title and date were previously loaded here but never
    # used; they were dropped so a missing article file cannot abort the run
    # and no file handle is left unmanaged.

    # Extract NLP results
    sentences = data["sentences"]
    verb_counts = get_active_verbs_with_affiliation_subjects(sentences, verb_counts)

with open('active_verb_counts.json', 'w', encoding='utf-8') as json_file:
    json.dump(verb_counts, json_file, indent=4)

In [None]:
# Display the verb tally produced by the most recently run verb-count loop.
verb_counts

In [None]:
root_directory = './'

# verb_counts[side][lemma] -> number of sentences with an affiliated active subject
verb_counts = defaultdict(lambda: defaultdict(int))

for index, row in df.iterrows():
    results_file = row['results_file']

    # Open NLP-analyzed result
    filename = root_directory + results_file
    try:
        with open(filename, encoding='utf-8') as d:
            data = json.load(d)
    except FileNotFoundError:
        print('FILE NOT FOUND')
        continue

    # The article text, title and date were previously loaded here but never
    # used; they were dropped so a missing article file cannot abort the run
    # and no file handle is left unmanaged.

    # Extract NLP results
    sentences = data["sentences"]
    verb_counts = get_active_verbs_with_affiliation_subjects(sentences, verb_counts)

# Write the verb tally computed above. This cell used to dump `counts` (the
# patient/agent tally), which overwrote active_verb_counts.json with the
# wrong data.
with open('active_verb_counts.json', 'w', encoding='utf-8') as json_file:
    json.dump(verb_counts, json_file, indent=4)

In [None]:
# Palestine-side affiliation terms without "Hamas". Works on a copy so the
# loaded PALESTINE_MEMBER_AFFILIATIONS list itself is left untouched.
pal_affiliates = PALESTINE_MEMBER_AFFILIATIONS.copy()
pal_affiliates.remove("Hamas")

# Plain place/people identifiers for each side.
PALESTINE_IDENTIFIERS = [
    "Palestine", "Palestinian", "Palestinians",
    "Gaza", "Gazan", "Gazans",
]
ISRAEL_IDENTIFIERS = [
    "Israel", "Israeli", "Israelis",
]

In [None]:
# Mentions of Israel- vs. Palestine-affiliated terms per week, written to CSV.
# For every article: find its ISO week, then count the sentences that contain
# at least one affiliation term of each side.
df = pd.read_csv('./data/summary_20240421_articles.csv')
df['date'] = pd.to_datetime(df['date'], format='mixed')

# counts_dict[week_key][side] -> number of sentences mentioning that side
counts_dict = defaultdict(lambda: defaultdict(int))

root_directory = './'

for index, row in df.iterrows():
    results_file = row['results_file']

    # Open NLP-analyzed result
    filename = root_directory + results_file
    try:
        with open(filename, encoding='utf-8') as d:
            data = json.load(d)
    except FileNotFoundError:
        print('FILE NOT FOUND')
        continue

    # The article text was previously loaded here but never used; the read was
    # dropped so a missing article file cannot abort the run.

    # Key by ISO year *and* week ("2023-W41"): the week number alone repeats
    # every year and does not sort chronologically across a year boundary.
    iso_year, iso_week = row['date'].date().isocalendar()[:2]
    week_key = f"{iso_year}-W{iso_week:02d}"

    # Extract NLP results
    sentences = data["sentences"]
    text_all_sentences = extract_sentences(sentences)

    # (A variant counting PALESTINE_IDENTIFIERS / ISRAEL_IDENTIFIERS / "Hamas"
    # instead of the affiliation lists was tried earlier and is disabled.)
    for sentence in text_all_sentences:
        # extract_sentences() leaves None in slots whose index never occurred.
        if sentence is None:
            continue
        if any(word in sentence for word in pal_affiliates):
            counts_dict[week_key]["palestine"] += 1
        if any(word in sentence for word in ISRAEL_MEMBER_AFFILIATIONS):
            counts_dict[week_key]["israel"] += 1

# One row per week, one column per side. The result gets its own name so the
# article index in `df` is not replaced by the counts table.
weekly_counts = pd.DataFrame(counts_dict).transpose().fillna(0).sort_index()

# Keep the week label as the first CSV column; it used to be dropped
# (index=False), leaving rows that could not be attributed to a week.
weekly_counts.to_csv(root_directory + 'keyword_counts_incl_affiliates.csv', index_label='iso_week')
print(counts_dict)

In [None]:
!{sys.executable} -m pip install openai
import openai