In [None]:
import sys

import os
import pandas as pd
from bs4 import BeautifulSoup
import random
import matplotlib as mpl
import matplotlib.pyplot as plt
import json
import locale
from collections import defaultdict
import datetime
import numpy as np

locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
import datetime as dt

# NLTK imports
import nltk
from collections import Counter

nltk.data.path.append('../nltk_data/')
nltk.download('stopwords')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')
import string
from nltk import collocations
from nltk.text import Text
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import sentiwordnet as swn
from nltk import RegexpParser
from nltk.tree import *

# spaCy imports
import spacy
from spacy.symbols import nsubj, VERB

from time import sleep
from lxml import etree
!{sys.executable} -m pip install wordcloud tabulate

In [None]:
nltk.download('stopwords')

# Get the English stopwords from NLTK.
# The corpus reader is imported under a private alias because this cell rebinds
# the name `stopwords` to a plain set. Without the alias, running the cell a
# second time would call `.words()` on that set and raise AttributeError.
from nltk.corpus import stopwords as _nltk_stopwords

stopwords = set(_nltk_stopwords.words('english'))

In [None]:
# Affiliation term collections for each side, read from JSON files in the
# working directory. Later cells test tokens for membership in them.
with open('pal_affiliations.json') as pal_fh:
    PALESTINE_MEMBER_AFFILIATIONS = json.load(pal_fh)

with open('israel_affiliations.json') as isr_fh:
    ISRAEL_MEMBER_AFFILIATIONS = json.load(isr_fh)

In [None]:
# Prefix prepended to the per-article file paths stored in the table.
root_directory = './'

# Article summary table; the 'date' column is parsed to datetimes, letting
# pandas infer the format of each value individually (format='mixed').
df = (
    pd.read_csv('./data/summary_20240421_articles.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='mixed'))
)

# Two-level counter whose missing keys default to 0.
counts_dict = defaultdict(lambda: defaultdict(int))

In [None]:
# Precomputed count tables read from JSON files in the working directory.
# Later cells index each of them by the keys 'israel' and 'palestine'.
with open('active_verb_counts.json') as active_fh:
    active_verb_counts = json.load(active_fh)

with open('passive_verb_counts.json') as passive_fh:
    passive_verb_counts = json.load(passive_fh)

with open('patient_agent_counts.json') as roles_fh:
    active_passive_counts = json.load(roles_fh)

In [None]:
def cooccurences(context_window, israel_co_occurrences, palestine_co_occurrences, sentence_list=None):
    """Accumulate context-word counts around Israel- and Palestine-affiliated tokens.

    For every token whose surface form is in ISRAEL_MEMBER_AFFILIATIONS or
    PALESTINE_MEMBER_AFFILIATIONS, the words within ``context_window``
    positions on either side are tallied (lowercased) into the matching
    counter.

    A context word is skipped when it belongs to the *opposite* side's
    affiliation collection, when its lowercase form is a stopword, or when it
    is shorter than three characters.

    Args:
        context_window: Number of tokens to look at on each side of a match.
        israel_co_occurrences: Dict of word -> count, updated in place.
        palestine_co_occurrences: Dict of word -> count, updated in place.
        sentence_list: Iterable of sentence dicts, each with a "tokens" list
            whose items carry a "word" key (looks like CoreNLP JSON output --
            TODO confirm). When omitted, the module-level ``sentences``
            variable is used, which keeps existing three-argument calls working.

    Returns:
        The same two dicts that were passed in, as
        ``(israel_co_occurrences, palestine_co_occurrences)``.
    """
    if sentence_list is None:
        # Backward-compatible fallback: callers used to set a global first.
        sentence_list = sentences

    for sentence in sentence_list:
        words = [token["word"] for token in sentence["tokens"]]

        # Iterate by position so a word that appears more than once in a
        # sentence gets a window centred on *each* of its occurrences. A
        # word -> index lookup would only remember the last occurrence.
        for position, word in enumerate(words):
            if word in ISRAEL_MEMBER_AFFILIATIONS:
                counts, opposite_side = israel_co_occurrences, PALESTINE_MEMBER_AFFILIATIONS
            elif word in PALESTINE_MEMBER_AFFILIATIONS:
                counts, opposite_side = palestine_co_occurrences, ISRAEL_MEMBER_AFFILIATIONS
            else:
                continue

            start_idx = max(0, position - context_window)
            end_idx = min(len(words), position + context_window + 1)

            # Drop the matched token itself by position, not by value, so an
            # identical word elsewhere in the window is still counted.
            context_words = words[start_idx:position] + words[position + 1:end_idx]

            for context_word in context_words:
                lowered = context_word.lower()
                # The stopword set is lowercase, so compare the lowercase form;
                # otherwise capitalised stopwords ("The") would be counted.
                if context_word in opposite_side or lowered in stopwords or len(context_word) < 3:
                    continue
                counts[lowered] = counts.get(lowered, 0) + 1

    return israel_co_occurrences, palestine_co_occurrences

In [None]:
from tabulate import tabulate

def print_top_10(dictionary, title):
    """Print the ten highest-valued entries of ``dictionary`` as a grid table.

    Args:
        dictionary: Mapping of word -> count.
        title: Text inserted into the "Top 10 ...:" heading line.
    """
    # Highest counts first; equal counts keep the mapping's own order.
    ranked = sorted(dictionary.items(), key=lambda pair: pair[1], reverse=True)

    # tabulate receives one [word, count] row per entry.
    table_data = [[term, total] for term, total in ranked[:10]]

    print(f"Top 10 {title}:")
    print(tabulate(table_data, headers=['Word', 'Count'], tablefmt='grid'))

In [None]:
# Initialize co-occurrence counters
israel_co_occurrences = {}
palestine_co_occurrences = {}

# Bind the result names before the loop so the cells below still work when no
# results file could be loaded. cooccurences() returns these same dict
# objects, so the aliases stay valid after every iteration.
isr_word_counts, pal_word_counts = israel_co_occurrences, palestine_co_occurrences

# Loop through each article and accumulate co-occurrences from its sentences
for index, row in df.iterrows():

    results_file = row['results_file']
    article_file = row['article_file']

    filename = root_directory + results_file

    try:
        with open(filename) as d:
            data = json.load(d)
    except FileNotFoundError:
        # Skip articles without a results file, but say which one is missing.
        print(f'FILE NOT FOUND: {filename}')
        continue

    # Open original text block from preprocessed data file.
    # NOTE(review): article_text is not used by any cell visible here.
    original_filename = root_directory + article_file
    with open(original_filename, "r") as f:
        article_text = f.read()

    # cooccurences() reads the module-level `sentences` when it is not given
    # an explicit sentence list, so the name must be set before the call.
    sentences = data["sentences"]
    isr_word_counts, pal_word_counts = cooccurences(3, israel_co_occurrences, palestine_co_occurrences)

In [None]:
# Israel-side co-occurrence counts ordered from highest to lowest count.
# The bare name on the last line displays the whole dict as the cell output.
sorted_isr_counts = dict(sorted(isr_word_counts.items(), key=lambda item: item[1], reverse=True))
sorted_isr_counts

In [None]:
# Palestine-side co-occurrence counts ordered from highest to lowest count.
# The bare name on the last line displays the whole dict as the cell output.
sorted_pal_counts = dict(sorted(pal_word_counts.items(), key=lambda item: item[1], reverse=True))
sorted_pal_counts

In [None]:
# Grid table of the ten most frequent context words around Palestine-affiliated tokens.
print_top_10(sorted_pal_counts, "Palestine Word Co-occurrences")

In [None]:
# Grid table of the ten most frequent context words around Israel-affiliated tokens.
print_top_10(sorted_isr_counts, "Israel Word Co-occurrences")

In [None]:
def find_most_different_cooccurrences(dict1, dict2, threshold=0):
    """Find the words whose counts differ most between two count mappings.

    Words found in PALESTINE_MEMBER_AFFILIATIONS or ISRAEL_MEMBER_AFFILIATIONS
    are ignored. A word missing from one mapping counts as 0 there.

    Args:
        dict1: Mapping of word -> count.
        dict2: Mapping of word -> count.
        threshold: A word is kept only when its surplus is strictly greater
            than this value.

    Returns:
        A tuple ``(dict1_more, dict2_more)``. Each dict maps word -> surplus
        over the other mapping, ordered from largest surplus to smallest.
    """

    def _surplus_over(primary, secondary):
        # Words that occur more often in `primary` than in `secondary`.
        gaps = {}
        for term, primary_count in primary.items():
            if term in PALESTINE_MEMBER_AFFILIATIONS or term in ISRAEL_MEMBER_AFFILIATIONS:
                continue
            gap = primary_count - secondary.get(term, 0)
            if gap > threshold:
                gaps[term] = gap
        ranked = sorted(gaps.items(), key=lambda pair: pair[1], reverse=True)
        return dict(ranked)

    return (_surplus_over(dict1, dict2), _surplus_over(dict2, dict1))

In [None]:
# Call the function to find words with the most different co-occurrences.
# Argument order matters: the first result holds words counted more often on
# the Palestine side, the second those counted more often on the Israel side.
more_pal, more_isr = find_most_different_cooccurrences(pal_word_counts, isr_word_counts)


In [None]:
# Display the full word -> surplus dict for the Palestine side.
more_pal

In [None]:
# Display the full word -> surplus dict for the Israel side.
more_isr

In [None]:
# Grid tables of the ten largest count surpluses for each side.
print_top_10(more_pal, "Words occurring more in context with Palestine")
print_top_10(more_isr, "Words occurring more in context with Israel")

In [None]:
def analyze_sentiment_and_plot(word_counts, title):
    """Plot a histogram of VADER compound scores for a word -> count mapping.

    Each word is scored on its own, and its compound score is repeated once
    per occurrence so that frequent words weigh more in the histogram.

    Args:
        word_counts: Mapping of word -> integer count.
        title: Title shown above the histogram.
    """
    analyzer = SentimentIntensityAnalyzer()

    # One compound score per counted occurrence of each word.
    compound_scores = [
        score
        for term, occurrences in word_counts.items()
        for score in [analyzer.polarity_scores(term)['compound']] * occurrences
    ]

    plt.figure(figsize=(10, 6))
    plt.hist(compound_scores, bins=10, edgecolor='black')
    plt.xlabel('Sentiment Score (Compound)')
    plt.ylabel('Frequency')
    plt.title(title)
    plt.show()


In [None]:
# Sentiment histogram for the 'israel' entry of the active-verb counts.
analyze_sentiment_and_plot(active_verb_counts['israel'], 'Sentiment Score Distribution- Active Verbs, Israel')

In [None]:
# Sentiment histogram for the 'palestine' entry of the active-verb counts.
analyze_sentiment_and_plot(active_verb_counts['palestine'], 'Sentiment Score Distribution- Active Verbs, Palestine')

In [None]:
# Sentiment histogram for the 'israel' entry of the passive-verb counts.
analyze_sentiment_and_plot(passive_verb_counts['israel'], 'Sentiment Score Distribution- Passive Verbs, Israel')

In [None]:
# Sentiment histogram for the 'palestine' entry of the passive-verb counts.
analyze_sentiment_and_plot(passive_verb_counts['palestine'], 'Sentiment Score Distribution- Passive Verbs, Palestine')

In [None]:
from wordcloud import WordCloud
def generate_and_plot_wordcloud(word_counts, title):
    """Render a word cloud from a word -> count mapping and show it.

    Args:
        word_counts: Mapping of word -> frequency passed to
            ``WordCloud.generate_from_frequencies``.
        title: Title shown above the image.
    """
    cloud_builder = WordCloud(width=800, height=400, background_color="white")
    cloud_image = cloud_builder.generate_from_frequencies(word_counts)

    # Show the cloud as an image with the axes hidden.
    plt.figure(figsize=(10, 6))
    plt.imshow(cloud_image, interpolation="bilinear")
    plt.title(title)
    plt.axis("off")
    plt.show()


In [None]:
# Word cloud for the 'israel' entry of the active-verb counts.
generate_and_plot_wordcloud(active_verb_counts['israel'], "Word Cloud- Active Verbs, Israel")

In [None]:
# Word cloud for the 'palestine' entry of the active-verb counts.
generate_and_plot_wordcloud(active_verb_counts['palestine'], "Word Cloud- Active Verbs, Palestine")

In [None]:
# Word cloud for the 'israel' entry of the passive-verb counts.
generate_and_plot_wordcloud(passive_verb_counts['israel'], "Word Cloud- Passive Verbs, Israel")

In [None]:
# Word cloud for the 'palestine' entry of the passive-verb counts.
generate_and_plot_wordcloud(passive_verb_counts['palestine'], "Word Cloud- Passive Verbs, Palestine")

In [None]:
def get_top_ten_words(word_counts):
    """Return the ten highest-count entries as a list of (word, count) tuples.

    Entries are ordered from highest count to lowest; entries with equal
    counts keep the order in which they appear in ``word_counts``.
    """
    ranked = sorted(Counter(word_counts).items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:10]

In [None]:
# Ten most frequent passive verbs under the 'palestine' key, as (verb, count) pairs.
get_top_ten_words(passive_verb_counts['palestine'])

In [None]:
# Ten most frequent passive verbs under the 'israel' key, as (verb, count) pairs.
get_top_ten_words(passive_verb_counts['israel'])

In [None]:
# Ten most frequent active verbs under the 'palestine' key, as (verb, count) pairs.
get_top_ten_words(active_verb_counts['palestine'])

In [None]:
# Ten most frequent active verbs under the 'israel' key, as (verb, count) pairs.
get_top_ten_words(active_verb_counts['israel'])

In [None]:
def plot_pie_chart(data, title):
    """Draw a pie chart with one slice per entry of ``data``.

    Args:
        data: Mapping of label -> numeric size.
        title: Title shown above the chart.
    """
    slice_colors = ['lightblue', 'lightgreen', 'lightcoral']

    plt.figure(figsize=(8, 6))
    plt.pie(
        data.values(),
        labels=data.keys(),
        colors=slice_colors,
        autopct='%1.1f%%',
        startangle=90,
    )
    # Equal aspect ratio keeps the pie circular rather than elliptical.
    plt.axis('equal')
    plt.title(title)
    plt.show()

In [None]:
# Pie chart of the counts stored under the "israel" key of patient_agent_counts.json.
plot_pie_chart(active_passive_counts["israel"], "Israeli Affiliates")

In [None]:
# Pie chart of the counts stored under the "palestine" key of patient_agent_counts.json.
plot_pie_chart(active_passive_counts["palestine"], "Palestinian Affiliates")

In [None]:
# Display the raw counts behind the "Israeli Affiliates" pie chart.
active_passive_counts["israel"]

In [None]:
def plot_keyword_counts(df, title='Keyword Counts Over Weeks'):
    """Draw ``df`` as a line chart with fixed axis labels, a legend and a grid.

    Args:
        df: Object with a pandas-style ``plot`` method; it is plotted with
            ``kind='line'``.
        title: Title shown above the chart.
    """
    df.plot(kind='line')

    # Decorate the axes created by the plot call above.
    plt.title(title)
    plt.xlabel('Week since Oct 7')
    plt.ylabel('Count')
    plt.grid(True)
    plt.legend(title='Category')

    plt.show()

In [None]:
# Keyword counts from 'keyword_counts_no_affiliates.csv'.
# Loaded under its own name instead of `df`, so the article table built
# earlier is not overwritten and the per-article loop cell can be re-run.
keyword_counts_no_affiliates = pd.read_csv('keyword_counts_no_affiliates.csv')
plot_keyword_counts(keyword_counts_no_affiliates)

In [None]:
# Keyword counts from 'keyword_counts_incl_affiliates.csv'.
# Loaded under its own name instead of `df`, so the article table built
# earlier is not overwritten and the per-article loop cell can be re-run.
keyword_counts_incl_affiliates = pd.read_csv('keyword_counts_incl_affiliates.csv')
plot_keyword_counts(keyword_counts_incl_affiliates)

In [None]:
# Re-read the CSV so the table behind the first keyword plot is displayed as cell output.
pd.read_csv('keyword_counts_no_affiliates.csv')

In [None]:
# Re-read the CSV so the table behind the second keyword plot is displayed as cell output.
pd.read_csv('keyword_counts_incl_affiliates.csv')

In [None]:
import matplotlib.pyplot as plt

# Hard-coded article counts per category.
# NOTE(review): the cell that produced these numbers is not in this notebook.
categories = ['Antisemitism only', 'Both Antisemitism and Islamophobia', 'Islamophobia only']
counts = [94, 40, 7]
colors = ['#FF5733', '#6A5ACD', '#4682B4']  # Nice color palette

# One bar per category
plt.figure(figsize=(10, 6))
bars = plt.bar(categories, counts, color=colors)

# Write each count just above the top centre of its bar
for bar, count in zip(bars, counts):
    label_x = bar.get_x() + bar.get_width() / 2
    label_y = bar.get_height() + 1
    plt.text(label_x, label_y, str(count), ha='center', va='bottom', fontsize=12)

# Axis labels and title
plt.xlabel('Categories')
plt.ylabel('Number of Articles')
plt.title('BBC Articles mentioning Antisemitism and/or Islamophobia', fontsize=16)

# Tighten the layout, then render
plt.tight_layout()
plt.show()



In [None]:
# Hard-coded report counts ("from feb" per the original note).
# NOTE(review): the source of these figures is not recorded in this notebook.
categories = ['Antisemitic Incidents- CST', 'Islamophobia incidents- MAMA']
counts = [2699, 2010]
colors = ['#FF5733', '#4682B4']

# One bar per category
plt.figure(figsize=(8, 6))
plt.bar(categories, counts, color=colors)

# Write each count just above its bar (bars sit at x = 0, 1, ...)
for i, n_reports in enumerate(counts):
    plt.text(i, n_reports + 1, str(n_reports), ha='center')

# Axis label and title
plt.ylabel('Number of Reports')
plt.title('Reports of Antisemitism and Islamophobia in the UK')

plt.show()