# Prepare the Environment

## To prepare the environment, uncomment the cell below and run it.

In [2]:
# !pip install numpy==1.26.4
# !pip install pandas
# !pip install gensim
# !pip install tensorflow
# 
# !pip install datasets
# !pip install nltk
# !pip install scikit-learn
# !pip install --upgrade ipywidgets
# !pip install --upgrade jupyter

# Step 1: Data Preprocessing
### Prerequisites:
* Basic Python programming
* Understanding of text processing techniques

### Tasks:
* Load the BEAD dataset, focusing on the "1-Text-Classification" folder.
* Implement text cleaning functions.
* Split the dataset into training and validation sets.

### Task 1: Load the BEAD dataset, focusing on the "1-Text-Classification" folder.

In [3]:
import pandas as pd

from datasets import load_dataset

# "3-Aspects" configuration of BEAD, fetched through the Hugging Face `datasets` API.
ds = load_dataset("shainar/BEAD", "3-Aspects")

# Paths, relative to the BEAD repository root, of the text-classification CSV files.
splits = {
    'bias_train': '1-Text-Classification/bias-train.csv',
    'bias_valid': '1-Text-Classification/bias-valid.csv',
    'bias_train_instruction': '1-Text-Classification/bias-train_10k-instruction-data.csv',
    'bias_valid_instruction': '1-Text-Classification/bias-valid_1k-instruction-data.csv',
    'sentiment_train': '1-Text-Classification/sentiment-train.csv',
    'sentiment_valid': '1-Text-Classification/sentiment-valid.csv',
    'toxic_train': '1-Text-Classification/toxic-train.csv',
    'toxic_valid': '1-Text-Classification/toxic-valid.csv',
    'multi_label': '1-Text-Classification/multi-label.csv',
}

# Read the bias training CSV directly from the Hugging Face hub.
df = pd.read_csv(f"hf://datasets/shainar/BEAD/{splits['bias_train']}")

# Work on an independent pandas copy of the 'aspects' split.
aspects_frame = ds['aspects'].to_pandas()
ds_cleaned = aspects_frame.copy()

# Preview the data
print("Dataset 1: ")
print(ds_cleaned.head())

Dataset 1: 
                                                text   Aspect
0  why does everyone always have to end up leavin...  abandon
1  Get out of a sick bed, log onto Twitter and se...  abandon
2          I've been abandoned with nothing to eat.   abandon
3  @rustyrockets I'm not being funny, but where a...  abandon
4        i'm all alone because @smt90210 left me...   abandon


### Task 2: Implement text cleaning functions & Split the datasets into training and validation sets.

In [4]:
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.tokenize import word_tokenize

def clean_text(text):
    """Normalise raw text into a space-separated string of informative lowercase words.

    The text is lowercased and split into maximal runs of the ASCII letters a-z
    (digits, punctuation, whitespace and every other character act as
    separators). scikit-learn's English stop words and single-letter words are
    then removed.

    Args:
        text: The raw text. Any value that is not a ``str`` (for example ``None``
            or the float ``NaN`` pandas uses for missing cells) is treated as
            empty instead of raising.

    Returns:
        str: The surviving words joined by single spaces, or ``""`` when nothing
        survives.
    """
    if not isinstance(text, str):
        return ""

    # One pass replaces "blank out non-letters, squeeze whitespace, split":
    # every character outside a-z is a separator either way.
    words = re.findall(r'[a-z]+', text.lower())

    # The length check comes first so single letters never reach the stop-word lookup.
    kept_words = [
        word for word in words
        if len(word) > 1 and word not in ENGLISH_STOP_WORDS
    ]
    return ' '.join(kept_words)


# Keep only rows whose 'text' holds at least one non-whitespace character.
# `.str.strip()` yields NaN for missing / non-string cells, and NaN is truthy
# under `astype(bool)`, so NaN is mapped to '' first; otherwise such rows would
# pass the filter and reach clean_text().
has_text = ds_cleaned['text'].str.strip().fillna('').astype(bool)

# `.copy()` detaches the filtered frame so the column assignments below write to
# a standalone DataFrame rather than a slice (avoids SettingWithCopyWarning).
ds_cleaned = ds_cleaned[has_text].copy()

# Apply the text cleaning function to the 'text' column
ds_cleaned['processed_text'] = ds_cleaned['text'].apply(clean_text)

# Tokenize the cleaned text using NLTK's word_tokenize
ds_cleaned['tokens'] = ds_cleaned['processed_text'].apply(word_tokenize)

# Display the cleaned data
print("Processed Data_1:", ds_cleaned.head())


Processed Data_1:                                                 text   Aspect  \
0  why does everyone always have to end up leavin...  abandon   
1  Get out of a sick bed, log onto Twitter and se...  abandon   
2          I've been abandoned with nothing to eat.   abandon   
3  @rustyrockets I'm not being funny, but where a...  abandon   
4        i'm all alone because @smt90210 left me...   abandon   

                                      processed_text  \
0                                   does end leaving   
1  sick bed log twitter ve purged abandoned twitt...   
2                                   ve abandoned eat   
3    rustyrockets funny abandoned fair sigh xxxxxxxx   
4                                           smt left   

                                              tokens  
0                               [does, end, leaving]  
1  [sick, bed, log, twitter, ve, purged, abandone...  
2                               [ve, abandoned, eat]  
3  [rustyrockets, funny, abandoned

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.corpus import words

# Initialize the lemmatizer
# (one shared WordNetLemmatizer instance, used by lemmatize_tokens below)
lemmatizer = WordNetLemmatizer()

# Load the list of valid English words
# Stored as a set so that is_valid_word() can use membership tests.
# NOTE(review): presumably requires the NLTK 'words' corpus to be downloaded
# beforehand (e.g. with nltk.download) — confirm the environment setup covers it.
valid_words = set(words.words())

# Convert NLTK's part-of-speech tags to WordNet POS tags
def get_wordnet_pos(treebank_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to adjective, verb, noun and
    adverb respectively; every other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tag: default to noun.
    return wordnet.NOUN

# Check if the word is a valid word in the vocabulary
def is_valid_word(word):
    """Return True when `word` is in `valid_words` or has at least one WordNet synset.

    The result is always a real ``bool`` (the WordNet lookup result is coerced),
    and the WordNet lookup only runs when the set lookup misses.
    """
    return word in valid_words or bool(wordnet.synsets(word))

# Define a function to perform part-of-speech tagging, lemmatization, and validity check
def lemmatize_tokens(tokens):
    """POS-tag `tokens`, lemmatize each one, and keep only recognised lemmas.

    Args:
        tokens: Sequence of word strings belonging to one document.

    Returns:
        list: The lemmas, in input order, for which ``is_valid_word`` is truthy.
        Tokens whose lemma is not recognised are dropped.
    """
    kept_lemmas = []
    for token, tag in pos_tag(tokens):
        # Lemmatize with the WordNet POS that corresponds to the tagger's tag.
        lemma = lemmatizer.lemmatize(token, get_wordnet_pos(tag))
        # Filter directly instead of appending a placeholder and stripping it
        # afterwards, so no in-band sentinel value can collide with a real token.
        if is_valid_word(lemma):
            kept_lemmas.append(lemma)
    return kept_lemmas

# Lemmatize every row's token list and store the result in a new column.
lemmatized_column = ds_cleaned['tokens'].apply(lemmatize_tokens)
ds_cleaned['lemmatized_tokens'] = lemmatized_column

# Show the three processing stages side by side for the first rows.
preview_columns = ['processed_text', 'tokens', 'lemmatized_tokens']
print(ds_cleaned[preview_columns].head())


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
X_ds = ds_cleaned['lemmatized_tokens']
y_ds = ds_cleaned['Aspect']
X_train_ds, X_test_ds, y_train_ds, y_test_ds = train_test_split(
    X_ds, y_ds, test_size=0.2, random_state=42
)
train_ds = pd.DataFrame({'lemmatized_tokens': X_train_ds, 'Aspect': y_train_ds})
test_ds = pd.DataFrame({'lemmatized_tokens': X_test_ds, 'Aspect': y_test_ds})

train_size_1 = len(train_ds)
test_size_1 = len(test_ds)
total_size_1 = train_size_1 + test_size_1

# Every input row must land in exactly one of the two partitions.
assert total_size_1 == len(ds_cleaned), "train/test sizes do not add up to the input size"

# View the split datasets
print("Dataset DS:")
for split_name, split_size in (("Training", train_size_1), ("Test", test_size_1)):
    print(f"Number of {split_name} Data Size: {split_size}")
    # `%` formatting prints e.g. 80.00% instead of the raw fraction 0.80.
    print(f"Percentage: {split_size / total_size_1:.2%}")


# Step 2: Feature Extraction

### Task 1: Implement TF-IDF vectorization on the cleaned text data

# Custom TF-IDF

In [None]:
from collections import Counter
import math

# Compute Term Frequency (TF) for a row of tokens
def compute_term_frequency(row_tokens):
    """Return each distinct token's share of the row as ``{token: count / len(row)}``.

    Keys appear in order of first occurrence; an empty row yields an empty dict.
    """
    total_token_count = len(row_tokens)
    relative_frequency = {}
    for word, occurrences in Counter(row_tokens).items():
        relative_frequency[word] = occurrences / total_token_count
    return relative_frequency

# Compute Inverse Document Frequency (IDF) for the entire corpus
def compute_inverse_document_frequency(corpus):
    """Compute a smoothed inverse document frequency for every word in `corpus`.

    Uses ``idf(w) = ln((1 + N) / (1 + df(w))) + 1`` where ``N`` is the number of
    rows and ``df(w)`` the number of rows containing ``w``. Unlike the plain
    ``ln(N / (1 + df))`` form, this is never zero or negative, so words that
    occur in (almost) every row keep a small positive weight and a one-row
    corpus does not produce negative scores.

    Args:
        corpus: Sized iterable of token sequences, one per row.

    Returns:
        dict: ``{word: idf}`` for every word seen in the corpus; ``{}`` for an
        empty corpus.
    """
    number_of_rows = len(corpus)
    row_frequency = Counter()  # number of rows each word appears in

    for row_tokens in corpus:
        # A set counts each word at most once per row.
        row_frequency.update(set(row_tokens))

    return {
        word: math.log((1 + number_of_rows) / (1 + row_count)) + 1.0
        for word, row_count in row_frequency.items()
    }

# Compute Term Frequency-Inverse Document Frequency (TF-IDF) for the entire corpus
def compute_term_frequency_inverse_document_frequency(corpus):
    """Return one ``{word: tf * idf}`` dict per row of `corpus`, in row order.

    The IDF table is computed once over the whole corpus; each row's term
    frequencies are then weighted by it. A word missing from the IDF table
    contributes a weight of 0.
    """
    idf_by_word = compute_inverse_document_frequency(corpus)

    def score_row(row_tokens):
        # Weight this row's term frequencies by the corpus-wide IDF.
        tf_by_word = compute_term_frequency(row_tokens)
        return {
            word: tf_value * idf_by_word.get(word, 0)
            for word, tf_value in tf_by_word.items()
        }

    return [score_row(row_tokens) for row_tokens in corpus]

In [None]:
# ds_cleaned is a pandas DataFrame, so this column is a Series holding one
# list of lemmatized tokens per row.
corpus_ds_cleaned = ds_cleaned['lemmatized_tokens']
tfidf_scores_ds_cleaned = compute_term_frequency_inverse_document_frequency(corpus_ds_cleaned)

# Print the scores of the first three rows as a quick sanity check.
for position, ordinal in enumerate(("first", "second", "third")):
    print(f"TF-IDF scores for ds_cleaned ({ordinal} row):")
    print(tfidf_scores_ds_cleaned[position])

### (OPTIONAL) Task 2: Explore word embedding techniques (e.g., Word2Vec or GloVe) for more advanced feature representation.

In [None]:
# from gensim.models import Word2Vec
# 
# # Example: Train Word2Vec model on your lemmatized tokens (list of tokens)
# model = Word2Vec(df_cleaned['lemmatized_tokens'], vector_size=100, window=5, min_count=1, workers=4)
# 
# # Save the model for later use
# model.save("word2vec_model.model")
# 
# # Example: Find the vector for a word
# word_vector = model.wv['good']  # Replace 'example' with any word in your dataset
# 
# # Find similar words
# similar_words = model.wv.most_similar('good', topn=10)
# print(similar_words)


# Step 3: Topic Modeling

### Tasks: 
* Implement LDA using the gensim library to identify main topics in the articles.
* Analyze the topics to understand potential areas of bias.

# Custom LDA

In [1]:
import random

import numpy as np

class CustomLDA:
    """Latent Dirichlet Allocation trained by repeatedly resampling word topics.

    Each word occurrence carries a topic assignment. Training removes an
    occurrence from the count tables, computes a distribution over topics from
    the remaining counts (smoothed by `alpha` and `beta`), samples a new topic
    and adds the occurrence back.

    Args:
        corpus: Iterable of token sequences, one per document. It is copied into
            a list of lists, so pandas Series and one-shot iterables both work.
        num_topics: Number of topics.
        alpha: Smoothing added to the document-topic counts; should be > 0.
        beta: Smoothing added to the topic-word counts; should be > 0.
        num_iterations: Number of full passes over the corpus in `train`.
        seed: Optional seed for the random generator. With the same seed and
            corpus, initialisation and training are reproducible; ``None``
            (the default) draws fresh entropy.
    """

    def __init__(self, corpus, num_topics=50, alpha=0.01, beta=0.1, num_iterations=100, seed=None):
        self.corpus = [list(doc) for doc in corpus]
        self.num_topics = num_topics
        self.alpha = alpha
        self.beta = beta
        self.num_iterations = num_iterations
        # Single generator for both initialisation and sampling.
        self.rng = np.random.default_rng(seed)

        # Ids follow first-occurrence order so they are stable across runs
        # (iterating a set of strings is not).
        ordered_vocab = list(dict.fromkeys(word for doc in self.corpus for word in doc))
        self.vocab = set(ordered_vocab)
        self.vocab_size = len(ordered_vocab)
        self.word_to_id = {word: word_id for word_id, word in enumerate(ordered_vocab)}
        self.id_to_word = dict(enumerate(ordered_vocab))

        self.doc_topic_counts = np.zeros((len(self.corpus), num_topics))
        self.topic_word_counts = np.zeros((num_topics, self.vocab_size))
        self.topic_counts = np.zeros(num_topics)

        # Random initial topic for every word occurrence.
        self.word_assignments = []
        for doc_idx, doc in enumerate(self.corpus):
            doc_assignments = []
            for word in doc:
                topic = int(self.rng.integers(num_topics))
                doc_assignments.append(topic)
                self._shift_counts(doc_idx, topic, self.word_to_id[word], 1)
            self.word_assignments.append(doc_assignments)

    def _shift_counts(self, doc_idx, topic, word_id, delta):
        """Add `delta` to all three count tables for one (doc, topic, word) triple."""
        self.doc_topic_counts[doc_idx, topic] += delta
        self.topic_word_counts[topic, word_id] += delta
        self.topic_counts[topic] += delta

    def sample_topic(self, doc_idx, word_idx, word_id):
        """Resample the topic of one word occurrence and update the count tables.

        Args:
            doc_idx: Position of the document in the corpus.
            word_idx: Position of the word inside that document.
            word_id: Vocabulary id of the word.

        Returns:
            int: The newly sampled topic. The caller stores it in
            ``word_assignments``.
        """
        current_topic = self.word_assignments[doc_idx][word_idx]
        # Exclude this occurrence from the counts before computing its distribution.
        self._shift_counts(doc_idx, current_topic, word_id, -1)

        topic_weights = (
            (self.doc_topic_counts[doc_idx] + self.alpha)
            * (self.topic_word_counts[:, word_id] + self.beta)
            / (self.topic_counts + self.beta * self.vocab_size)
        )
        topic_probs = topic_weights / topic_weights.sum()
        new_topic = int(self.rng.choice(self.num_topics, p=topic_probs))

        self._shift_counts(doc_idx, new_topic, word_id, 1)
        return new_topic

    def train(self):
        """Run `num_iterations` passes, resampling every word occurrence each pass."""
        for iteration in range(self.num_iterations):
            print(f"Iteration {iteration + 1}/{self.num_iterations}")
            for doc_idx, doc in enumerate(self.corpus):
                for word_idx, word in enumerate(doc):
                    word_id = self.word_to_id[word]
                    self.word_assignments[doc_idx][word_idx] = self.sample_topic(
                        doc_idx, word_idx, word_id
                    )

    def print_topics(self, top_n=5):
        """Print the `top_n` highest-probability words of every topic."""
        # Beta smoothing keeps the denominator positive for topics that currently
        # own no words (plain count / total would be 0 / 0 there); it does not
        # change the ranking of words within a topic.
        word_probs = (self.topic_word_counts + self.beta) / (
            self.topic_counts[:, None] + self.beta * self.vocab_size
        )
        for topic_id in range(self.num_topics):
            top_word_ids = np.argsort(word_probs[topic_id])[::-1][:top_n]
            top_words = [self.id_to_word[word_id] for word_id in top_word_ids]
            print(f"Topic {topic_id}: {', '.join(top_words)}")

    def get_document_topics(self, top_n=5):
        """Return, per document, its `top_n` topics as ``(topic_id, probability)`` pairs.

        Probabilities are the alpha-smoothed, normalised document-topic counts;
        pairs are ordered from most to least probable.
        """
        doc_topic_probs = (self.doc_topic_counts + self.alpha) / (
            self.doc_topic_counts.sum(axis=1)[:, None] + self.alpha * self.num_topics
        )
        top_topics_per_doc = []
        for doc_idx in range(len(self.corpus)):
            topic_probs = doc_topic_probs[doc_idx]
            top_topic_ids = np.argsort(topic_probs)[::-1][:top_n]
            top_topics_per_doc.append(list(zip(top_topic_ids, topic_probs[top_topic_ids])))
        return top_topics_per_doc

# Example usage
# Fit a small 2-topic model for 2 training iterations on the lemmatized token lists.
# NOTE(review): ds_cleaned is created by the preprocessing cells; the NameError
# recorded under this cell suggests it was executed before them — re-run in order.
lda = CustomLDA(ds_cleaned['lemmatized_tokens'], num_topics=2, num_iterations=2)
lda.train()

# Print the top words for each topic
print("\nIdentified Topics:")
lda.print_topics(top_n=5)


NameError: name 'ds_cleaned' is not defined

# Step 4: Bias Detection Model Development
### Prerequisites:
* Understanding of machine learning algorithms (e.g., logistic regression, random forests)
* Knowledge of deep learning concepts and libraries (e.g., TensorFlow or PyTorch)

### Logistic Regression Model for Bias Detection

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# DictVectorizer is only needed by this cell, so it is imported here.
from sklearn.feature_extraction import DictVectorizer


def build_tfidf_rows(token_rows, idf_by_word):
    """Return one ``{word: tf * idf}`` dict per row, using a precomputed IDF table.

    Words missing from `idf_by_word` (not seen when the IDF was fitted) get a
    weight of 0.
    """
    return [
        {
            word: tf_value * idf_by_word.get(word, 0.0)
            for word, tf_value in compute_term_frequency(row_tokens).items()
        }
        for row_tokens in token_rows
    ]


# Fit the IDF statistics on the training rows only, then reuse them for the
# test rows so no information leaks from the held-out data.
train_idf = compute_inverse_document_frequency(X_train_ds)
train_tfidf_rows = build_tfidf_rows(X_train_ds, train_idf)
test_tfidf_rows = build_tfidf_rows(X_test_ds, train_idf)

# Convert the per-row dicts into aligned sparse matrices (one column per word
# seen during fitting; unseen test words are ignored by `transform`).
vectorizer = DictVectorizer(sparse=True)
X_train_tfidf = vectorizer.fit_transform(train_tfidf_rows)
X_test_tfidf = vectorizer.transform(test_tfidf_rows)

# Create and train the Logistic Regression model.
# NOTE(review): the target is the 'Aspect' label from the train/test split made
# earlier in the notebook, the only labelled split prepared so far — confirm
# this is the intended target for bias detection.
logreg_model = LogisticRegression(max_iter=1000)
logreg_model.fit(X_train_tfidf, y_train_ds)

# Predict on the test set
y_pred = logreg_model.predict(X_test_tfidf)

# Evaluate the model's performance
accuracy = accuracy_score(y_test_ds, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report for detailed evaluation
# (zero_division=0 reports 0 instead of warning for classes never predicted).
print("\nClassification Report:\n", classification_report(y_test_ds, y_pred, zero_division=0))


### Neural Network Model for Bias Detection

In [None]:
# tensorflow is listed in the setup cell at the top of the notebook; install
# matplotlib the same way if it is missing:
# %pip install matplotlib

import tensorflow as tf
import matplotlib.pyplot as plt

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

from sklearn.preprocessing import LabelEncoder

# Ensure TensorFlow is working
print(tf.__version__)

# Convert labels to integer ids. The encoder is fitted on every label in the
# dataset so that a class occurring only in the test split can still be encoded.
# NOTE(review): the target is the 'Aspect' column from the earlier split —
# confirm this is the intended target for this step.
label_encoder = LabelEncoder()
label_encoder.fit(ds_cleaned['Aspect'])
y_train_encoded = label_encoder.transform(y_train_ds)
y_test_encoded = label_encoder.transform(y_test_ds)
num_classes = len(label_encoder.classes_)

# Build an integer vocabulary from the training tokens only.
# Id 0 is reserved for padding and id 1 for words unseen during training.
OOV_ID = 1
token_to_id = {}
for row_tokens in X_train_ds:
    for token in row_tokens:
        token_to_id.setdefault(token, len(token_to_id) + 2)


def encode_rows(token_rows):
    """Map every token list to a list of vocabulary ids (OOV_ID for unknown tokens)."""
    return [[token_to_id.get(token, OOV_ID) for token in row_tokens] for row_tokens in token_rows]


# An Embedding + LSTM stack consumes sequences of token ids (not TF-IDF
# weights), padded / truncated to a common length.
max_len = 100  # Set a maximum length for padding
X_train_padded = pad_sequences(encode_rows(X_train_ds), maxlen=max_len, padding='post', truncating='post')
X_test_padded = pad_sequences(encode_rows(X_test_ds), maxlen=max_len, padding='post', truncating='post')

# Build the neural network model
nn_model = Sequential()

# Embedding table: one row per vocabulary id, plus the padding and OOV ids.
nn_model.add(Embedding(input_dim=len(token_to_id) + 2, output_dim=100))

# Add a spatial dropout layer
nn_model.add(SpatialDropout1D(0.2))

# Add an LSTM layer
nn_model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))

# One softmax unit per class, so any number of label classes is handled.
nn_model.add(Dense(num_classes, activation='softmax'))

# Sparse categorical cross-entropy matches integer-encoded class labels.
nn_model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# Train the model
history = nn_model.fit(X_train_padded, y_train_encoded, epochs=5, batch_size=64, validation_data=(X_test_padded, y_test_encoded))

# Evaluate the model on the test set
test_loss, test_accuracy = nn_model.evaluate(X_test_padded, y_test_encoded)
print(f"\nTest Accuracy: {test_accuracy:.2f}")

# Plot the training history
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Test Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()


# Step 5: Token-level Bias Identification
### Tasks:
* Use the "2-Token-Classification" folder from the BEAD dataset.
* Implement a BiLSTM-CRF model for identifying biased words or phrases within the text.
* Train and evaluate the token-level bias detection model.

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# 1. Data Preprocessing
# `df` is the raw bias-training CSV loaded at the top of the notebook; it has
# not been cleaned yet, so the processed-text column is built here first.
# NOTE(review): assumes `df` has 'text' and 'label' columns — confirm against
# the CSV header.
bias_df = df.dropna(subset=['text', 'label']).copy()
bias_df['processed_text'] = bias_df['text'].astype(str).apply(clean_text)

# Convert the processed text to integer sequences using Tokenizer
tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=True, oov_token="<UNK>")
tokenizer.fit_on_texts(bias_df['processed_text'])

# Convert text data into sequences of integers
X_sequences = tokenizer.texts_to_sequences(bias_df['processed_text'])

# Pad the sequences to ensure they have the same length
max_len = 100  # You can adjust this based on your data
X_padded = pad_sequences(X_sequences, padding='post', maxlen=max_len)

# Encode labels as integers.
# NOTE(review): the sigmoid output below assumes exactly two label classes —
# confirm, or switch to a softmax output if there are more.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(bias_df['label'])

# 2. Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_encoded, test_size=0.2, random_state=42)

# 3. Build the BiLSTM Model
from tensorflow.keras import layers, models

# Create the BiLSTM model
model = models.Sequential()

# Add an embedding layer (using the tokenizer's word index; +1 for the padding id 0)
model.add(layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100))

# Add a BiLSTM layer
model.add(layers.Bidirectional(layers.LSTM(100, dropout=0.2, recurrent_dropout=0.2)))

# Add a Dense layer for output prediction (binary classification)
model.add(layers.Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# 4. Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))

# 5. Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"\nTest Accuracy: {test_accuracy:.2f}")

# Plot the training history
import matplotlib.pyplot as plt

plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Test Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()


# Step 6: Bias Categorization
### Tasks:
* Develop a multi-label classification model to categorize different types of bias (e.g., gender, racial, political).
* Train the model using the data from the "3-Aspects" folder.
* Evaluate the model's performance in categorizing bias types.

In [None]:
!pip install tensorflow-addons==0.15.0



In [None]:
from tensorflow.keras import layers, models
import tensorflow_addons as tfa  # Ensure to import tensorflow_addons for CRF

# Builds a BiLSTM + CRF sequence-labelling model on top of the padded sequences
# (`X_train`) and the `tokenizer` created in the previous code cell.
# NOTE(review): this rebinds `model`, replacing the BiLSTM classifier built in
# the previous cell.

# Define the input layer with shape corresponding to the padded sequences
input_layer = layers.Input(shape=(X_train.shape[1],))

# Add an embedding layer that uses the tokenizer's word index
# (+1 leaves room for the padding id 0)
embedding_layer = layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=X_train.shape[1])(input_layer)

# Add a BiLSTM layer to process the sequence data
# return_sequences=True keeps one output vector per time step for the CRF.
bilstm_layer = layers.Bidirectional(layers.LSTM(100, return_sequences=True))(embedding_layer)

# Add a CRF layer for sequence labeling
# NOTE(review): `token_labels` is not defined in any cell visible in this
# notebook — it must be set to the list of token-level tags before this runs.
crf_layer = tfa.layers.CRF(len(token_labels))  # Ensure to pass the correct number of tags/labels

# Apply CRF layer on top of the BiLSTM output
# NOTE(review): confirm what tfa.layers.CRF returns when called; if it returns
# several tensors, the Model below will have several outputs.
output_layer = crf_layer(bilstm_layer)

# Build the model by specifying input and output layers
model = models.Model(inputs=input_layer, outputs=output_layer)

# Compile the model using Adam optimizer, CRF loss, and CRF accuracy metrics
# NOTE(review): verify that tfa.layers.CRF actually exposes `.loss` and
# `.accuracy` in the pinned tensorflow-addons version — TODO confirm.
model.compile(optimizer='adam', loss=crf_layer.loss, metrics=[crf_layer.accuracy])

# Display the model summary to check the architecture
model.summary()
