In [None]:
from transformers import pipeline
import pandas as pd

# Read the verse-level Bible corpus (tab-separated file)
bible_data = pd.read_csv(
    "chapter5_data/sorted_aligned_bible_final.tsv",
    sep="\t",
)

# Build the sentiment classifier; truncation=True is passed so over-long
# inputs are cut down instead of overflowing the model
sentiment_analysis = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    truncation=True,
)

def analyze_sentiment(text):
    """Classify one piece of text with the notebook-level ``sentiment_analysis`` pipeline.

    Args:
        text: The text to classify (here, a value from the 'Text' column).

    Returns:
        A ``(label, score)`` tuple taken from the first result the pipeline returns.
    """
    top_prediction = sentiment_analysis(text)[0]
    return top_prediction['label'], top_prediction['score']

# Classify every verse and store the label and score as two new columns
bible_data['sentiment'], bible_data['sentiment_score'] = zip(
    *bible_data['Text'].apply(analyze_sentiment)
)

# Step 1: tally verses per (Index, Book, Version), one column per sentiment label
sentiment_counts = (
    bible_data
    .groupby(['Index', 'Book', 'Version', 'sentiment'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)

# Step 2: positive-to-negative ratio; the +1 in the denominator avoids
# dividing by zero when a group has no NEGATIVE verses
sentiment_counts['positive_to_negative_ratio'] = (
    sentiment_counts['POSITIVE'] / (sentiment_counts['NEGATIVE'] + 1)
)

# Step 3: fix the column order and write the table to disk
sentiment_counts = sentiment_counts.loc[
    :,
    ['Index', 'Book', 'Version', 'NEGATIVE', 'POSITIVE', 'positive_to_negative_ratio'],
]
sentiment_counts.to_csv("sentiment_counts_with_ratios.csv", index=False)

# Quick look at the resulting table
print(sentiment_counts.head())


In [None]:
# Index whose verses should be inspected
target_index = 19

# Keep only the rows of that index, restricted to the columns of interest
verse_results = bible_data.loc[
    bible_data['Index'] == target_index,
    ['Version', 'Book', 'Chapter', 'Verse', 'Text', 'sentiment', 'sentiment_score'],
]

# Show the verse-level predictions
print(f"Verse-level sentiment results for Index {target_index}:")
print(verse_results)

# Persist them; the file name carries the index that was analysed
verse_results.to_csv(
    f"chapter5_data/verse_sentiment_results_index_{target_index}.csv",
    index=False,
)


In [None]:
# Specify the target index
target_index = 19

# Filter for the specified index and negative sentiment
negative_verses_index = bible_data[
    (bible_data['Index'] == target_index) & (bible_data['sentiment'] == 'NEGATIVE')
]

# Sort the filtered data by Version, Book, Chapter, and Verse
ordered_negative_verses_index = negative_verses_index.sort_values(by=['Version', 'Book', 'Chapter', 'Verse'])

# Reorder the columns for the desired output
ordered_negative_verses_index = ordered_negative_verses_index[['Version', 'Book', 'Chapter', 'Verse', 'Text', 'sentiment', 'sentiment_score']]

# Display the first few rows to verify
print(f"Negative verses for Index {target_index}, grouped by Version:")
print(ordered_negative_verses_index.head())


# Save to a CSV file. The name is derived from target_index so that changing the
# index above cannot silently overwrite the results of another index
# (for target_index = 19 this is the same path as before).
ordered_negative_verses_index.to_csv(
    f"chapter5_data/verse_sentiment_results_index{target_index}_negative_verses.csv",
    index=False,
)


Negative verses for Index 19, grouped by Version:
      Version                                        Book  Chapter  Verse  \
20921     DRB  The Second Epistle of St. John the Apostle        1      7   
20923     DRB  The Second Epistle of St. John the Apostle        1      9   
20934     KJV          The Second Epistle General of John        1      7   
20936     KJV          The Second Epistle General of John        1      9   
20937     KJV          The Second Epistle General of John        1     10   

                                                                                                                                                                    Text  \
20921                         For many seducers are gone out into the world who confess not that Jesus Christ is come in the flesh. This is a seducer and an antichrist.   
20923      Whosoever revolteth and continueth not in the doctrine of Christ hath not God. He that continueth in the doctrine, the same hath bot

Sentiment Analysis over entire books

In [None]:
import pandas as pd

# Read the verse-level corpus (tab-separated)
bible_data = pd.read_csv(
    "chapter5_data/sorted_aligned_bible_final.tsv",
    sep="\t",
)

# One row per (Book, Version): all of its verses joined with single spaces
book_texts = (
    bible_data
    .groupby(['Book', 'Version'])['Text']
    .apply(' '.join)
    .reset_index()
)




In [None]:
import pandas as pd
from transformers import pipeline, AutoTokenizer

# Read the corpus; the cells below use its 'Book', 'Version' and 'Text' columns
bible_data = pd.read_csv(
    "chapter5_data/sorted_aligned_bible_final.tsv",
    sep="\t",
)

# Classifier and a stand-alone tokenizer, both loaded from the same checkpoint
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
sentiment_analysis = pipeline(
    "sentiment-analysis",
    model=model_name,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def chunk_by_tokens(text, tokenizer, max_length=512):
    """Split ``text`` into pieces that each fit within ``max_length`` model tokens.

    The text is tokenized once *without* special tokens, the ids are cut into
    consecutive slices, and every slice is decoded back to a string. Each slice
    leaves room for the special tokens the tokenizer adds when the chunk is
    tokenized again for the model, so a chunk does not have to be truncated
    (and lose content) just because those tokens were not budgeted for.

    Args:
        text: The full text to split.
        tokenizer: A Hugging Face tokenizer (callable, with ``decode`` and
            ``num_special_tokens_to_add``).
        max_length: Maximum sequence length, special tokens included.

    Returns:
        A list of decoded text chunks in document order; empty if ``text``
        produces no tokens.

    Raises:
        ValueError: If ``max_length`` leaves no room for any content token.
    """
    # Tokens the tokenizer will wrap around a single sequence
    reserved = tokenizer.num_special_tokens_to_add(pair=False)
    body_length = max_length - reserved
    if body_length <= 0:
        raise ValueError(
            f"max_length={max_length} leaves no room for text after reserving "
            f"{reserved} special tokens"
        )

    # Plain Python list of ids; no tensor backend is needed just to slice them
    input_ids = tokenizer(
        text,
        add_special_tokens=False,
        truncation=False,
        padding=False,
    )['input_ids']

    # NOTE(review): decoding and re-tokenizing a slice is not guaranteed to give
    # back exactly the same ids (e.g. a word split at a chunk boundary), so
    # callers may still want truncation enabled as a safety net.
    return [
        tokenizer.decode(input_ids[start:start + body_length], skip_special_tokens=True)
        for start in range(0, len(input_ids), body_length)
    ]

def analyze_large_text(text, tokenizer, sentiment_analysis, max_length=512):
    """Classify a long text chunk by chunk and summarise the chunk predictions.

    Args:
        text: The full text to analyse.
        tokenizer: Tokenizer handed to ``chunk_by_tokens``.
        sentiment_analysis: Callable classifier; called as
            ``sentiment_analysis(chunk, truncation=True)`` and expected to
            return a sequence whose first item has 'label' and 'score' keys.
        max_length: Chunk size forwarded to ``chunk_by_tokens``.

    Returns:
        ``(dominant_sentiment, avg_score)``. The label is 'POSITIVE' only when
        POSITIVE chunks strictly outnumber NEGATIVE ones; ties and the case of
        no successfully processed chunk both yield 'NEGATIVE'. ``avg_score`` is
        the mean of the per-chunk scores over all chunks regardless of label,
        or 0 when no chunk was processed.
    """
    chunk_results = []
    for piece in chunk_by_tokens(text, tokenizer, max_length):
        # Best effort: a failing chunk is reported and skipped, not fatal
        try:
            chunk_results.append(sentiment_analysis(piece, truncation=True)[0])
        except Exception as err:
            print(f"Error processing chunk: {piece[:50]}... | Error: {err}")

    # Majority vote over the chunk labels
    labels = [outcome['label'] for outcome in chunk_results]
    positive_votes = labels.count('POSITIVE')
    negative_votes = labels.count('NEGATIVE')

    if chunk_results:
        avg_score = sum(outcome['score'] for outcome in chunk_results) / len(chunk_results)
    else:
        avg_score = 0

    if positive_votes > negative_votes:
        return 'POSITIVE', avg_score
    return 'NEGATIVE', avg_score

# Prepare dataset: Combine verses by Book and Version
book_texts = bible_data.groupby(['Book', 'Version'])['Text'].apply(' '.join).reset_index()

# Apply sentiment analysis to each book
results = []
for _, row in book_texts.iterrows():
    book = row['Book']
    version = row['Version']
    text = row['Text']
    print(f"Analyzing Book: {book} | Version: {version}...")  # Progress update
    dominant_sentiment, avg_score = analyze_large_text(text, tokenizer, sentiment_analysis)
    results.append({'Book': book, 'Version': version, 'Sentiment': dominant_sentiment, 'Average Score': avg_score})

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Save the results to a CSV file.
# The directory was misspelled as "chatper5_data", which made the write fail
# (or land in a stray folder) only after the whole analysis had already run;
# it now matches the "chapter5_data" directory used by every other cell.
results_df.to_csv("chapter5_data/book_sentiment_analysis.csv", index=False)

# Preview the results
print(results_df.head())


Token indices sequence length is longer than the specified maximum sequence length for this model (12620 > 512). Running this sequence through the model will result in indexing errors


Analyzing Book: 1 Corinthians | Version: OEB...
Analyzing Book: 1 Corinthians | Version: WEB...
Analyzing Book: 1 John | Version: OEB...
Analyzing Book: 1 John | Version: WEB...
Analyzing Book: 1 Peter | Version: OEB...
Analyzing Book: 1 Peter | Version: WEB...
Analyzing Book: 1 Thessalonians | Version: OEB...
Analyzing Book: 1 Thessalonians | Version: WEB...
Analyzing Book: 1 Timothy | Version: OEB...
Analyzing Book: 1 Timothy | Version: WEB...
Analyzing Book: 2 Corinthians | Version: OEB...
Analyzing Book: 2 Corinthians | Version: WEB...
Analyzing Book: 2 John | Version: OEB...
Analyzing Book: 2 John | Version: WEB...
Analyzing Book: 2 Peter | Version: OEB...
Analyzing Book: 2 Peter | Version: WEB...
Analyzing Book: 2 Thessalonians | Version: OEB...
Analyzing Book: 2 Thessalonians | Version: WEB...
Analyzing Book: 2 Timothy | Version: OEB...
Analyzing Book: 2 Timothy | Version: WEB...
Analyzing Book: 3 John | Version: OEB...
Analyzing Book: 3 John | Version: WEB...
Analyzing Book: Co

# Test for practical significance (Cohen's h effect size)

In [None]:
import pandas as pd
import numpy as np

def automated_categorization(data):
    """Compare the share of POSITIVE verses between every pair of versions of each book.

    For each book, every unordered pair of rows is compared once (in row
    order). The proportion of POSITIVE verses is computed for both rows, the
    difference is expressed as Cohen's h, and |h| is mapped to a label.

    Args:
        data: DataFrame with one row per (Book, Version) and the columns
            'Book', 'Version', 'POSITIVE' and 'NEGATIVE' (verse counts).

    Returns:
        DataFrame with the columns Book, Version_A, Version_B, Proportion_A,
        Proportion_B, "Effect Size (Cohen's h)" (absolute value) and
        "Practical Significance". When a row has no counted verses its
        proportion and the effect size are NaN and the label is
        "Undefined (no observations)". The columns are present even when no
        pair exists.
    """
    columns = [
        "Book",
        "Version_A",
        "Version_B",
        "Proportion_A",
        "Proportion_B",
        "Effect Size (Cohen's h)",
        "Practical Significance",
    ]

    def _positive_share(row):
        # Share of POSITIVE verses; NaN if the row has no (or invalid) counts,
        # so an empty group is never turned into a bogus proportion
        total = row['NEGATIVE'] + row['POSITIVE']
        return row['POSITIVE'] / total if total > 0 else float('nan')

    def _label(effect):
        # NaN compares False against every threshold and would otherwise fall
        # through to "Large difference"
        if np.isnan(effect):
            return "Undefined (no observations)"
        # NOTE(review): these cut-offs are stricter than Cohen's conventional
        # benchmarks for h (0.2 small, 0.5 medium, 0.8 large) - confirm intended.
        if effect < 0.1:
            return "Negligible difference"
        if effect < 0.2:
            return "Small difference"
        if effect < 0.5:
            return "Moderate difference"
        return "Large difference"

    results = []

    # sort=False keeps books in order of first appearance
    for book, book_data in data.groupby('Book', sort=False):
        rows = book_data.to_dict('records')

        # Each unordered pair exactly once, by position rather than index label
        for position, row_a in enumerate(rows):
            p_a = _positive_share(row_a)
            for row_b in rows[position + 1:]:
                p_b = _positive_share(row_b)

                # Cohen's h: difference of arcsine-transformed proportions
                h = abs(2 * (np.arcsin(np.sqrt(p_a)) - np.arcsin(np.sqrt(p_b))))

                results.append({
                    "Book": book,
                    "Version_A": row_a['Version'],
                    "Version_B": row_b['Version'],
                    "Proportion_A": p_a,
                    "Proportion_B": p_b,
                    "Effect Size (Cohen's h)": h,
                    "Practical Significance": _label(h),
                })

    return pd.DataFrame(results, columns=columns)

# Read the per-book, per-version sentiment counts
sentiment_data = pd.read_csv(
    "chapter5_data/sentiment_counts_by_book_and_version.csv"
)

# Pairwise effect sizes between versions of each book
automated_results = automated_categorization(sentiment_data)

# Write the comparison table to disk and confirm
automated_results.to_csv(
    "chapter5_data/sa_significance_results.csv",
    index=False,
)
print("Results saved to sa_significance_results.csv")


Results saved to sa_significance_results.csv
