In [None]:
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords

# Tab-separated Bible text; later code reads its 'Version', 'Book' and 'Text' columns.
file_path = "sorted_aligned_bible_final.tsv" 
bible_data = pd.read_csv(file_path, sep="\t")

# Load stop words
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    # The NLTK stopwords corpus is not bundled with the package; fetch it once
    # so the notebook survives Restart & Run All on a fresh environment.
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

# Function to clean and preprocess text
def clean_text(text, stopword_set=None):
    """Normalize a piece of text for bag-of-words style analysis.

    Strips every character that is not an ASCII letter or whitespace,
    lowercases the result, drops stop words and collapses runs of whitespace
    into single spaces.

    Args:
        text: Raw text. Any non-string value (e.g. the NaN pandas uses for a
            missing cell, or None) is treated as empty.
        stopword_set: Optional collection of lowercase words to remove.
            Defaults to the module-level ``stop_words`` set.

    Returns:
        The cleaned text as a single space-separated string ('' if nothing
        remains).
    """
    if not isinstance(text, str):
        return ''
    if stopword_set is None:
        stopword_set = stop_words

    # Remove special characters and numbers. Do NOT pass re.I positionally
    # here: the 4th positional argument of re.sub is `count`, so re.I (== 2)
    # would cap the substitution at two characters. The character class
    # already spells out both cases, so no flag is needed.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()  # Convert to lowercase

    # split() with no argument also collapses extra whitespace.
    return ' '.join(word for word in text.split() if word not in stopword_set)

# Store the normalized form of every 'Text' entry in a new 'cleaned_text' column
bible_data['cleaned_text'] = bible_data['Text'].map(clean_text)


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Keep only rows that actually have verse text: ' '.join raises TypeError on NaN.
# (Assumes missing verses are read as NaN by read_csv — TODO confirm against the TSV.)
verses = bible_data.dropna(subset=['Text']).astype({'Text': str})

# Group the data by 'Version' and 'Book', and aggregate the text by concatenating verses for each book
grouped_books = (
    verses.groupby(['Version', 'Book'])['Text']
    .agg(' '.join)
    .reset_index()
)

# Initialize the TfidfVectorizer to create a TF-IDF matrix
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=20)  # Limit to top 20 important terms
X_tfidf = tfidf_vectorizer.fit_transform(grouped_books['Text'])

# Convert the TF-IDF matrix to a DataFrame (one row per grouped book, same order as grouped_books)
tfidf_df = pd.DataFrame(
    X_tfidf.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
    index=grouped_books.index,
)

# Add 'Version' and 'Book' columns to the DataFrame for reference
tfidf_df['Version'] = grouped_books['Version']
tfidf_df['Book'] = grouped_books['Book']

# Apply VADER sentiment analysis verse by verse and average per book.
# VADER's compound score is a normalized sum of word valences, so scoring a
# whole book in one call saturates near +/-1 (book-level scores came out as
# 0.9989-1.0 for every book). Averaging per-verse scores keeps the result in
# [-1, 1] while preserving differences between books.
analyzer = SentimentIntensityAnalyzer()
verse_compound = verses['Text'].map(
    lambda verse: analyzer.polarity_scores(verse)['compound']
)
book_compound = verse_compound.groupby([verses['Version'], verses['Book']]).mean()

sentiment_results = [
    {'Version': version, 'Book': book, 'Sentiment Score': score}
    for (version, book), score in book_compound.items()
]

# Convert sentiment results into a DataFrame (explicit columns keep the merge keys present even if empty)
sentiment_df = pd.DataFrame(sentiment_results, columns=['Version', 'Book', 'Sentiment Score'])

# Combine the TDM with sentiment analysis results; each (Version, Book) pair
# must appear exactly once on both sides.
tdm_sentiment_df = pd.merge(
    tfidf_df,
    sentiment_df,
    on=['Version', 'Book'],
    how='left',
    validate='one_to_one',
)

# Display the final dataframe with TDM and Sentiment Scores (rich display; no print needed)
tdm_sentiment_df.head()





     christ      come     earth       god      good      hath     jesus  \
0  0.088992  0.185081  0.416144  0.350715  0.003262  0.195990  0.053845   
1  0.000000  0.032488  0.011819  0.048960  0.013433  0.068606  0.000000   
2  0.022038  0.025533  0.088778  0.202838  0.025947  0.177926  0.000776   
3  0.000000  0.060425  0.000000  0.033823  0.009280  0.167278  0.000000   
4  0.027936  0.038574  0.101354  0.264507  0.069118  0.311466  0.028523   

       king       law       let  ...     shall      thee    things      thou  \
0  0.024931  0.000000  0.059948  ...  0.556330  0.154375  0.150220  0.369093   
1  0.939443  0.011276  0.038404  ...  0.052070  0.080058  0.059286  0.234231   
2  0.035926  0.026830  0.082546  ...  0.352251  0.213557  0.032109  0.417819   
3  0.000000  0.233695  0.028426  ...  0.083935  0.422933  0.000000  0.619717   
4  0.000000  0.290087  0.197596  ...  0.500107  0.024230  0.066314  0.384630   

        thy      unto        ye  Version  \
0  0.280473  0.143498  0

Unnamed: 0,christ,come,earth,god,good,hath,jesus,king,law,let,...,shall,thee,things,thou,thy,unto,ye,Version,Book,Sentiment Score
0,0.088992,0.185081,0.416144,0.350715,0.003262,0.19599,0.053845,0.024931,0.0,0.059948,...,0.55633,0.154375,0.15022,0.369093,0.280473,0.143498,0.027822,DRB,The Apocalypse of St. John the Apostle,1.0
1,0.0,0.032488,0.011819,0.04896,0.013433,0.068606,0.0,0.939443,0.011276,0.038404,...,0.05207,0.080058,0.059286,0.234231,0.154007,0.023638,0.018332,DRB,The Book of Esther,0.9998
2,0.022038,0.025533,0.088778,0.202838,0.025947,0.177926,0.000776,0.035926,0.02683,0.082546,...,0.352251,0.213557,0.032109,0.417819,0.670371,0.079956,0.073126,DRB,The Book of Psalms,1.0
3,0.0,0.060425,0.0,0.033823,0.00928,0.167278,0.0,0.0,0.233695,0.028426,...,0.083935,0.422933,0.0,0.619717,0.514235,0.054434,0.047492,DRB,The Book of Ruth,0.9989
4,0.027936,0.038574,0.101354,0.264507,0.069118,0.311466,0.028523,0.0,0.290087,0.197596,...,0.500107,0.02423,0.066314,0.38463,0.07924,0.020271,0.165065,DRB,The Catholic Epistle of St. James the Apostle,0.9997
