In [None]:
from pdfminer.high_level import extract_text

In [None]:
def extract_text_from_pdf(pdf_path, output_path):
    """Extract the text of a PDF and save it as a UTF-8 text file.

    Args:
        pdf_path: Path of the PDF to read; handed to ``extract_text``.
        output_path: Path of the text file to create (or overwrite).
    """
    extracted = extract_text(pdf_path)
    with open(output_path, mode='w', encoding='utf-8') as sink:
        sink.write(extracted)

In [None]:

# Dump the text of the Sahih al-Bukhari PDF into a plain-text file.
pdf_path, output_path = 'Sahih_Al-Bukhari.pdf', 'Sahih_Bukhari_extracted.txt'
extract_text_from_pdf(pdf_path=pdf_path, output_path=output_path)

In [None]:
# Dump the text of the Sahih Muslim PDF into a plain-text file.
pdf_path, output_path = 'Sahih_Muslim.pdf', 'Sahih_Muslim_extracted.txt'
extract_text_from_pdf(pdf_path=pdf_path, output_path=output_path)

# Cleaning Sahih al-Bukhari

In [None]:
# First pass over the extracted text: collect every line that mentions
# "Number" as a candidate hadith header. The values are placeholders (None);
# only the keys carry information at this stage.
hadees_no = {}

# The extraction step writes this file as UTF-8, so read it back with the same
# encoding instead of the platform default. The context manager guarantees the
# handle is closed once the scan is done (the original left it open).
with open('Sahih_Bukhari_extracted.txt', 'r', encoding='utf-8') as bukhari:
    for i in bukhari:
        if 'Number' in i:
            hadees_no[i.strip()] = None

In [None]:
import re  # imported here so this cell runs on its own

# Parse the extracted Bukhari text into {header line: hadith text}.
#
# A header is any line containing "Number <n1>", where n1 is the next hadith
# number we expect (1, 2, 3, ...). Every line after a header, up to the next
# header, is collected as that hadith's text.
#
# NOTE(review): headers are only recognised in strictly increasing order
# starting from 1. If the numbering in the source text ever restarts or skips
# a value, later headers will not be detected -- confirm against the file.
hadees_no = {}
capture_text = False
current_hadith = None
text_buffer = []
n1 = 1  # number of the next hadith header to look for

# `with` closes the file even if parsing raises part-way through.
with open('Sahih_Bukhari_extracted.txt', 'r', encoding='utf-8') as bukhari:
    for line in bukhari:
        line = line.strip()
        # \b keeps "Number 1" from also matching "Number 10", "Number 11", ...
        # (the previous plain substring test accepted those).
        if re.search(rf'Number {n1}\b', line):
            if current_hadith is not None:
                # A new header closes the previous hadith: store its text.
                hadees_no[current_hadith] = '\n'.join(text_buffer)
                text_buffer = []
            current_hadith = line
            capture_text = True
            n1 += 1
        elif capture_text:
            text_buffer.append(line)

# Store the final hadith as well. Earlier hadiths are stored even when their
# text is empty, so the last one is treated the same way.
if current_hadith is not None:
    hadees_no[current_hadith] = '\n'.join(text_buffer)

# Print the captured text for verification
for hadith, text in hadees_no.items():
    print(f"{hadith}:\n{text}\n\n---\n")


In [None]:
def clean_key(key):
    """Normalise a hadith reference by replacing its comma separators with spaces.

    The key is split on ',', each piece is stripped of surrounding whitespace,
    and the pieces are re-joined with a single space.
    """
    return ' '.join(piece.strip() for piece in key.split(','))

def clean_value(value):
    """Collapse every run of whitespace in the hadith text to a single space.

    Leading and trailing whitespace is removed as part of the same step.
    """
    words = value.strip().split()
    return ' '.join(words)

# Run both cleaners over the parsed hadiths: keys lose their commas, values
# get their whitespace collapsed.
cleaned_hadees_no = {}
for raw_key, raw_text in hadees_no.items():
    cleaned_hadees_no[clean_key(raw_key)] = clean_value(raw_text)

# Show every cleaned entry for a visual check.
for key, value in cleaned_hadees_no.items():
    print(f"{key}: {value}\n\n---\n")


In [None]:
import pandas as pd

# Build a one-row frame (row label 'Hadees', one column per reference), then
# transpose it so each reference becomes a row with a single 'Hadees' column.
# The index is labelled 'Volume'.
structured_hadees_df = (
    pd.DataFrame(cleaned_hadees_no, index=['Hadees'])
    .transpose()
    .rename_axis('Volume')
)
structured_hadees_df.head(2)

In [None]:
import re

def clean_hadees_text(text):
    """Strip page headers, parenthetical notes and punctuation from a hadith.

    Steps, in order:
      1. Collapse all whitespace to single spaces.
      2. Remove page headers of the form
         ``Volume N - N / N SAHIH BUKHARI VOLUME N > BOOK N: TITLE``.
      3. Remove parentheses together with their contents.
      4. Remove every character that is not a word character, whitespace or
         an apostrophe.
      5. Collapse whitespace again.

    Args:
        text: Raw hadith text.

    Returns:
        The cleaned text, single-spaced and without leading/trailing spaces.
    """
    # Normalise whitespace first: the header pattern is written with single
    # spaces and '.' does not match newlines, so text that wraps across lines
    # would otherwise slip past both substitutions below.
    cleaned_text = ' '.join(text.split())

    # Remove Volume and Book references. The title is matched as whole
    # upper-case words only; the lookahead makes the match back off when the
    # last "word" is really the capital initial of ordinary text. The old
    # `[A-Z ]+` tail turned "... REVELATION Allah" into "llah".
    cleaned_text = re.sub(
        r'Volume \d+ - \d+ / \d+ SAHIH BUKHARI VOLUME \d+ > BOOK \d+: '
        r'[A-Z]+(?: [A-Z]+)*(?![a-z])',
        '',
        cleaned_text,
    )

    # Remove parentheses and their contents
    cleaned_text = re.sub(r'\(.*?\)', '', cleaned_text)

    # Remove special characters (keeping apostrophes)
    cleaned_text = re.sub(r'[^\w\s\']', '', cleaned_text)

    # The removals above can leave double spaces behind ("a - b" -> "a  b"),
    # so whitespace is collapsed once more at the very end.
    return ' '.join(cleaned_text.split())

# Work on a copy so structured_hadees_df keeps the uncleaned text.
cleaned_hadees = structured_hadees_df.copy()

# Clean every hadith, then replace the spelling 'Abii' with 'Abu'.
scrubbed_text = cleaned_hadees['Hadees'].apply(clean_hadees_text)
cleaned_hadees['Hadees'] = scrubbed_text.str.replace('Abii','Abu')

cleaned_hadees.to_csv('Cleaned1_Shahi_bukhari.csv')


In [None]:
from textblob import TextBlob

# Experiment on a single hadith: glue each pair of neighbouring words together
# and run the result through TextBlob(...).correct().
string = []
values = cleaned_hadees.loc['Volume 1 Book 1 Number 5:'].values[0].split()
for w1, w2 in zip(values, values[1:]):
    word = w1 + w2
    correct = str(TextBlob(word).correct())
    # Keep the glued word when correct() leaves it untouched; otherwise keep
    # the first word with a literal 'e' appended.
    # NOTE(review): the reason for the appended 'e' is not evident from the code.
    string.append(correct if word == correct else w1 + 'e')

In [None]:
# Display the list built by the pairwise-correction experiment above.
string

In [None]:
from textblob import TextBlob

# Second correction experiment on one hadith (row 'Volume 1 Book 1 Number 5:'):
# walk the words pairwise, run TextBlob(...).correct() on each adjacent pair,
# and rebuild the sentence word by word in `output`.
values = cleaned_hadees.loc['Volume 1 Book 1 Number 5:'].values[0].split()
output = []
# Value of the word most recently accounted for in `output`; used below to
# avoid appending the same word twice in a row.
# NOTE(review): the comparison is by value, so a word that genuinely appears
# twice in succession in the text is also collapsed to one occurrence.
last_added = None

for i in range(len(values)):
    # Last word: it has no right-hand neighbour to pair with, so append it
    # (unless it equals `last_added`) and stop.
    if i + 1 >= len(values):
        if last_added != values[i]:
            output.append(values[i])
        break

    w1, w2 = values[i], values[i+1]
    combined_correction = str(TextBlob(w1 + " " + w2).correct())

    # correct() changed something in the pair: decide what to keep based on
    # whether w1 on its own is left unchanged by correct().
    if (w1 + " " + w2) != combined_correction:
        w1_corrected = str(TextBlob(w1).correct())
        if w1 == w1_corrected:  # w1 is unchanged by correct(): add it once; w2 is handled on the next iteration
            if last_added != w1:
                output.append(w1)
                last_added = w1
        else:
            # w1 itself is altered by correct(): append the corrected pair
            # (both words) as a single entry.
            # NOTE(review): this append is not guarded by `last_added`, so if
            # the next pair also reaches this branch the corrected w2 is
            # emitted a second time.
            output.append(combined_correction)
            last_added = w2  # mark w2 as already emitted as part of the corrected pair
    else:
        # correct() left the pair unchanged: add w1 unless it was the last word added.
        if last_added != w1:
            output.append(w1)
            last_added = w1

# Join the collected entries with single spaces and print the result.
corrected_string = " ".join(output)
print(corrected_string)


In [None]:
# Derive one narrator tag per hadith from the text in front of the first ':'.
narrators = []

for hadith_text in cleaned_hadees_no.values():
    # Words before the first colon, minus the leading word.
    # NOTE(review): presumably that leading word is "Narrated" -- verify
    # against the data.
    name_tokens = hadith_text.split(':')[0].split()[1:]
    # Append the "RA" suffix, join everything with underscores, and drop
    # apostrophes from the result.
    tag = "_".join(name_tokens + ["RA"])
    narrators.append(tag.replace("'", ""))

print('Total Narrators: ',len(set(narrators)))

In [None]:
# Count how often each narrator tag occurs. The counts column and the index
# are named explicitly: pandas >= 2.0 already calls them 'count' and
# 'narrator', but older releases name the column after the series instead,
# which made the ['count'] lookups below and in later cells raise KeyError.
narrator_count_df = (
    pd.Series(narrators, name='narrator')
    .value_counts()
    .rename('count')
    .rename_axis('narrator')
    .to_frame()
)
total_hadiths = len(cleaned_hadees_no)  # Total number of hadiths as the denominator for the ratio

# Share of all hadiths attributed to each narrator, expressed as a percentage.
narrator_count_df['Ratio'] = (narrator_count_df['count'] / total_hadiths)*100

narrator_count_df

In [None]:
def cleaner(hadees):
    """Ask the OpenAI chat API to correct spelling mistakes in one hadith.

    The API key is no longer embedded in the notebook. The client is created
    without an explicit key, so it reads the ``OPENAI_API_KEY`` environment
    variable; set it before running this cell. The key that used to be
    hard-coded here must be treated as leaked and revoked.

    Args:
        hadees: Text of a single hadith.

    Returns:
        The model's reply with surrounding whitespace stripped, or an empty
        string when the reply carries no text content.
    """
    from openai import OpenAI

    # No api_key argument: the client picks up OPENAI_API_KEY from the environment.
    client = OpenAI()
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "text cleaning agent where you are going to be given single hadees from sahih hadees and you are supposed to correct spell mistakes"},
            {"role": "user", "content": hadees},
        ],
    )

    # Chat completions carry the reply in `.message.content`; the `.text`
    # attribute used previously is not where a chat choice stores its reply.
    corrected_text = (response.choices[0].message.content or "").strip()

    return corrected_text

# Calls cleaner() once for every row of the 'Hadees' column. The resulting
# Series is only displayed as the cell output; it is not assigned anywhere, so
# `cleaned_hadees` itself is left unchanged.
cleaned_hadees['Hadees'].apply(cleaner)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Word cloud of narrator tags, weighted by the 'count' column.
narrator_frequencies = narrator_count_df['count'].to_dict()

cloud_options = dict(width=800, height=400, background_color='white')
wordcloud = WordCloud(**cloud_options).generate_from_frequencies(narrator_frequencies)

# Render the cloud without axes, save it to disk, then show it inline.
plt.figure(figsize=(10, 5), dpi=1200)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig('narrators.png', bbox_inches='tight')
plt.show()

In [None]:
import numpy as np

# Bar chart of the first 20 rows of the 'Ratio' column.
top_narrators = narrator_count_df['Ratio'].head(20)
# One shade of the reversed Greens colormap per bar.
colors = plt.cm.Greens_r(np.linspace(0, 1, len(top_narrators)))

ax = top_narrators.plot(
    kind='bar',
    figsize=(14, 7),
    color=colors,
    edgecolor='black',
)

plt.title('Top 20 Narrators by Ratio', fontsize=20)
plt.xlabel('Narrator', fontsize=14)
plt.ylabel('Ratio (%)', fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.yticks(fontsize=12)
plt.grid(axis='y', linestyle='--', linewidth=0.7)
plt.tight_layout()

# Label every bar with its percentage, placed 10 points above the bar top.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f"{bar_height:.2f}%",
        (bar_centre, bar_height),
        ha='center', va='center', fontsize=10, color='black', rotation=0,
        xytext=(0, 10), textcoords='offset points',
    )

plt.savefig('top_narrator_ratio.png', bbox_inches='tight')  # Save the figure as a .png file
plt.show()