In [None]:
import pandas as pd
from rapidfuzz import fuzz

# Reference sentences: each one is looked up in the extracted text below.
sample_texts = [
    'India is followed by the Democratic Republic of Congo on 178th rank, Bangladesh on 179th and Burundi on 180th.',
    'Top five nations with the best EPI index are Switzerland, France, Denmark, Malta and Sweden respectively.',
    'Long and Short Paragraphs on Pollution in English',
    'Environmental pollution refers to the presence of harmful and poisonous substances into our environment...',
    'Mobile application development is the process of making software for smartphones, tablets and digital assistants...'
]
data = {'text_column': sample_texts}
text_df = pd.DataFrame(data)

# Raw text to search in; paragraphs are separated by a blank line.
extracted_text = """
India is followed by the Democratic Republic of Congo on 178th rank, Bangladesh on 179th and Burundi on 180th. 
Top five nations with the best EPI index are Switzerland, France, Denmark, Malta and Sweden respectively

Long and Short Paragraphs on Pollution in English
Below we have provided both long and short paragraphs on pollution of varying word lengths...

Environmental pollution refers to the presence of harmful and poisonous substances into our environment...
Mobile application development is the process of making software for smartphones, tablets and digital assistants...
"""

# One row per blank-line-separated paragraph (whitespace is kept as-is).
paragraphs = extracted_text.split('\n\n')
paragraph_df = pd.DataFrame({'Paragraph': paragraphs})


def _best_match(target_text):
    """Return the best-scoring paragraph for ``target_text`` as a result record.

    Every paragraph in ``paragraph_df`` is scored against ``target_text`` with
    ``fuzz.token_set_ratio``; the record holds the paragraph with the highest
    score, the text that was looked up, and that score.
    """
    scores = paragraph_df['Paragraph'].apply(
        lambda candidate: fuzz.token_set_ratio(target_text, candidate)
    )
    top_label = scores.idxmax()
    return {
        'Paragraph': paragraph_df.loc[top_label, 'Paragraph'],
        'Match': target_text,
        'Match Score': scores[top_label],
    }


# One result record per reference sentence, in the order of text_df.
match_results = [_best_match(text) for text in text_df['text_column']]

# Tabulate the records: columns "Paragraph", "Match" and "Match Score".
match_df = pd.DataFrame(match_results)

print(match_df)


In [None]:
# Save the text extracted from the PDF into a DataFrame
import pandas as pd
import re

# Sample extracted text: a numbered outline where "N." lines are section
# headings, "N.M" lines are numbered items, and unnumbered lines continue the
# item directly above them.
extracted_text = """
1. Objective
1.11 India is followed by the Democratic Republic of Congo on 178th rank, Bangladesh on 179th and Burundi on 180th.
Top five nations with the best EPI index are Switzerland, France, Denmark, Malta and Sweden respectively.
1.2 Long and Short Paragraphs on Pollution in English
Below we have provided both long and short paragraphs on pollution of varying word lengths...
1.3 Environmental pollution refers to the presence of harmful and poisonous substances into our environment...
2. Application development authority
2.1 Mobile application development is the process of making software for smartphones, tablets and digital assistants...
2.3 Mobile app development is rapidly growing. From retail, telecommunications and e-commerce to insurance...
"""

# Split the extracted text into lines
lines = extracted_text.strip().split('\n')

# "N.M <text>" -> numbered item; the text after the number becomes the title.
title_pattern = r'^\d+\.\d+\s+(.*)$'
# "N. <text>" -> section heading; it is neither a title nor a description.
section_pattern = r'^\d+\.\s+'

# Parallel lists: descriptions[i] always belongs to titles[i], so both lists
# have the same length and can be turned into DataFrame columns safely.
titles = []
descriptions = []

# True while the most recent structural line was a numbered item, i.e. while
# unnumbered lines should be attached to titles[-1].
inside_item = False

for line in lines:
    line = line.strip()
    if not line:
        continue  # blank lines carry no content

    title_match = re.match(title_pattern, line)
    if title_match:
        # Start a new record with an (initially) empty description.
        titles.append(title_match.group(1))
        descriptions.append('')
        inside_item = True
    elif re.match(section_pattern, line):
        # A section heading ends the current item; text that follows it must
        # not be glued onto the previous item's description.
        inside_item = False
    elif inside_item:
        # Unnumbered continuation line: extend the current item's description.
        descriptions[-1] = (descriptions[-1] + ' ' + line).strip()

# Create a DataFrame from titles and descriptions
data = {'Title': titles, 'Description': descriptions}
result_df = pd.DataFrame(data)

# Print the resulting DataFrame
print(result_df)


In [None]:
import pandas as pd

# Sample extracted text: one heading or one descriptive sentence per line.
extracted_text = """
India is followed by the Democratic Republic of Congo on 178th rank, Bangladesh on 179th and Burundi on 180th.
Top five nations with the best EPI index are Switzerland, France, Denmark, Malta and Sweden respectively.
Long and Short Paragraphs on Pollution in English
Below we have provided both long and short paragraphs on pollution of varying word lengths...
Environmental pollution refers to the presence of harmful and poisonous substances into our environment...
Mobile application development is the process of making software for smartphones, tablets and digital assistants...
"""

# Split the text into non-empty, trimmed lines. Splitting on '.' is avoided on
# purpose: it breaks every "..." into empty fragments and fuses a heading that
# has no trailing period with the line that follows it.
sentences = [line.strip() for line in extracted_text.splitlines() if line.strip()]

# Parallel lists: descriptions[i] is the text collected under titles[i].
titles = []
descriptions = []

# Define a threshold for sentence length to distinguish titles from descriptions
title_sentence_length = 8  # Adjust as needed

# State of the record currently being built.
current_title = ""          # "" until the first title line is seen
current_description = []    # long lines collected under current_title

for sentence in sentences:
    if len(sentence.split()) <= title_sentence_length:
        # A short line starts a new record; store the one in progress first.
        if current_title or current_description:
            titles.append(current_title)
            descriptions.append(" ".join(current_description))
        current_title = sentence
        current_description = []
    else:
        # A long line is description text for the current title.
        current_description.append(sentence)

# Store the final record, which no later title line would otherwise flush.
if current_title or current_description:
    titles.append(current_title)
    descriptions.append(" ".join(current_description))

# Create a DataFrame from titles and descriptions
data = {'Title': titles, 'Description': descriptions}
result_df = pd.DataFrame(data)

# Print the resulting DataFrame
print(result_df)

