In [38]:
import os
import re
import pandas as pd
import nltk
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

# Fetch the NLTK data packages this notebook relies on (a no-op message is
# printed for packages that are already present locally).
for resource in ('punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(resource)

# Shared module-level helpers used by lemmatize() and remove_stop_words().
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def get_wordnet_pos(treebank_tag):
    """
    Map a Treebank POS tag onto the corresponding WordNet POS constant.

    Only the first character of the tag is inspected:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.
    Any other tag (including an empty one) falls back to noun.
    """
    pos_by_prefix = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Slicing (rather than indexing) keeps an empty tag from raising.
    return pos_by_prefix.get(treebank_tag[:1], wordnet.NOUN)

def normalize_text(text):
    """
    Return a normalized copy of *text*.

    The text is lowercased, every character that is neither a word character
    nor whitespace is deleted (not replaced by a space, so "ever-evolving"
    becomes "everevolving"), and runs of whitespace are collapsed to a single
    space with leading/trailing whitespace trimmed.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    collapsed = re.sub(r'\s+', ' ', without_punctuation)
    return collapsed.strip()

def remove_stop_words(tokens):
    """
    Return a new list holding the tokens that are not in the module-level
    ``stop_words`` set; the original order is preserved.
    """
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

def lemmatize(tokens):
    """
    Lemmatize a list of tokens.

    The tokens are POS-tagged with ``nltk.pos_tag``; each tag is converted to
    a WordNet POS via ``get_wordnet_pos`` and handed to the shared
    ``lemmatizer`` together with the token. Returns the list of lemmas in
    input order.
    """
    lemmas = []
    for token, tag in nltk.pos_tag(tokens):
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))
    return lemmas

def is_similar(title1, title2, threshold=0.5):
    """
    Decide whether two titles are similar using token-level overlap.

    Each title is tokenized and lemmatized, and the resulting tokens are
    de-duplicated into a set. The score is:
        similarity = (number of shared tokens) / (size of the larger token set)
    Returns True when the score is at least *threshold*, and False when
    either title produces no tokens.

    Note: the titles are compared as given, so the match is sensitive to
    case and punctuation tokens.
    """
    token_sets = [set(lemmatize(word_tokenize(title))) for title in (title1, title2)]
    # An empty title can never match (and would make the ratio meaningless).
    if not all(token_sets):
        return False
    first, second = token_sets
    shared = first & second
    larger_size = max(len(first), len(second))
    return len(shared) / larger_size >= threshold

def extract_pdf_title(filename):
    """
    Extract the title part from a text filename formatted as "KPMG_DATE_TITLE.txt".

    A trailing ".txt" extension (matched case-insensitively) is removed; names
    without that extension are used as-is instead of blindly losing their last
    four characters. The base name is then split on '_' at most twice, so
    underscores inside the title itself are preserved.

    For example:
      "KPMG_2025-02-06_KPMG global tech report 2024.txt" -> "KPMG global tech report 2024"

    Returns the stripped third part when the name has all three parts,
    otherwise the whole base name.
    """
    # Only strip a real ".txt" suffix; slicing unconditionally would truncate
    # the title of any name that lacks the extension.
    base = filename[:-4] if filename.lower().endswith('.txt') else filename
    parts = base.split('_', 2)
    if len(parts) == 3:
        return parts[2].strip()
    return base

# --------------------------
# Step 1. Load the Original CSV
# --------------------------
csv_file = "insights-details-kpmg.csv"
df = pd.read_csv(csv_file)

# --------------------------
# Step 2. Build a List of PDF Text Files Info from the "txt" Folder
# --------------------------
txt_folder = "txt"
pdf_text_files = []
# os.listdir() returns entries in arbitrary order. The title matching below
# keeps the FIRST similar PDF, so iterate in sorted order to make the result
# reproducible across machines and runs.
for file in sorted(os.listdir(txt_folder)):
    if not file.lower().endswith('.txt'):
        continue  # ignore anything that is not an extracted-text file
    pdf_title = extract_pdf_title(file)
    file_path = os.path.join(txt_folder, file)
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    pdf_text_files.append({
        "filename": file,
        "pdf_title": pdf_title,
        "pdf_content": content
    })

# --------------------------
# Step 3. Process Each CSV Row, Find Matching PDF Content, and Concatenate
# --------------------------
def _cell_text(value):
    """Return a CSV cell as text, mapping missing values (None/NaN) to ''."""
    # str(float('nan')) is the literal "nan", which would otherwise leak into
    # the title matching and into every derived text column.
    return "" if pd.isna(value) else str(value)


new_data = []
for _, row in df.iterrows():
    csv_title = _cell_text(row.get("Title", "")).strip()

    # Use the content of the first PDF whose title is similar enough to the
    # CSV title; rows without a match keep an empty pdf_content.
    matched_pdf_content = ""
    for pdf_file in pdf_text_files:
        if is_similar(csv_title, pdf_file["pdf_title"], threshold=0.5):
            matched_pdf_content = pdf_file["pdf_content"]
            break

    csv_content = _cell_text(row.get("Content", ""))
    concatenated = csv_content + " " + matched_pdf_content

    # Normalize concatenated text and tokenize it once; both derived columns
    # below start from the same word tokens.
    normalized_text = normalize_text(concatenated)
    words = word_tokenize(normalized_text)

    # For the "tokenized_text" column: generate only 2-gram tokens
    two_gram_tokens = ['_'.join(ngram) for ngram in ngrams(words, 2)]
    tokenized_text = ' '.join(two_gram_tokens)

    # For the "normalized_concatenated_text" column: only word-level tokens,
    # with stop words removed before lemmatization
    filtered_tokens = remove_stop_words(words)
    lemmatized_tokens = lemmatize(filtered_tokens)
    lemmatized_text = ' '.join(lemmatized_tokens)

    new_row = {
        # Fall back to a "Link" column when the CSV has no "url_link" column
        "url_link": row.get("url_link", row.get("Link", "")),
        "Title": csv_title,
        "Description": row.get("Description", ""),
        "Date": row.get("Date", ""),
        "Content": csv_content,
        "Pdf_link": row.get("Pdf_link", ""),
        "pdf_content": matched_pdf_content,
        "concatenated_text": concatenated,  # Original concatenated text
        "tokenized_text": tokenized_text,    # Contains only 2-gram tokens
        "normalized_concatenated_text": lemmatized_text  # Contains normalized, stop-word removed, and lemmatized words
    }
    new_data.append(new_row)

# --------------------------
# Step 4. Save the New Data to a gzip-compressed Parquet File
# --------------------------
output_parquet = "final_concatenated_insights_gzip.parquet"

# One DataFrame row per processed CSV row, written out in a single call.
new_df = pd.DataFrame(new_data)
new_df.to_parquet(output_parquet, compression='gzip')

print(f"Final parquet saved as {output_parquet}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Final parquet saved as final_concatenated_insights_gzip.parquet


In [39]:
# Report the mean character length of each text column, in the same order as
# before: lemmatized text, 2-gram text, then the raw concatenated text.
for column in ('normalized_concatenated_text', 'tokenized_text', 'concatenated_text'):
    lengths = new_df[column].apply(len)
    average_length = lengths.mean()
    print(f"Average length of '{column}':", average_length)



Average length of 'normalized_concatenated_text': 41062.71428571428
Average length of 'tokenized_text': 108290.0
Average length of 'concatenated_text': 56169.57142857143


In [33]:
# Preview the first rows of the assembled DataFrame as a quick sanity check.
new_df.head()

Unnamed: 0,url_link,Title,Description,Date,Content,Pdf_link,pdf_content,concatenated_text,tokenized_text,normalized_concatenated_text
0,https://kpmg.com/in/en/insights/2025/02/aau-ac...,Issue no. 103 | February 2025,This edition of AAU covers relevant financial ...,2025-02-28 00:00:00,"Ind AS 103, Business Combination provides guid...",https://kpmg.com/content/dam/kpmgsites/in/pdf/...,February 2025\nkpmg.com/in\nAccounting and \nA...,"Ind AS 103, Business Combination provides guid...",ind as 103 business combination provides guida...,ind 103 business combination provide guidance ...
1,https://kpmg.com/in/en/insights/2025/02/food-a...,Food and Nutritional Security in India,Solutions for achieving zero hunger and ensuri...,2025-02-20 00:00:00,Food security has been a critical aspect of In...,https://kpmg.com/content/dam/kpmgsites/in/pdf/...,Food and nutritional \nsecurity in India\nSolu...,Food security has been a critical aspect of In...,food security has been a critical aspect of in...,food security critical aspect indias public po...
2,https://kpmg.com/in/en/insights/2025/02/financ...,Financial Crime Bulletin,Dive deep into the financial crime avenues and...,2025-02-10 00:00:00,Financial crimes have become an ever-evolving ...,,,Financial crimes have become an ever-evolving ...,financial crimes have become an everevolving p...,financial crime become everevolving problem me...
3,https://kpmg.com/in/en/insights/2025/02/kpmg-g...,KPMG global tech report – industrial manufactu...,"Interoperability, hybrid models and AI innovat...",2025-02-07 00:00:00,In the rapidly evolving landscape of industria...,https://kpmg.com/content/dam/kpmgsites/xx/pdf/...,KPMG global \ntech report 2024\nKPMG Internati...,In the rapidly evolving landscape of industria...,in the rapidly evolving landscape of industria...,rapidly evolve landscape industrial manufactur...
4,https://kpmg.com/in/en/insights/2025/02/kpmg-g...,KPMG global tech report: Technology insights,Tech: A bold sector that innovates while leadi...,2025-02-07 00:00:00,The digital transformation journey is an impor...,https://kpmg.com/content/dam/kpmgsites/xx/pdf/...,KPMG global \ntech report 2024\nKPMG Internati...,The digital transformation journey is an impor...,the digital transformation journey is an impor...,digital transformation journey important strat...
