In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [20]:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")


In [9]:
import pandas as pd
df=pd.read_csv('../../results/bertopic/results.csv')
df['Question'] = df['Question'].astype(str).apply(lambda x: x.lower())

In [11]:
df.head()

Unnamed: 0,Question,doclen,Topic,Topic_Label
0,"like everyone else (here in u.s), i work with ...",125,-1,General
1,hello dear's people. i have a fictief research...,80,6,Gun Safety Regulations
2,a lady buys goods worth 200 bucks from a shop ...,77,2,Drug Testing and Probability
3,i am turning 25 in about a month and am curren...,73,1,Job Prospects by Major
4,i'm a 34 years old married to a woman. i had a...,73,4,Dating and Communication


In [12]:
topic_id_name_map = df[['Topic', 'Topic_Label']].drop_duplicates().set_index('Topic')['Topic_Label'].to_dict()
print(topic_id_name_map)

{-1: 'General', 6: 'Gun Safety Regulations', 2: 'Drug Testing and Probability', 1: 'Job Prospects by Major', 4: 'Dating and Communication', 3: 'Universe and Energy', 5: 'Snapchat Account Recovery', 0: 'Cambodia Earthquake and Historical Battles'}


In [13]:
grouped = df.groupby('Topic')

In [14]:
for topic_id, topic_name in topic_id_name_map.items():
    topic_df = grouped.get_group(topic_id).drop(columns='Topic')
    print(f'Topic: {topic_name}')
    print(topic_df.head())
    print('\n\n')

Topic: General
                                             Question  doclen Topic_Label
0   like everyone else (here in u.s), i work with ...     125     General
5   my employer has told me that we can not accept...      72     General
10  i feel like i'm getting over my crush. this fe...      67     General
16  i belong to obc. my ctc is 7 lacs per annum bu...      64     General
17  i have 2.6 years of it experience. i got marri...      64     General



Topic: Gun Safety Regulations
                                              Question  doclen  \
1    hello dear's people. i have a fictief research...      80   
26   i am supposed to wear contacts but i haven't f...      62   
299  if two automatic weapons ( uzi ) were pointed ...      47   
387  i am trying to design a professional poster an...      45   
588  if local detectives show up at your door and s...      42   

                Topic_Label  
1    Gun Safety Regulations  
26   Gun Safety Regulations  
299  Gun Safety Regul

In [None]:
import re
def chunk_text(text, max_length=1024):
    sentences = text.split('. ')
    chunks = []
    current_chunk = ""
    
    for sentence in sentences:
        sentence=re.sub(r'[^\w\s]','',sentence)
        if len(current_chunk) + len(sentence) > max_length:
            chunks.append(current_chunk)
            current_chunk = sentence
        else:
            current_chunk += " " + sentence
            
    if current_chunk:
        chunks.append(current_chunk)
        
    return chunks

In [57]:
with open('topic_summary.txt', 'w',encoding="utf-8") as f:
    f.write('Topic Summary\n\n')
    for key,value in topic_id_name_map.items():
        f.write(f'Topic: {value}\n')
        topic_df = grouped.get_group(key).drop(columns='Topic')
        f.write(f'Total number of questions: {topic_df.shape[0]}\n')
        documents ="." .join(topic_df['Question'].values)
        chunks = chunk_text(documents, max_length=4096)
        print("Number of chunks:", len(chunks))
        summaries = []
        for idx,chunk in enumerate(chunks):
            print("key=",key,"value=",value,"idx=",idx)
            print("chunk:", len(chunk))
            summary=summarizer(chunk, max_new_tokens=200, min_length=25, do_sample=False)[0]['summary_text']
            summaries.append(summary)
            print("Summary:", len(summary))

        full_summary = " ".join(summaries)
        print("Summary:", full_summary)
        f.write("Summary: " + "\n ".join(full_summary) + '\n\n')



Number of chunks: 488
key= -1 value= General idx= 0
chunk: 3987
Summary: 485
key= -1 value= General idx= 1
chunk: 4064
Summary: 498
key= -1 value= General idx= 2
chunk: 3897
Summary: 404
key= -1 value= General idx= 3
chunk: 4093
Summary: 424
key= -1 value= General idx= 4
chunk: 4016
Summary: 343
key= -1 value= General idx= 5
chunk: 3959
Summary: 665
key= -1 value= General idx= 6
chunk: 3522
Summary: 301
key= -1 value= General idx= 7
chunk: 3922
Summary: 314
key= -1 value= General idx= 8
chunk: 4084
Summary: 865
key= -1 value= General idx= 9
chunk: 4014
Summary: 691
key= -1 value= General idx= 10
chunk: 3922
Summary: 479
key= -1 value= General idx= 11
chunk: 3982
Summary: 630
key= -1 value= General idx= 12
chunk: 4077
Summary: 517
key= -1 value= General idx= 13
chunk: 4045
Summary: 803
key= -1 value= General idx= 14
chunk: 4081
Summary: 461
key= -1 value= General idx= 15
chunk: 4047
Summary: 575
key= -1 value= General idx= 16
chunk: 3586
Summary: 461
key= -1 value= General idx= 17
chunk

KeyboardInterrupt: 

In [None]:



print("Full Summary:", full_summary)

Number of chunks: 499
idx= 0
chunk: 3987
idx= 1
chunk: 3997
idx= 2
chunk: 3964
idx= 3
chunk: 3917


KeyboardInterrupt: 

In [41]:
chunk=chunks[57]
chunk=re.sub(r'[^\w\s]','',chunk)
summary=summarizer(chunk, max_new_tokens=200, min_length=25, do_sample=False)[0]['summary_text']
