In [27]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from langchain.document_loaders import UnstructuredURLLoader
from collections import defaultdict
import os
import re

In [28]:
os.environ["HUGGINGFACEHUB_API_TOKEN"] = ""

In [29]:
classifier = pipeline("text-classification", model="dima806/news-category-classifier-distilbert")

tags=['POLITICS', 'WORLD NEWS', 'WORLDPOST']

In [30]:
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer,grouped_entities=True)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [31]:
similarity = SentenceTransformer('davanstrien/headline-similarity')

In [32]:
summarizer = pipeline("summarization", model="JordiAb/BART_news_summarizer")

In [33]:
text=["BJPs Suvendu Adhikari claimed Shajahan Sheikh managed to negotiate a deal with the Mamata Banerjee Police. West Bengal BJP leader Suvendu Adhikari on Wednesday claimed that the absconding Trinamool Congress (TMC) strongman Shajahan Sheikh, the main accused of the Sandeshkhali violence over land grab and sexual harassment, is in safe custody of the state police since Tuesday night. Suvendu said the TMC strongman managed to negotiate a deal with chief minister Mamata Banerjee's police and has been extended five-star facilities during his time in custody. The Scoundrel of Sandeshkhali - Seikh Shahjahan is in the safe custody of Mamata Police since 12 am last night. He was taken away from the Bermajur - II Gram Panchayat area, after he managed to negotiate a deal with the Mamata Police, through influential mediators, that he would be taken care of properly while in Police and Judicial Custody. He will be extended 5-star facilities during his time behind the bars and will have access to a mobile phone, through which he will be able to lead the Tolamool Party virtually. Even a bed in the Woodburn Ward will be kept ready and vacant for him if he chooses to spend some time there, Adhikari said in a post on X (formerly Twitter). TMC leader Santanu Sen dubbed Adhikari's allegations as baseless. He said that Adhikari is making such brazen attempts to disturb the law and order situation in West Bengal.","Controversial Trinamool Congress leader Sheikh Shahjahan was arrested by the West Bengal Police on February 28 night over an attack on officials of the Enforcement Directorate last month. The Trinamool leader, against whom multiple allegations of land grab and sexual assault have been levelled by villagers at Sandeshkhali, was sent to 10 days in police custody by a court in West Bengals North 24 Parganas district on February 29. Hours after the arrest, the Trinamool suspended the party strongman from North 24 Parganas for six years. Sandeshkhali has been on the boil since the first week of February after villagers staged violent protests seeking action against Mr. Shahjahan and other local Trinamool leaders. While Trinamool leaders Shiboprasad Hazra and Uttam Sardar, who were facing sexual assault charges, were arrested by the police, Mr. Shahajahan had evaded security agencies till now.","Speaking in the Delhi Assembly on the issue of removal of civil defence volunteers deployed as bus marshals in Delhi Transport Corporation (DTC) and cluster buses, Kejriwal said the scheme ran smoothly from 2015 to 2022. We formed the government in 2015 with a promise to ensure women's safety. We installed CCTV cameras in five years, street lights were installed at dark spots and buses were equipped with CCTV cameras, panic buttons and bus marshals were deployed. There are several instances of good work by bus marshals, the chief minister said. The bus marshal scheme worked well for eight years but in 2023, officers started raising objections saying civil defence volunteers could not work as marshals, he said. The Lt Governor threatened officers to stall the bus marshal scheme. The L-G questioned the deployment of marshals saying there were CCTV cameras and panic buttons Kejriwal charged. He also accused the BJP of shedding crocodile tears over the issue and said that he was ready to sign any paper for the reinstatement of bus marshals.Kejriwal also spoke about Saxena's open letter to him on the one-time settlement scheme for allegedly inflated water bills.The language of the L-G's letter was filthy, he said, adding such language should not be used."]

In [34]:
for i in text:
    if classifier(i)[0]['label'] not in tags:
        text.remove(i)

In [35]:
imp_words=[]
imp_sentences=[]

for i in text:
    for j in nlp(i):
        if(j['word'][0]=='#'):
            j['word'] = re.sub('^#+', '', j['word'])
            temp=imp_words[-1]
            del imp_words[-1]
            temp+=j['word']
            if temp not in imp_words:
                imp_words.append(temp)
            continue
            
        if j['word'] not in imp_words:
            imp_words.append(j['word'])

    sentence = ' '.join(imp_words)
    imp_sentences.append(sentence)
    imp_words=[]
    

In [36]:
print(imp_sentences)

['BJP Suvendu Adhikari Shajahan Sheikh Mamata Banerjee Police West Bengal Trinamool Congress TMC Sandeshkhali Suvendu Mamata Banerjee Seikh Shahjah Mamata Police Bermajur II Gram Panchayat Police and Judicial Custody Tolamool Party Woodburn Ward Adhikari X Twitter Santanu Sen', 'Trinamool Congress Shahjah West Bengal Police Enforcement Directorate Trinamool Sandeshkhali West Bengals North 24 Parganas North 24 Parganas Shiboprasad Hazra Uttam Sarda Shahaja', 'Delhi Assembly Delhi Transport Corporation DTC Keriwal L - G BJP Kejriwal Saxen']


In [37]:
embeddings=similarity.encode(imp_sentences)

similar_sentences = defaultdict(list)
threshold = 0.6

# Find similar sentences
for i in range(len(embeddings)):
    for j in range(i + 1, len(embeddings)):
        similarity_score = cosine_similarity([embeddings[i]], [embeddings[j]])[0][0]
        if similarity_score > threshold:
            similar_sentences[i].append(j)
            similar_sentences[j].append(i)

# Find connected components
visited = set()
groups = []

def dfs(node, group):
    visited.add(node)
    group.append(node)
    for neighbor in similar_sentences[node]:
        if neighbor not in visited:
            dfs(neighbor, group)

for i in range(len(embeddings)):
    if i not in visited:
        group = []
        dfs(i, group)
        groups.append(group)

# Print the merged groups
print(groups)

for group in groups:
    if len(group) > 1:
        first_element_index = group[0]
        first_element = text[first_element_index]
        for element_index in group[1:]:
            # Merge first element with each other element
            # For demonstration, we'll just concatenate them
            merged_element = first_element + text[element_index]
            # Replace the original element with the merged one
            text[first_element_index] = merged_element
        # Remove the non-merged elements
        text[group[0] + 1:group[-1] + 1] = []
            

[[0, 1], [2]]


In [40]:
for i in text:
    print(summarizer(i)[0]['summary_text'])
    print('\n\n\n')

West Bengal BJP leader Suvendu Adhikari claimed that Trinamool Congress (TMC) strongman Shajahan Sheikh, the main accused of the Sandeshkhali violence over land grab and sexual harassment, is in safe custody of the state police since Tuesday night. Sheikh managed to negotiate a deal with chief minister Mamata Banerjee's police and has been extended five-star facilities during his time in custody.




Speaking in the Delhi Assembly, Kejriwal discussed the removal of civil defence volunteers deployed as bus marshals in Delhi Transport Corporation (DTC) and cluster buses. He stated that the scheme ran smoothly from 2015 to 2022 and that the government formed in 2015 with a promise to ensure women's safety. The bus marshal scheme worked well for eight years, but in 2023, officers started raising objections, claiming civil defense volunteers could not work as marshals.




