In [None]:
!pip install sentence_transformers
!pip install spacy
!pip install sklearn
!pip install Bert-extractive-summarizer



In [None]:
import json
import spacy
from sentence_transformers import SentenceTransformer

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from summarizer import Summarizer

# Sentence-embedding model used to embed the document and keyword candidates.
# Alternative checkpoint: distilbert-base-nli-stsb-mean-tokens
model = SentenceTransformer("distilbert-base-nli-mean-tokens")
# spaCy English pipeline used for part-of-speech tags and noun chunks.
nlp = spacy.load("en_core_web_sm")


def preprocessing(raw_text, min_length=20):
    """Split '@'-delimited raw text into cleaned fragments.

    Every run of whitespace inside a fragment (including the stray newlines
    and repeated spaces found in the source text) is collapsed to a single
    space, and leading/trailing whitespace is removed. Blank fragments and
    fragments whose cleaned length does not exceed ``min_length`` are dropped.

    Args:
        raw_text: String whose fragments are separated by '@'.
        min_length: A fragment is kept only if its cleaned length is strictly
            greater than this value.

    Returns:
        List of cleaned fragments, in their original order.
    """
    # str.split() with no argument splits on any whitespace run, so a single
    # split/join both removes newlines and trims the fragment.
    fragments = (" ".join(piece.split()) for piece in raw_text.split('@'))
    # Measure the length AFTER normalisation: checking it beforehand let
    # fragments that were mostly padding whitespace slip past the filter.
    return [fragment for fragment in fragments
            if fragment and len(fragment) > min_length]


def word_filter(word, sentence):
    """Filter a candidate keyword against a reference sentence.

    Args:
        word: Candidate keyword; only its first spaCy token is inspected.
        sentence: Text whose noun chunks are searched for the candidate.

    Returns:
        0 if the candidate's first token is tagged as a verb; otherwise the
        first noun chunk (a spaCy span) of ``sentence`` that contains the
        token as one of its whitespace-separated words; otherwise ``word``
        unchanged.
    """
    parsed_sentence = nlp(sentence)
    first_token = nlp(word)[0]

    # Verbs are rejected outright; the falsy 0 lets callers skip them.
    if first_token.pos_ == "VERB":
        return 0

    # First noun chunk mentioning the token, or the bare word if none does.
    containing_chunks = (
        noun_chunk
        for noun_chunk in parsed_sentence.noun_chunks
        if first_token.text in noun_chunk.text.split()
    )
    return next(containing_chunks, word)


def get_candidates(text, n_gram_range=(1, 1)):
    """Extract candidate keywords/phrases from a single document.

    A ``CountVectorizer`` with English stop words is fitted on ``text`` and
    its vocabulary is returned.

    Args:
        text: The document to extract candidates from.
        n_gram_range: ``(min_n, max_n)`` n-gram sizes passed to the vectorizer.

    Returns:
        List of the n-grams found in ``text``.
    """
    count_vectorizer = CountVectorizer(ngram_range=n_gram_range, stop_words="english")
    count_vectorizer.fit([text])
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2. Prefer the replacement, converting its ndarray to a plain list so
    # the return type is unchanged; fall back for older installations.
    if hasattr(count_vectorizer, "get_feature_names_out"):
        return count_vectorizer.get_feature_names_out().tolist()
    return count_vectorizer.get_feature_names()


def get_keywords(raw_text, num_keywords=5):
    """Pick the candidates most similar to the document as keywords.

    The text is cleaned with ``preprocessing``, candidates come from
    ``get_candidates``, and both the document and the candidates are embedded
    with the module-level ``model``. Candidates are visited in order of
    decreasing cosine similarity to the document and passed through
    ``word_filter`` together with the part of the text before its first '.'.

    Args:
        raw_text: '@'-delimited raw text.
        num_keywords: Stop once this many keywords have been collected.

    Returns:
        List of accepted keywords in ranking order. Entries are whatever
        ``word_filter`` returned: either the candidate string or a noun chunk.
        Empty if preprocessing leaves no text.
    """
    text = " ".join(preprocessing(raw_text))
    if not text:
        # Nothing survived preprocessing; the vectorizer would raise on an
        # empty vocabulary, so report "no keywords" instead.
        return []

    # Fit the vectorizer once and reuse the result for embedding and lookup.
    candidates = get_candidates(text)
    doc_embedding = model.encode([text])
    candidate_embeddings = model.encode(candidates)
    distances = cosine_similarity(doc_embedding, candidate_embeddings)

    # Loop-invariant: every candidate is checked against the same leading sentence.
    first_sentence = text.split('.')[0]

    keywords = []
    # argsort is ascending, so walk it backwards to visit the best match first.
    for index in reversed(distances.argsort()[0]):
        filtered_word = word_filter(candidates[index], first_sentence)
        if filtered_word:
            keywords.append(filtered_word)
        if len(keywords) >= num_keywords:
            break

    return keywords

# Lazily created, shared Summarizer instance (see _get_summarizer).
_summarizer = None


def _get_summarizer():
    """Return the shared Summarizer, constructing it on first use."""
    global _summarizer
    if _summarizer is None:
        _summarizer = Summarizer()
    return _summarizer


def get_summary(raw_text, ratio=0.3):
    """Produce an extractive summary of '@'-delimited raw text.

    Args:
        raw_text: '@'-delimited raw text; it is cleaned with ``preprocessing``
            and the fragments are joined with single spaces.
        ratio: Passed to the summarizer as its second positional argument.

    Returns:
        Whatever the summarizer returns for the cleaned text.
    """
    result = " ".join(preprocessing(raw_text))
    # Reuse one Summarizer across calls instead of building a new one per
    # document; this also avoids shadowing the module-level `model`.
    return _get_summarizer()(result, ratio)

# Load the records to process. JSON is read explicitly as UTF-8 so the result
# does not depend on the platform's default encoding.
with open("data.json", encoding="utf-8") as json_file:
  datas = json.load(json_file)

In [None]:
# Enrich every record in place: replace its 'text' entry with a 'title',
# a list of 'keywords' (plain strings) and a 'summary'.
for index, data in enumerate(datas):
  # Records handled by an earlier run of this cell no longer have 'text';
  # skip them so the cell can be re-run (or resumed after a failure).
  if 'text' not in data:
    continue

  # Everything before the first "@" is the title, the rest is the body.
  title, raw_text = data['text'].split("@", 1)

  # Do the failure-prone model work BEFORE touching the record, so an
  # exception here leaves the original 'text' intact.
  keywords = get_keywords(raw_text)
  # get_keywords may return plain strings or noun chunks; normalise to
  # strings, listing the plain strings first and the chunk texts after them.
  strs = [word for word in keywords if isinstance(word, str)]
  strs_second = [word.text for word in keywords if not isinstance(word, str)]
  summary = get_summary(raw_text)

  data['title'] = title
  data['keywords'] = strs + strs_second
  data['summary'] = summary
  del data['text']

  print(data)
  print(index)

{'link': 'https://www.db.com/society', 'ramuri': ['https://www.db.com/cr/de/gesellschaft/index.htm', 'https://www.db.com/what-we-do'], 'title': 'Society  Deutsche Bank Responsibility', 'keywords': ['startups', '2021', 'global', '2019', 'download'], 'summary': 'Tangible impact:CSR COVID-19Community relief 1.6m people through our corporate social responsibility programmes in 2019. 5,735 social enterprises with advice and support to help address issues in society via our Made for Good programme in 2019. We created Born to Be to help young people reach their full potential. We focus our efforts on the most disadvantaged through projects that deliver basic welfare, improve essential infrastructure and provide relief in emergencies. On this page we offer you CSR-related reports, brochures and general information for download. Last Update: January 15, 2021 Copyright 2021 Deutsche Bank AG, Frankfurt am Main'}
36
{'link': 'https://www.db.com/what-we-do/focus-topics', 'ramuri': ['https://www.db.