In [1]:
from pprint import pprint
from datasets import load_dataset
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
from torch.utils.data import DataLoader
import nltk
import os
import re
import math
import operator
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize,word_tokenize
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/rei/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/rei/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/rei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/rei/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
hupd_dict = load_dataset('HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-25',
    val_filing_start_date='2016-01-26',
    val_filing_end_date='2016-01-31',
)

In [3]:
Stopwords = set(stopwords.words('english'))
wordlemmatizer = WordNetLemmatizer()

In [6]:
dirty_claim = hupd_dict['train'][4]['claims']

In [7]:
dirty_claim

'1. A crystal growth furnace for growing a crystalline ingot comprising: a crucible containing at least feedstock material; and a liquid-cooled heat exchanger that is vertically movable beneath the crucible comprising: a heat extraction bulb made of a material having a thermal conductivity value of greater than about 200 W/(m·k); and a liquid coolant inlet tube and a liquid coolant outlet tube, wherein the liquid coolant inlet tube, the liquid coolant outlet tube, or both are attached to the heat extraction bulb for circulating a liquid coolant therethrough. 2. The crystal growth furnace of claim 1, wherein the liquid coolant comprises water. 3. The crystal growth furnace of claim 1, wherein the material has a melting point less than a melting point of the feedstock. 4. The crystal growth furnace of claim 3, wherein the melting point of the feedstock and the melting point of the material differ by not more than about 455° C. 5. The crystal growth furnace of claim 3, wherein the melting

## Sentence Tokenization V2

In [8]:
import re

def process_claims_v10(dirty_claim):
    # Split the claims on each claim number or range of claim numbers
    split_claims = re.split(r'(?<=\.\s)(\d+-*\d*\.\s)', dirty_claim)

    combined_sentences = []
    current_claim = ""

    for part in split_claims:
        if re.match(r'\d+-*\d*\.\s', part):
            # If current_claim is not empty, add it to combined_sentences
            if current_claim:
                combined_sentences.append(current_claim.strip())
            current_claim = part
        else:
            current_claim += part

    # Add the last claim
    if current_claim:
        combined_sentences.append(current_claim.strip())

    # Handle ranges of canceled claims and keep the subsequent claims
    processed_claims = []
    for claim in combined_sentences:
        # Remove the '(canceled)' phrase
        claim = re.sub(r'\(canceled\)', '', claim)

        # Remove claim numbers
        claim = re.sub(r'\b\d+-*\d*\.\s*', '', claim)

        processed_claims.append(claim)

    return processed_claims


combined_sentences_v10 = process_claims_v10(dirty_claim)

for index, sentence in enumerate(combined_sentences_v10):
    print(f"Claim {index+1}: {sentence}")


Claim 1: A crystal growth furnace for growing a crystalline ingot comprising: a crucible containing at least feedstock material; and a liquid-cooled heat exchanger that is vertically movable beneath the crucible comprising: a heat extraction bulb made of a material having a thermal conductivity value of greater than about 200 W/(m·k); and a liquid coolant inlet tube and a liquid coolant outlet tube, wherein the liquid coolant inlet tube, the liquid coolant outlet tube, or both are attached to the heat extraction bulb for circulating a liquid coolant therethrough.
Claim 2: The crystal growth furnace of claim 1, wherein the liquid coolant comprises water.
Claim 3: The crystal growth furnace of claim 1, wherein the material has a melting point less than a melting point of the feedstock.
Claim 4: The crystal growth furnace of claim 3, wherein the melting point of the feedstock and the melting point of the material differ by not more than about 455° C.
Claim 5: The crystal growth furnace of

---

## TFIDF Summarization

### <strong> [TFIDF] readymade extractive summarization

In [9]:
def lemmatize_words(words):
    lemmatized_words = []
    for word in words:
       lemmatized_words.append(wordlemmatizer.lemmatize(word))
    return lemmatized_words
def stem_words(words):
    stemmed_words = []
    for word in words:
       stemmed_words.append(stemmer.stem(word))
    return stemmed_words
def remove_special_characters(text):
    regex = r'[^a-zA-Z0-9\s]'
    text = re.sub(regex,'',text)
    return text
def freq(words):
    words = [word.lower() for word in words]
    dict_freq = {}
    words_unique = []
    for word in words:
       if word not in words_unique:
           words_unique.append(word)
    for word in words_unique:
       dict_freq[word] = words.count(word)
    return dict_freq
def pos_tagging(text):
    pos_tag = nltk.pos_tag(text.split())
    pos_tagged_noun_verb = []
    for word,tag in pos_tag:
        if tag == "NN" or tag == "NNP" or tag == "NNS" or tag == "VB" or tag == "VBD" or tag == "VBG" or tag == "VBN" or tag == "VBP" or tag == "VBZ":
             pos_tagged_noun_verb.append(word)
    return pos_tagged_noun_verb
def tf_score(word,sentence):
    freq_sum = 0
    word_frequency_in_sentence = 0
    len_sentence = len(sentence)
    for word_in_sentence in sentence.split():
        if word == word_in_sentence:
            word_frequency_in_sentence = word_frequency_in_sentence + 1
    tf =  word_frequency_in_sentence/ len_sentence
    return tf
def idf_score(no_of_sentences,word,sentences):
    no_of_sentence_containing_word = 0
    for sentence in sentences:
        sentence = remove_special_characters(str(sentence))
        sentence = re.sub(r'\d+', '', sentence)
        sentence = sentence.split()
        sentence = [word for word in sentence if word.lower() not in Stopwords and len(word)>1]
        sentence = [word.lower() for word in sentence]
        sentence = [wordlemmatizer.lemmatize(word) for word in sentence]
        if word in sentence:
            no_of_sentence_containing_word = no_of_sentence_containing_word + 1
    idf = math.log10(no_of_sentences/no_of_sentence_containing_word)
    return idf
def tf_idf_score(tf,idf):
    return tf*idf
def word_tfidf(dict_freq,word,sentences,sentence):
    word_tfidf = []
    tf = tf_score(word,sentence)
    idf = idf_score(len(sentences),word,sentences)
    tf_idf = tf_idf_score(tf,idf)
    return tf_idf
def sentence_importance(sentence,dict_freq,sentences):
     sentence_score = 0
     sentence = remove_special_characters(str(sentence)) 
     sentence = re.sub(r'\d+', '', sentence)
     pos_tagged_sentence = [] 
     no_of_sentences = len(sentences)
     pos_tagged_sentence = pos_tagging(sentence)
     for word in pos_tagged_sentence:
          if word.lower() not in Stopwords and word not in Stopwords and len(word)>1: 
                word = word.lower()
                word = wordlemmatizer.lemmatize(word)
                sentence_score = sentence_score + word_tfidf(dict_freq,word,sentences,sentence)
     return sentence_score

In [10]:
# tokenized_sentence = sent_tokenize(dirty_claim)
text = remove_special_characters(str(dirty_claim))
text = re.sub(r'\d+', '', dirty_claim)
tokenized_words_with_stopwords = word_tokenize(dirty_claim)
tokenized_words = [word for word in tokenized_words_with_stopwords if word not in Stopwords]
tokenized_words = [word for word in tokenized_words if len(word) > 1]
tokenized_words = [word.lower() for word in tokenized_words]
tokenized_words = lemmatize_words(tokenized_words)
word_freq = freq(tokenized_words)
input_user = int(input('Percentage of information to retain(in percent):'))
no_of_sentences = int((input_user * len(combined_sentences_v10))/100)
print(no_of_sentences)
c = 1
sentence_with_importance = {}
for sent in combined_sentences_v10:
    sentenceimp = sentence_importance(sent,word_freq,combined_sentences_v10)
    sentence_with_importance[c] = sentenceimp
    c = c+1
sentence_with_importance = sorted(sentence_with_importance.items(), key=operator.itemgetter(1),reverse=True)
cnt = 0
summary = []
sentence_no = []
for word_prob in sentence_with_importance:
    if cnt < no_of_sentences:
        sentence_no.append(word_prob[0])
        cnt = cnt+1
    else:
      break
sentence_no.sort()
cnt = 1
for sentence in combined_sentences_v10:
    if cnt in sentence_no:
       summary.append(sentence)
    cnt = cnt+1
readymade_summary = " ".join(summary)
print("\n")
print("Summary:")
print(readymade_summary)
# outF = open('summary.txt',"w")
# outF.write(summary)

11


Summary:
A crystal growth furnace for growing a crystalline ingot comprising: a crucible containing at least feedstock material; and a liquid-cooled heat exchanger that is vertically movable beneath the crucible comprising: a heat extraction bulb made of a material having a thermal conductivity value of greater than about 200 W/(m·k); and a liquid coolant inlet tube and a liquid coolant outlet tube, wherein the liquid coolant inlet tube, the liquid coolant outlet tube, or both are attached to the heat extraction bulb for circulating a liquid coolant therethrough. The crystal growth furnace of claim 3, wherein the melting point of the feedstock and the melting point of the material differ by not more than about 455° C. The crystal growth furnace of claim 1, wherein the heat extraction bulb has an internal cavity to receive the liquid coolant from the liquid coolant inlet tube and discharge the liquid coolant through the liquid coolant outlet tube. The crystal growth furnace of clai

---

### <strong> [TF-IDF] Maths only calculation

reference from : https://towardsdatascience.com/text-summarization-using-tf-idf-e64a0644ace3

Background

TF-IDF is made up of two algorithms:
- Term Frequency : how common a word is
    - calculation: </br>
    <code> TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document) </code>
- Inverse Document Frequency : how unique a word is
    - calculation: </br>
        <code> IDF(t) = log_e(Total number of documents / Number of documents with term t in it) </code>


### TF Calculation

<strong>Frequency Matrix Calcuation </strong> </br>
- An empty dictionary called <code> frequency_matrix</code> is created to store the frequency of the words
- <strong> Stopwords & PorterStemmer  </strong> is used to remove the stopwords and stemming the words.
- each sentence is being tokenized into word level and count the tokens and put in the freq_table to store the frequencies of the word.
- Each words are converted into lowercase and then stemmed. stopwords are being ignored in this step. For each word, this block checks if it's already in the freq_table.
- If it is, the frequency count is incremented.
- If not, the word is added to the table with a frequency count of 1.
- The frequency table for each sentence is added to the frequency_matrix with a key. The key is the length of the sentence for now

In [142]:
from nltk.stem import PorterStemmer

def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix[sent[:200]] = freq_table

    return frequency_matrix

freq_matrix = _create_frequency_matrix(combined_sentences_v10)
freq_matrix

{'A heat shield for shielding hot areas of a combustion engine comprising at least one metal sheet layer with a first and a second surface, with the at least one metal sheet layer comprising at least on': {'heat': 1,
  'shield': 2,
  'hot': 1,
  'area': 8,
  'combust': 1,
  'engin': 1,
  'compris': 3,
  'least': 5,
  'one': 3,
  'metal': 2,
  'sheet': 2,
  'layer': 2,
  'first': 7,
  'second': 3,
  'surfac': 3,
  ',': 13,
  'passag': 11,
  'open': 10,
  'fasten': 1,
  'element': 4,
  'sleev': 5,
  'pass': 1,
  'wherein': 1,
  'decoupl': 3,
  'flexibl': 1,
  'materi': 1,
  'arrang': 1,
  'circumferenti': 3,
  'edg': 3,
  ';': 1,
  ')': 3,
  'annular': 1,
  'shank': 4,
  'penetr': 1,
  'b': 1,
  'collar': 6,
  'extend': 4,
  'radial': 5,
  'outward': 2,
  'rel': 3,
  'c': 1,
  'adjac': 1,
  'region': 3,
  'show': 2,
  'constant': 1,
  'wall': 2,
  'thick': 2,
  'transit': 2,
  'section': 2,
  'outwardli': 1,
  'increas': 1,
  'round': 1,
  'inclin': 1,
  '.': 1},
 'The heat shield accord

<strong> TF Matrix Calcuation </strong>
- <code> tf_matrix </code> dictionary is created to store the term frequency for each word in each sentence.
- loop over the freq_matrix. sent will be the sentence, and f_table is the corresponding frequency table for that sentence.
- For each sentence, a new empty dictionary <code> tf_table </code> is created. This will store the term frequencies for words in this particular sentence.
- The function then iterates over each word and its count in the frequency table. It calculates the term frequency by dividing the count of each word by the total word count in the sentence, and stores this value in <code> tf_table. </code>
- The term frequency table <code> tf_table </code> for each sentence is then stored in the  <code> tf_matrix </code> with the sentence as the key.

In [122]:
def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix

In [123]:
tf_matrix = _create_tf_matrix(freq_matrix)

---

### IDF Calculation

we calculate, “how many sentences contain a word”, Let’s call it Documents per words matrix.

In [124]:
def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table

count_doc_per_words = _create_documents_per_words(freq_matrix)

In [125]:
count_doc_per_words

{'heat': 17,
 'shield': 17,
 'hot': 3,
 'area': 10,
 'combust': 3,
 'engin': 3,
 'compris': 7,
 'least': 8,
 'one': 4,
 'metal': 10,
 'sheet': 9,
 'layer': 9,
 'first': 9,
 'second': 5,
 'surfac': 4,
 ',': 17,
 'passag': 11,
 'open': 11,
 'fasten': 3,
 'element': 11,
 'sleev': 11,
 'pass': 2,
 'wherein': 17,
 'decoupl': 11,
 'flexibl': 3,
 'materi': 5,
 'arrang': 2,
 'circumferenti': 4,
 'edg': 4,
 ';': 2,
 ')': 6,
 'annular': 3,
 'shank': 7,
 'penetr': 2,
 'b': 3,
 'collar': 11,
 'extend': 6,
 'radial': 7,
 'outward': 1,
 'rel': 3,
 'c': 3,
 'adjac': 2,
 'region': 3,
 'show': 6,
 'constant': 2,
 'wall': 5,
 'thick': 6,
 'transit': 6,
 'section': 6,
 'outwardli': 2,
 'increas': 3,
 'round': 3,
 'inclin': 3,
 '.': 17,
 'accord': 14,
 'claim': 14,
 '18': 13,
 'said': 2,
 'compar': 1,
 'respect': 1,
 'locat': 1,
 'circular': 2,
 'segment': 1,
 '90': 2,
 '%': 4,
 'extens': 2,
 'distanc': 1,
 'larger': 4,
 'curvatur': 2,
 'radiu': 2,
 'inner': 3,
 'diamet': 2,
 '(': 3,
 'di12': 1,
 'di13': 

The result means: 1 appear in 9 sentence and antioxid appears in 12 sentences etc etc...

after that, calcuate idf according to the formula <code> IDF(t) = log_e(Total number of documents / Number of documents with term t in it) </code> which is the number from the above result.

In [126]:
total_docs = len(combined_sentences_v10)
def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix
idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words,total_docs)

In [127]:
idf_matrix

{'A heat shield for shielding hot areas of a combustion engine comprising at least one metal sheet layer with a first and a second surface, with the at least one metal sheet layer comprising at least on': {'heat': 0.0,
  'shield': 0.0,
  'hot': 0.7533276666586115,
  'area': 0.2304489213782739,
  'combust': 0.7533276666586115,
  'engin': 0.7533276666586115,
  'compris': 0.3853508813640171,
  'least': 0.32735893438633035,
  'one': 0.6283889300503115,
  'metal': 0.2304489213782739,
  'sheet': 0.27620641193894907,
  'layer': 0.27620641193894907,
  'first': 0.27620641193894907,
  'second': 0.5314789170422551,
  'surfac': 0.6283889300503115,
  ',': 0.0,
  'passag': 0.18905623622004888,
  'open': 0.18905623622004888,
  'fasten': 0.7533276666586115,
  'element': 0.18905623622004888,
  'sleev': 0.18905623622004888,
  'pass': 0.9294189257142927,
  'wherein': 0.0,
  'decoupl': 0.18905623622004888,
  'flexibl': 0.7533276666586115,
  'materi': 0.5314789170422551,
  'arrang': 0.9294189257142927,
  '

### Calcualte TFIDF

In [128]:
def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):
        tf_idf_table = {}
        for (word1, value1), (word2, value2) in zip(f_table1.items(), f_table2.items()):
            # Reduce the weight for numerals
            if word1.isdigit():
                tf_idf_table[word1] = 0
            else:
                tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix


tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)

In [129]:
tf_idf_matrix

{'A heat shield for shielding hot areas of a combustion engine comprising at least one metal sheet layer with a first and a second surface, with the at least one metal sheet layer comprising at least on': {'heat': 0.0,
  'shield': 0.0,
  'hot': 0.013950512345529843,
  'area': 0.034140580944929465,
  'combust': 0.013950512345529843,
  'engin': 0.013950512345529843,
  'compris': 0.021408382298000947,
  'least': 0.030311012443178735,
  'one': 0.03491049611390619,
  'metal': 0.008535145236232366,
  'sheet': 0.010229867108849966,
  'layer': 0.010229867108849966,
  'first': 0.035804534880974875,
  'second': 0.029526606502347508,
  'surfac': 0.03491049611390619,
  ',': 0.0,
  'passag': 0.038511455526306256,
  'open': 0.03501041411482386,
  'fasten': 0.013950512345529843,
  'element': 0.014004165645929547,
  'sleev': 0.01750520705741193,
  'pass': 0.017211461587301715,
  'wherein': 0.0,
  'decoupl': 0.01050312423444716,
  'flexibl': 0.013950512345529843,
  'materi': 0.009842202167449169,
  'ar

### Score the sentence

In [133]:
def _score_sentences(tf_idf_matrix) -> dict:
    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0
        count_words_in_sentence = len(f_table)

        for word, score in f_table.items():
            # Reduce the score for repetitive phrases
            if 'claim' in word:  # Example criterion, adjust as needed
                score *= 0.5  # Reduce the weight of these words

            total_score_per_sentence += score

        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence

    return sentenceValue


sentenceValue= _score_sentences(tf_idf_matrix)

In [134]:
sentenceValue

{'A heat shield for shielding hot areas of a combustion engine comprising at least one metal sheet layer with a first and a second surface, with the at least one metal sheet layer comprising at least on': 0.019980920641194323,
 'The heat shield according to claim 18, wherein at least one of said sections of said transition area shows an increased radial wall thickness, each of these transition areas shows an increased radial ': 0.02655777681294699,
 'The heat shield according to claim 18, wherein the transition area between the shank area and the first collar of the sleeve on at least 90% of its circumferential extension shows a distance to the me': 0.01199059883084724,
 'The heat shield according to claim 18, wherein the transition area between the shank area and the first collar of the sleeve on at least 90% of its radial extension shows a radial curvature radius whi': 0.013816095520184346,
 'The heat shield according to claim 18, wherein the decoupling element in the area of its sec

In [135]:
def _find_average_score(sentenceValue) -> int:
    """
    Find the average score from the sentence value dictionary
    :rtype: int
    """
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original summary_text
    average = (sumValues / len(sentenceValue))

    return average
threshold = _find_average_score(sentenceValue)

In [136]:
threshold

0.020853000947830408

In [137]:
def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:len(sentence)] in sentenceValue and sentenceValue[sentence[:len(sentence)]] >= (threshold):
            # Additional check to avoid sentences with only numerals or very short content
            if len(sentence.split()) > 5 and not sentence.strip().isdigit():
                summary += " " + sentence
                sentence_count += 1

    return summary


tfidf_summary = _generate_summary(combined_sentences_v10, sentenceValue, threshold)

### Generate the summary

In [138]:
tfidf_summary

' The heat shield according to claim 18, wherein the decoupling element is composed of metal wire mesh or silicone. The heat shield according to claim 18, wherein the first collar of the decoupling element comprises compressed sections, where different compressed sections may have a different degree of compression. The heat shield according to claim 18, wherein the wall of the sleeve shows no recesses on its outer or inner circumference.'

---

### abstractive with chargpt

In [14]:
import openai
# openai.api_key = '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' #Rei's
openai.api_key = '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' #raja's

In [15]:
text_to_summarize = '\n'.join(combined_sentences_v10)
request_text = f"Summarize the following text:\n{text_to_summarize}"

In [16]:
response = openai.Completion.create(
  model="text-davinci-003",
  prompt=request_text,
  temperature=0.7,
  max_tokens=150
)

# Extract the summary from the response
summary = response.choices[0].text.strip()
print("Summary:", summary)

RateLimitError: You exceeded your current quota, please check your plan and billing details.