In [105]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import json
import nltk
from nltk.stem.porter import PorterStemmer
nltk.download('wordnet')
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [144]:
# Mounting the drive to fetch data
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [106]:
# Loading some amount of data 
lst = []
c=0
for row in open("/content/drive/MyDrive/arxiv-metadata-oai-snapshot.json",'r'):
  c+=1
  data = json.loads(row)
  lst.append(data)
  if c==5000:
    break

In [110]:
# Converting the input data to a Dataframe
df = pd.DataFrame(lst)
paper=df
paper.head()

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
0,704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,ANL-HEP-PR-07-12,hep-ph,,A fully differential calculation in perturba...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2008-11-26,"[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,..."
1,704.0002,Louis Theran,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,,math.CO cs.CG,http://arxiv.org/licenses/nonexclusive-distrib...,"We describe a new algorithm, the $(k,\ell)$-...","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2008-12-13,"[[Streinu, Ileana, ], [Theran, Louis, ]]"
2,704.0003,Hongjun Pan,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,,physics.gen-ph,,The evolution of Earth-Moon system is descri...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2008-01-13,"[[Pan, Hongjun, ]]"
3,704.0004,David Callan,David Callan,A determinant of Stirling cycle numbers counts...,11 pages,,,,math.CO,,We show that a determinant of Stirling cycle...,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2007-05-23,"[[Callan, David, ]]"
4,704.0005,Alberto Torchinsky,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,"Illinois J. Math. 52 (2008) no.2, 681-689",,,math.CA math.FA,,In this paper we show how to compute the $\L...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2013-10-15,"[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]"


In [111]:
# Converting to lower case and combining title and abstract which will be used for the recommendation
paper=paper[['id','authors','title','abstract']]
paper.duplicated().sum()

paper['title'] = paper['title'].str.lower()
paper['abstract'] = paper['abstract'].str.lower()

paper['tags'] =paper['title']+paper['abstract']
paper.head(5)

Unnamed: 0,id,authors,title,abstract,tags
0,704.0001,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",calculation of prompt diphoton production cros...,a fully differential calculation in perturba...,calculation of prompt diphoton production cros...
1,704.0002,Ileana Streinu and Louis Theran,sparsity-certifying graph decompositions,"we describe a new algorithm, the $(k,\ell)$-...",sparsity-certifying graph decompositions we d...
2,704.0003,Hongjun Pan,the evolution of the earth-moon system based o...,the evolution of earth-moon system is descri...,the evolution of the earth-moon system based o...
3,704.0004,David Callan,a determinant of stirling cycle numbers counts...,we show that a determinant of stirling cycle...,a determinant of stirling cycle numbers counts...
4,704.0005,Wael Abu-Shammala and Alberto Torchinsky,from dyadic $\lambda_{\alpha}$ to $\lambda_{\a...,in this paper we show how to compute the $\l...,from dyadic $\lambda_{\alpha}$ to $\lambda_{\a...


In [112]:
paper= paper[paper['tags'].notnull()]

# Splitting the tags
def convert(text):
    L = []
    st=""
    counter = 0
    s=text.split(sep=None, maxsplit=-1)
    for i in s:
         if i[-1]==',' or i[-1]==';' or i[-1]=='.':
            i=i[:-1]
    
         L.append(s)
    for i in L:
      for j in i:
        st+=j+" "
    return st
paper['tags'] = paper['tags'].apply(convert) 
paper.head(5)

Unnamed: 0,id,authors,title,abstract,tags
0,704.0001,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",calculation of prompt diphoton production cros...,a fully differential calculation in perturba...,calculation of prompt diphoton production cros...
1,704.0002,Ileana Streinu and Louis Theran,sparsity-certifying graph decompositions,"we describe a new algorithm, the $(k,\ell)$-...",sparsity-certifying graph decompositions we de...
2,704.0003,Hongjun Pan,the evolution of the earth-moon system based o...,the evolution of earth-moon system is descri...,the evolution of the earth-moon system based o...
3,704.0004,David Callan,a determinant of stirling cycle numbers counts...,we show that a determinant of stirling cycle...,a determinant of stirling cycle numbers counts...
4,704.0005,Wael Abu-Shammala and Alberto Torchinsky,from dyadic $\lambda_{\alpha}$ to $\lambda_{\a...,in this paper we show how to compute the $\l...,from dyadic $\lambda_{\alpha}$ to $\lambda_{\a...


In [113]:
def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    y = []
    for i in text.split():
      
        y.append(lemmatizer.lemmatize(i))

    return " ".join(y)
tqdm.pandas(desc="Lemmatizing...")
paper['tags']=paper['tags'].progress_apply(lemmatize)
paper.head()

Lemmatizing...: 100%|██████████| 5000/5000 [08:10<00:00, 10.20it/s]


Unnamed: 0,id,authors,title,abstract,tags
0,704.0001,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",calculation of prompt diphoton production cros...,a fully differential calculation in perturba...,calculation of prompt diphoton production cros...
1,704.0002,Ileana Streinu and Louis Theran,sparsity-certifying graph decompositions,"we describe a new algorithm, the $(k,\ell)$-...",sparsity-certifying graph decomposition we des...
2,704.0003,Hongjun Pan,the evolution of the earth-moon system based o...,the evolution of earth-moon system is descri...,the evolution of the earth-moon system based o...
3,704.0004,David Callan,a determinant of stirling cycle numbers counts...,we show that a determinant of stirling cycle...,a determinant of stirling cycle number count u...
4,704.0005,Wael Abu-Shammala and Alberto Torchinsky,from dyadic $\lambda_{\alpha}$ to $\lambda_{\a...,in this paper we show how to compute the $\l...,from dyadic $\lambda_{\alpha}$ to $\lambda_{\a...


In [114]:
cv=CountVectorizer(max_features=5000,stop_words='english')
vectors_tag=cv.fit_transform(paper['tags']).toarray()
vectors_tag

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [115]:
print("Enter the query")
query=input()

Enter the query
mimo detection


In [116]:
# Pre-processing the query
lemmatizer = WordNetLemmatizer()
tokens = nltk.word_tokenize(query)
lemmatized_query = [lemmatizer.lemmatize(token) for token in tokens]
lemmatized_sentence = " ".join(lemmatized_query)
# print(lemmatized_sentence)

In [153]:
# Vectorizing the query
query_vector = cv.transform([lemmatized_sentence]).toarray()
# print(query_vector)

In [154]:
cos_similarities = cosine_similarity(query_vector, vectors_tag)

no_of_results = 5
similar_indices = cos_similarities.argsort()[0][-no_of_results:]

a=0
l_id=[]
l_abstract=[]
l_title=[]
l_url=[]
l_tags=[]
for i in similar_indices:
    id=paper['id'][i]
    l_id.append(paper['id'][i])
    l_title.append(paper['title'][i])
    l_abstract.append(paper['abstract'][i])
    l_tags.append(paper['tags'][i])
    s="https://arxiv.org/pdf/"+paper['id'][i]
    l_url.append(s)

In [155]:
print("Query is : "+query)
print()
print("The recommended papers are: ")
print()
from tabulate import tabulate
x=[]

l_title=l_title[::-1]
l_url=l_url[::-1]
l_id=l_id[::-1]
l_abstract=l_abstract[::-1]
l_tags=l_tags[::-1]

for i in range(len(l_title)):
  l=[]
  l.append(l_title[i])

  
  l.append(l_url[i])
  x.append(l)


head=["Title","URL"]
print(tabulate(x, headers=head, tablefmt="grid"))


Query is : mimo detection

The recommended papers are: 

+---------------------------------------------------------------------+---------------------------------+
| Title                                                               | URL                             |
| mimo detection employing markov chain monte carlo                   | https://arxiv.org/pdf/0705.0742 |
+---------------------------------------------------------------------+---------------------------------+
| rate bounds for mimo relay channels                                 | https://arxiv.org/pdf/0705.0564 |
+---------------------------------------------------------------------+---------------------------------+
| joint detection and identification of an unobservable change in the | https://arxiv.org/pdf/0705.0043 |
|   distribution of a random sequence                                 |                                 |
+---------------------------------------------------------------------+------------------------

In [120]:
#EVALUATION USING 1 gram

from nltk.translate.bleu_score import sentence_bleu
print("-----------------EVALATION USING BLUE SCORE -----------------------------")
for i in similar_indices:
  title = paper['title'][i]
  score = sentence_bleu([lemmatized_query], title.split(), weights=(1, 0, 0, 0))
  print("--------------------------------------------------")
  print("{}: {:.6f}".format(title, score))

-----------------EVALATION USING BLUE SCORE -----------------------------
--------------------------------------------------
glrt-optimal noncoherent lattice decoding: 0.000000
--------------------------------------------------
sample size cognizant detection of signals in white noise: 0.111111
--------------------------------------------------
joint detection and identification of an unobservable change in the
  distribution of a random sequence: 0.066667
--------------------------------------------------
rate bounds for mimo relay channels: 0.166667
--------------------------------------------------
mimo detection employing markov chain monte carlo: 0.285714


# Text summarization

In [16]:
pip install PyPDF2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [47]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [121]:
import re
import os
from PyPDF2 import PdfReader
from transformers import pipeline 
import pandas as pd
import shutil
import spacy
import textract
import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import requests
import en_core_web_sm
import warnings
warnings.filterwarnings('ignore')
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest

In [122]:
def make_folder():
  shutil.rmtree('/content/download_data/', ignore_errors=True)
  os.mkdir("/content/download_data/")

In [123]:
# Downloading the file chosen by the user
def download_pdf_file(url: str) -> bool:
  response = requests.get(url, stream=True)
  pdf_file_name = os.path.basename(url)
  if response.status_code == 200:
    filepath = os.path.join("/content/download_data", pdf_file_name)
    with open(filepath, 'wb') as pdf_object:
      pdf_object.write(response.content)
      print(f'{pdf_file_name} was successfully saved!')
      return True
  else:
    print(f'Error in downloading {pdf_file_name},')
    print(f'HTTP response status code: {response.status_code}')
    return False

# if __name__ == '__main__':
#   URL = 'https://arxiv.org/pdf/0704.0647.pdf'
#   download_pdf_file(URL)

In [124]:
final = []
files = os.listdir('/content/download_data/')
c=0
for file in files:

  # Open the PDF file
  # pdf_file = open('0705.1309.pdf', 'rb')
  pdf_reader = PdfReader('/content/download_data/'+file)
  text = ""
  num_pages = len(pdf_reader.pages)
  # print(num_pages)
  for i in range(num_pages):
    page = pdf_reader.pages[i]
    text += page.extract_text()

  # Create an XML document
  root = ET.Element('document')
  for line in text.split('\n'):
    if line.strip() != '':
      ET.SubElement(root, 'line').text = line

  xml_file = open('example.xml', 'wb')
  s=ET.tostring(root)
  xml_file.write(ET.tostring(root))
  xml_file.close()
      

In [125]:
# Load the language model
nlp = spacy.load("en_core_web_sm")

# Function to extract text from a file
def extract_text(filepath):
    # Extract text from PDF or Word document
    if filepath.endswith(".pdf") or filepath.endswith(".doc") or filepath.endswith(".docx"):
        text = textract.process(filepath).decode("utf-8")
    # Extract text from HTML document
    # elif filepath.endswith(".html") or filepath.endswith(".htm"):
    #     html = requests.get(filepath).content
    #     soup = BeautifulSoup(html, "html.parser")
    #     text = soup.get_text()
    # # Extract text from plain text file
    # else:
    #     with open(filepath) as f:
    #         text = f.read()
    # print(text)
    return text

In [126]:
def find_abstract(text):
  for j in range(len(text)):
    if "Abstract" in text[j] or "ABSTRACT" in text[j]:
      return j
  

In [127]:
def check_section_heading_length(index_abstract,matches):
  out1=[]
  for i in range(index_abstract,len(matches)):
  
    s=matches[i].split()
    #print(s)
    if len(s)<4:
      # print("length",end="  ")
      # print(len(s),matches[i])
      out1.append(matches[i])
  return out1

In [128]:
def check_first_lower(out1):
  out2=[]
  for i in range(len(out1)):
    if out1[i][0].islower()==False:
        # print("lower",end="  ")
        # print(len(s),i)
        #print(f[i])
        out2.append(out1[i])
  return out2

In [129]:
def isfloat(num):
  try:
    float(num)
    return True
  except ValueError:
    return False

In [130]:
def is_numeric(out2):
  out3=[]
  for i in range(len(out2)):
    if out2[i].isdigit()==False and isfloat(out2[i])==False:
      #print(i)
      out3.append(out2[i])
  return out3

In [131]:
def check_length(out3):
    out4=[]
    for i in range(len(out3)):
      if len(out3[i])>3:
        #print(i)
        out4.append(out3[i])
    return out4

In [132]:
def special_char_removal(out4):
  out5=[]
  for i in range(len(out4)):
    s = re.sub('[^.a-zA-Z0-9 \n\.]', '', out4[i])
    if len(out4[i])==len(s):
      out5.append(out4[i])
  return out5

In [133]:
def extract():
  final = []
  files = os.listdir('/content/download_data/')
  c=0
  for file in files:

    # Open the PDF file
    # pdf_file = open('0705.1309.pdf', 'rb')
    pdf_reader = PdfReader('/content/download_data/'+file)
    text = ""
    num_pages = len(pdf_reader.pages)
    # print(num_pages)
    for i in range(num_pages):
      page = pdf_reader.pages[i]
      text += page.extract_text()

    # Create an XML document
    root = ET.Element('document')
    for line in text.split('\n'):
      if line.strip() != '':
        ET.SubElement(root, 'line').text = line

    xml_file = open('example.xml', 'wb')
    s=ET.tostring(root)
    xml_file.write(ET.tostring(root))
    xml_file.close()
    return s

In [134]:
def extractive_summarize(text, per):
    nlp = spacy.load('en_core_web_sm')
    doc= nlp(text)
    tokens=[token.text for token in doc]
    wf={}
    for word in doc:
        if word.text.lower() not in list(STOP_WORDS):
            if word.text.lower() not in punctuation:
                if word.text not in wf.keys():
                    wf[word.text] = 1
                else:
                    wf[word.text] += 1
    f=max(wf.values())
    #normalize
    for word in wf.keys():
        wf[word]=wf[word]/f
    s= [sent for sent in doc.sents]
    score = {}
    for sent in s:
        for word in sent:
            if word.text.lower() in wf.keys():
                if sent not in score.keys():                            
                    score[sent]=wf[word.text.lower()]
                else:
                    score[sent]+=wf[word.text.lower()]
    select_length=int(len(s)*per)
    summary=nlargest(select_length, score,key=score.get)
    final_summary=[word.text for word in summary]
    summary=''.join(final_summary)
    return summary

In [136]:
def extract_section_text(s,section_names):
  full_summary=""
  for i in range(len(section_names)-1):
    
    f1=s.find(section_names[i])
    f2=s.find(section_names[i+1])
    if f1==-1 or f2==-1 or f2<=f1:
      continue
    # print(f1,f2)
    # print(f1,f2)
    r=s[f1+1:f2]
    l=r.split("</line><line>")
    sec=' '.join(l)
    # sec[len(section_names[i])+1:]
    part_summary = extractive_summarize(sec,0.3)
    # print(part_summary)
    full_summary = full_summary + " " + part_summary
  return full_summary

In [137]:
l_summaries=[]
for i in l_id:
  try:
    make_folder()
    URL = 'https://arxiv.org/pdf/'+i+'.pdf'
    download_pdf_file(URL)
    filepath='/content/download_data/'+i+'.pdf'
    text=extract_text(filepath)
    pattern = r"\n(.+)\n"
    matches = re.findall(pattern, text)
    index_abstract=find_abstract(matches)

    out1=check_section_heading_length(index_abstract,matches)
    # print("out1  ",out1)
    # print("________________________________")

    out2=check_first_lower(out1)
    # print("out2  ",out2)
    # print("________________________________")

    out3=is_numeric(out2)
    # print("out3  ",out3)
    # print("________________________________")

    out4=check_length(out3)
    # print("out4  ",out4)
    # print("________________________________")

    out5=special_char_removal(out4)
    # print("out5  ",out5)
    # print("________________________________")
    
    s=extract()
    s=str(s)
    # print(l)
    section_names=out5
    # print(len(s),len(section_names))
    full_summary = extract_section_text(s,section_names)
    # print(full_summary)
    l_summaries.append(full_summary)
  except:
    l_summaries.append(' ')
    continue

0705.0742.pdf was successfully saved!
0705.0564.pdf was successfully saved!
0705.0043.pdf was successfully saved!
0704.3287.pdf was successfully saved!
0704.1524.pdf was successfully saved!


In [156]:
l_id

['0705.0742', '0705.0564', '0705.0043', '0704.3287', '0704.1524']

In [157]:
summary_df=pd.DataFrame()
summary_df['id']=l_id
summary_df['title']=l_title
summary_df['abstract']=l_abstract
summary_df['tags']=l_tags
summary_df['summary']=l_summaries
# summary_df.head()

In [158]:
tqdm.pandas(desc="Lemmatizing...")
summary_df['summary']=summary_df['summary'].progress_apply(lemmatize)

Lemmatizing...: 100%|██████████| 5/5 [00:00<00:00, 57.00it/s]


In [162]:
summary_df['tags2'] =summary_df['tags']+summary_df['summary']

In [167]:
summary_df.head()

Unnamed: 0,id,title,abstract,tags,summary,tags2
0,705.0742,mimo detection employing markov chain monte carlo,we propose a soft-output detection scheme fo...,mimo detection employing markov chain monte ca...,,mimo detection employing markov chain monte ca...
1,705.0564,rate bounds for mimo relay channels,this paper considers the multi-input multi-o...,rate bound for mimo relay channel this paper c...,Compared to single-input single-output ( SISO)...,rate bound for mimo relay channel this paper c...
2,705.0043,joint detection and identification of an unobs...,this paper examines the joint problem of det...,joint detection and identification of an unobs...,While raising an alarm a soon a the change occ...,joint detection and identification of an unobs...
3,704.3287,sample size cognizant detection of signals in ...,the detection and estimation of signals in n...,sample size cognizant detection of signal in w...,"Wepresentacomputationallysim- ple, sample eige...",sample size cognizant detection of signal in w...
4,704.1524,glrt-optimal noncoherent lattice decoding,this paper presents new low-complexity latti...,glrt-optimal noncoherent lattice decoding this...,bstract This paper present new low-complexity ...,glrt-optimal noncoherent lattice decoding this...


In [174]:
cv2=CountVectorizer(max_features=300,stop_words='english')
vectors_tag2=cv2.fit_transform(summary_df['tags2']).toarray()

In [175]:
query_vector2 = cv2.transform([lemmatized_sentence]).toarray()

In [179]:
cos_similarities = cosine_similarity(query_vector2, vectors_tag2)

no_of_results = 5
similar_indices = cos_similarities.argsort()[0][-no_of_results:]

a=0
l_id2=[]
l_abstract2=[]
l_title2=[]
l_url2=[]
l_tags2=[]
for i in similar_indices:
    id=summary_df['id'][i]
    l_id2.append(summary_df['id'][i])
    l_title2.append(summary_df['title'][i])
    l_abstract2.append(summary_df['abstract'][i])
    l_tags2.append(summary_df['tags2'][i])
    s="https://arxiv.org/pdf/"+summary_df['id'][i]
    l_url2.append(s)

In [180]:
print("Query is : "+query)
print()
print("The recommended papers are: ")
print()
from tabulate import tabulate
x=[]


for i in range(len(l_title)):
  l=[]
  l.append(l_title2[i])
  l.append(l_url2[i])
  x.append(l)


head=["Title","URL"]
print(tabulate(x, headers=head, tablefmt="grid"))


Query is : mimo detection

The recommended papers are: 

+---------------------------------------------------------------------+---------------------------------+
| Title                                                               | URL                             |
| glrt-optimal noncoherent lattice decoding                           | https://arxiv.org/pdf/0704.1524 |
+---------------------------------------------------------------------+---------------------------------+
| rate bounds for mimo relay channels                                 | https://arxiv.org/pdf/0705.0564 |
+---------------------------------------------------------------------+---------------------------------+
| sample size cognizant detection of signals in white noise           | https://arxiv.org/pdf/0704.3287 |
+---------------------------------------------------------------------+---------------------------------+
| joint detection and identification of an unobservable change in the | https://arxiv.org/pdf/0

In [182]:
print("Please enter the index of paper you want to read")
n=int(input())

Please enter the index of paper you want to read
5


In [34]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1


In [35]:
from transformers import pipeline 

In [184]:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# print(l_summaries[int(n)-1])
# summary_of_chosen=l_summaries[n-1].split()
# length_of_summary=len(summary_of_chosen)
# print(summarizer(l_summaries[4][1000:], min_length=200, max_length=400,do_sample=False))
ss=l_summaries[4]

x=(summarizer(ss[:1020], min_length=400, max_length=800,do_sample=False))

Your max_length is set to 800, but you input_length is only 238. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=119)


In [185]:
x[0]['summary_text']

'The computational complexit y ispolynomial in the block length; making GLRT-optimal noncoherent detection feasible for implementation. For the case of the constant envelope PSK constellation, a de tection algorithm with complexity O(TlogT) was developed in [18,19] (where Tis the blocklength) We then consider the more practical case of M-ary QAM detection over complex-valued fading channels, an d show that in this case the GLRT\xa0optimal\xa0codeword estimate is the closest codeword in angle to a planedescribed by the received vector. We conclude by showing that standard modulation techniques such as quadrature amplitude modulation (QAM) can be used to detect PAM signals over complex fading channels. The paper is published in the open-access journal arXiv:\xa0http://www.arxiv.org/2013/01061/arXiv-PAM-Detecting-QAM-Signals-Over-Complex-valued-Fading-Channels.html#storylink=cpy. We also present a new low-complexity lattice-decoding a lgorithms for non-coherent block detection of PAM and 