In [49]:
import json
import os
import requests
import fitz  # PyMuPDF for reading PDF files
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from collections import Counter

# Ensure NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')

# Function to read the JSON dataset
def read_json_dataset(json_path):
    with open(json_path, 'r') as file:
        data = json.load(file)
    return data

# Function to download PDFs
def download_pdf(url, pdf_name):
    print(f"Attempting to download: {url}")  # Print URL before download attempt
    try:
        response = requests.get(url, verify=False)  # Disable SSL verification
        if response.status_code == 200:
            with open(pdf_name, 'wb') as pdf_file:
                pdf_file.write(response.content)
            print(f"Downloaded: {pdf_name}")
            return pdf_name
        else:
            print(f"Failed to download {url} with status code: {response.status_code}")
            return None
    except requests.exceptions.SSLError as ssl_error:
        print(f"SSL Error: {ssl_error}")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Function to extract text from PDFs
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

# Function for frequency-based summarization
def frequency_based_summary(text, summary_ratio=0.2):
    stop_words = set(stopwords.words("english"))
    words = word_tokenize(text.lower())
    words = [word for word in words if word.isalnum() and word not in stop_words]

    # Calculate word frequencies
    word_frequencies = Counter(words)

    # Sentence Tokenization
    sentences = sent_tokenize(text)
    sentence_scores = {}

    # Score each sentence based on word frequency
    for sentence in sentences:
        for word in word_tokenize(sentence.lower()):
            if word in word_frequencies:
                if sentence not in sentence_scores:
                    sentence_scores[sentence] = 0
                sentence_scores[sentence] += word_frequencies[word]

    # Sort sentences by score and select top sentences
    num_sentences = int(len(sentences) * summary_ratio)
    summary_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_sentences]

    # Join selected sentences to form the summary
    return ' '.join(summary_sentences)

# Function for keyword extraction
def keyword_extraction(text, num_keywords=5):
    stop_words = set(stopwords.words("english"))
    words = word_tokenize(text.lower())
    words = [word for word in words if word.isalnum() and word not in stop_words]

    # Calculate word frequencies
    word_frequencies = Counter(words)

    # Extract the most common keywords
    keywords = word_frequencies.most_common(num_keywords)
    return [word for word, _ in keywords]

# Main function to process PDFs from JSON
def main():
    # Load the dataset
    json_path = "C:\\Users\\Hp\\Desktop\\Dataset.json"  # Adjust to your actual path
    data = read_json_dataset(json_path)

    summaries = {}
    all_keywords = {}
    
    # Process each PDF URL
    for key, url in data.items():
        print(f"Processing {key}: {url}")
        # Download the PDF
        pdf_name = f"{key}.pdf"  # Name the downloaded PDF file
        downloaded_pdf = download_pdf(url, pdf_name)

        if downloaded_pdf:
            # Extract text from the downloaded PDF
            text = extract_text_from_pdf(downloaded_pdf)
            if text:
                # Summarize the text
                summary = frequency_based_summary(text)
                summaries[key] = summary
                print(f"Summary for {key}: {summary}")
                
                # Extract keywords
                keywords = keyword_extraction(text)
                all_keywords[key] = keywords
                print(f"Keywords for {key}: {keywords}")

if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Processing pdf1: https://digiscr.sci.gov.in/pdf_viewer?dir=YWRtaW4vanVkZ2VtZW50X2ZpbGUvanVkZ2VtZW50X3BkZi8xOTUwL3ZvbHVtZSAxL1BhcnQgSS9Db21taXNzaW9uZXIgb2YgSW5jb21lIFRheCwgV2VzdCBCZW5nYWxfQ2FsY3V0dGEgQWdlbmN5IEx0ZC5fMTY5NzYwNjMxMC5wZGY=
Attempting to download: https://digiscr.sci.gov.in/pdf_viewer?dir=YWRtaW4vanVkZ2VtZW50X2ZpbGUvanVkZ2VtZW50X3BkZi8xOTUwL3ZvbHVtZSAxL1BhcnQgSS9Db21taXNzaW9uZXIgb2YgSW5jb21lIFRheCwgV2VzdCBCZW5nYWxfQ2FsY3V0dGEgQWdlbmN5IEx0ZC5fMTY5NzYwNjMxMC5wZGY=




Downloaded: pdf1.pdf
Summary for pdf1: Mitter 
or S.C. Mitter, having been and being still the Manag-
ing Agents of the said Mills have undertaken to 
reimburse the said Mills in respect of the decrees to oe 
made in the said four suits in the manner hereinafter 
appearing NOW THESE PRESENTS WITNESS 
AND IT IS HEREBY AGREED AND DECLARED 
(i) That out of the commission of 3% payable by 
the said Mills to the said Agency under Regulation 131 
of the Articles of Association of the Company, the 
Company shall have paramount lien on and deduct 
and set off a moiety thereof against any payment 
which the said Mills may make in respect of the decrees 
or any of them and/or costs of the said suits. Therefore this case is covered 
by the judgment of the Court of Appeal in Mitchell's 
case(') ...... " Applying this line of reasoning the High 
Court differed from the conclusion of the Tribunal and 
allowed the deduction to the respondent company under 
section 10(2) (xv) of the Income-tax Act, as



Downloaded: pdf2.pdf
Summary for pdf2: 1952 
The State of 
Bihar 
v. 
Maharaja-
dhiraja Sir 
Kameshwar 
Singh 
of Darbhanga 
and Others. 1952 
The State of 
Bihar 
v. 
Maharaia-
dhiraja Sir 
Kameshwar 
Singh 
of Darbhanga 
and Others. If the require-
ment of a public purpose were not a provision of 
article 31 (2), then it will obviously lead us to the un-
tenable conclusion that Parliament will be free under 
its residuary powers under article 248 and entry 97 of 
List I of the Seventh Schedule to make a law for ac-
quiring private property without any public purpose 
at all and to the still more absurd result that while 
Parliament will 
have 
to 
provide for 
compensation 
under article 31 (2) in a law made by it for acquisitio• 
of property for a public purpose it will not have to 
make any provision for compensation in a law made 
for acquisition of property to be made without a public 
purpose. We find that under article 246 Parliament 
has exclusive power to make laws with respe



Downloaded: pdf3.pdf
Summary for pdf3: 2. ______________________________________________
1. Subs. Amendment of  First  Schedule  to Constitution.— As from the 
appointed day, in the First Schedule to the Constitution,—
(a) in the paragraph relating to the territories of the State of  
Assam, the words, brackets and figures “and the territories referred to in 
Part I of the Second Schedule to the Constitution (One Hundredth 
Amendment) Act, 2015, notwithstanding anything contained in clause 
(a) of section 3 of the Constitution (Ninth Amendment) Act, 1960, so far 
as it relates to the territories referred to in Part I of the Second Schedule 
to the Constitution (One Hundredth Amendment) Act, 2015”, shall be 
added at the end; 
(b) in the paragraph relating to the territories of the State of West 
Bengal, the words, brackets and figures “and also the territories referred to 
in Part III of the First Schedule but excluding the territories referred to in 
Part III of the Second Schedule to



Downloaded: pdf4.pdf
Summary for pdf4: Whoever commits mischief by,—
(a) doing any act which causes, or which he knows to be likely to cause, a
diminution of the supply of water for agricultural purposes, or for food or drink for
human beings or for animals which are property, or for cleanliness or for carrying on
any manufacture, shall be punished with imprisonment of either description for a term
which may extend to five years, or with fine, or with both;
(b) doing any act which renders or which he knows to be likely to render any
public road, bridge, navigable river or navigable channel, natural or artificial, impassable
or less safe for travelling or conveying property, shall be punished with imprisonment
of either description for a term which may extend to five years, or with fine, or with
both;
(c) doing any act which causes or which he knows to be likely to cause an
inundation or an obstruction to any public drainage attended with injury or damage,
shall be punished with impriso



Downloaded: pdf5.pdf
Summary for pdf5: (Puneet Pancholy)  
 
Press Release: 2024-2025/609                                            Chief General Manager 
 
ᮧेस ᮧकाशनी PRESS RELEASE 
 
 
                                                       
भारतीय ᳯरज़वर् बᱹक 
 
RESERVE BANK OF INDIA 
 
वेबसाइट : www.rbi.org.in/hindi 
संचार िवभाग, कᱶᮤीय कायार्लय, शहीद भगत ᳲसंह मागर्, फोटर्, मुंबई - 400 001 
Website : www.rbi.org.in 
Department of Communication, Central Office, Shahid Bhagat Singh Marg, Fort,  
ई-मेल/email : helpdoc@rbi.org.in 
Mumbai - 400 001 फोन/Phone: 022 - 2266 0502 Kar has experience of over three decades in the Reserve Bank having worked in 
areas of Payment and Settlement Systems, Information Technology, Government Banking, 
Internal accounts and Human Resource Management. Charulatha S Kar as new Executive Director 
     The Reserve Bank of India (RBI) has appointed Smt.
Keywords for pdf5: ['smt', 'kar', 'reserve', 'bank', 'management']
Processing pdf6: https://digiscr.sci.gov



Downloaded: pdf6.pdf
Summary for pdf6: In resisting the workmen's claim for bonus for the year 
1955-56 the appellant contended that in calculating gross profits 
for the purpose of the Full Bench formula the following items of 
income should be excluded :-
(i) Income earned by way of rent, light and power ; 
(ii) estate revenue derived from sale of excess coconuts used 
in preparing oil grown in the appellant's groves; 
(iii) profit from sale of empty barrels ; and 
(iv) sale proceeds of tin cans, scraps, Jogs, planks, gunnies 
etc. There is no 
doubt that there must be contribution of the workmen 
in earning profits before they are entitled to profit 
bonus; but it wa.s not laid down in the Muir Mills \ 
Case(') that direct connection between the efforts of the 
workmen and the particular item of profit earned must 
be established before the profit can be taken into 
account for the purposes of arriving at the available 
(1) 1955 (1) S.C.R. SUPREME COURT REPORTS 
3 
The ease of the w



Downloaded: pdf7.pdf
Summary for pdf7: , As a result of the termination of the services of the 
one hundred and forty two employees of the Company, 
as aforesaid, an industrial dispute was raised at the 
instance of the said employePs whose list is attached 
to the order of Reference, dated August 26, 1954, 
which is in these terms:-
"Whereas an Industrial dispute exists between (1) 
Messrs. Great Inciian l\Iotor Works Ltd., 33, Rowland 
Road, Calcutta, 
represented 
by their Managing 
Directors Sri C. D. Nundy and Sri K. D. Nundy, (2) 
Official Liquidators of the Company, Sri D. L. Dutta 
and Sri C. D. Nundy, 33, Rowland Road, Calcutta 
and (3) Sri K. D. Nundy, Auction Purchaser of the 
Company, 33, Rowland Road, Calcutta, and their 142, 
employees, given in the enclosed list, represented by 
• 
• 
• 
S.C.R. The liquidators were refused sanction to appeal from the 
said award by the High Court whereupon the auction-purchaser 
who was also the managing director of the Company, prior to



Downloaded: pdf8.pdf
Summary for pdf8: As to the benefits arising out of the 
service rendered by the workmen under Messrs. M. M. 
Ispahani Ltd., it held that there was termination of 
employment of the workmen when Messrs. M. M. 
Ispahani migrated to Pakistan and the employment of 
the workmen by the company was fresh employment 
and they therefore were not entitled to any benefits 
arising out of their employment under Messrs. M. M. 
Ispahani Ltd. As to the quantum of bonus it 
r959 
was laid down that even if payment was not at a 
Ispahani Ltd., 
uniform rate throughout the period, the implied agree-
Calcutta 
ment to pay something could be inferred and it would 
v. 
be for the tribunal to decide what was the reasonable 
Ispahani 
amount to be paid as puja bonus. It held on the question of bonus 
that it had been proved that puja bonus had become a 
term of employment and the workmen were therefore 
entitled to bonus at the rate of one month's wages for 
the year 1953. It may either



Downloaded: pdf9.pdf
Summary for pdf9: In Messrs. Basa Musa 
Sugar Works (Private) Ltd. v. Shobrati Khan (1), we had 
occasion to point out that even where the employer 
did not hold an enquiry before applying under s. 33 of 
the Act for permission to dismiss an employee, he 
could make good the defect by producing all relevant 
evidence which would hare been examined at the 
enquiry, before the tribunal, in which case the tribunal 
would consider the evidence and decide whether per-
mission should be granted or not. Held, that the question whether the Tribunal was a compe-
tent one under s. 7 of the Industrial Disputes Act, 1947, prior to 
the amending Act 36 of 1956, must be raised before the Tribunal 
itself as it was a matter of investigation and could not be raised 
for the first time before the Supreme Court. SUPREME COURT REPORTS 
35 
Two points have been urged before us on behalf of 
z959 
the company, namely-
Ph lb 
· T 
E 1 1 
h T 'b 
l 
t t 'b 
l 
u an ea 
s a e 
(1) t e 
r1



Downloaded: pdf10.pdf
Summary for pdf10: the sugar from one place to another, or 
to the delay likely to be involved in exporting it,.or to 
the conditions prevailing in the markets for sugar, 
whether in or out of India, or to any other relevant 
circumstance, it is expedient so to do, the export agency 
may sell the whole or any part of the sugar in India 
46 
SUPREME COURT REPOHTS [1960(1)] 
'959 
>md may, if it thinks fit, purchase such quantity of 
-
sugar as it may co1'.sider necessary for export at the 
-
TheLordf\.rishna 
· t t'' 
• 
Sugar ,1till8 Ltd., <.1})J. Section 8 states that the export agency 
shall export the sugar delivered to it, provided that in 
certain circumstances specified, the export agency may 
sell that sugar in India and may if it thinks fit pur- · 
chase other sugar for export and for this purpose permit 
the owner to sell the whole or part of its export .quota 
at a price approved, on condition t.hat the sale proceeds 
are paid to it. If two or 
more Acts



Downloaded: pdf11.pdf
Summary for pdf11: The main submission on behalf of the appellant was 
directed towards establishing that the entire proceed: 
ings before the Additional District Magistrate and the 
trying Magistrate were without jurisdiction as cogniz-
ance of the offence had been taken on September 16, 
1952, in contravention of the provisions of s. 23(3) of 
the ]foreign Exchange Regulation Act, there being on 
that date no complaint in writing made by an officer 
authorised in that behalf by the Central Government 
or the Reserve Bank of India by a general or a special 
order. On September 11, 1952, the 
Reserve Bank of India authorized Inspector S. B. 
Mitra of the Special Police Establishment, Calcutta, to 
make a representation to the Additional District 
Magistrate, 24 Parganas (hereinafter referred to as the 
Addtiona1 District Magistrate) for 
permission to 
proceed against the appellant as required under 
s. 19(3) of the Foreign Exchange Regulation Act, 1947. In the pr



Downloaded: pdf12.pdf
Summary for pdf12: Held, that the workmen were not entitled to puja bonus as 
an implied term of employment for an implied agreement could 
not be inferred when the appellant had made it clear that the 
payments from 1948 to 1952 were ex gratia ; but they were 
entitled to puja bonus on the basis that it was a customary and 
traditional payment. In dealing with puja bonus based on an 
'implied term of employment, it was pointed out by us 
in Messrs. lspahani Ltd. v. lspahani Employees' 
Union (1) that a term may be implied, even though the 
payment may not have been at a uniform rate 
throughout and the Industrial Tribunal would be 
justified in deciding what should be the quantum of 
payment in a particular year taking into account the 
varying payments made in pr1;1vious years. In determining whether the payment was 
customary and traditional the following circumstances have to be 
established : 
(i) that the payment has been made over an unbroken series 
of yea



Downloaded: pdf13.pdf
Summary for pdf13: "(l) If ... 
(a) the Income-tax Officer has reason to believe 
that.by reason of the omission or failure· on the part 
of an assessee to make a return of his income under 
section 22, for any year ...... , or 
(b) notwithstimrling that there has been no omis-
sion or failure as mentioned in clause (a) on the part 
of the assessee, the Income-tax Officer has in con-
sequence of information in his possession reason to 
believe that income, profits or gains chargeable to 
income-tax have escaped assessment for any year ... 
* 
* 
* 
he may in cases falling under clause (a) at any time 
within eight years and in cases falling under clause 
(h) at any time 'vithin four years of the end of that 
year, serve on the assessee ... a notice ... and may 
proceed to assess such income ... " 
It would appear from this that if the return filed on 
January 5, 1950, was a return of income, there was no 
failure or omission on the part of the asscssee, so as to 




Downloaded: pdf14.pdf
Summary for pdf14: In the present case the jury had been clearly directed 
by the Sessions Judge that corroborative evidence 
must be evidence which implicates the accused, i.e., 
which confirms in some material particulars not only 
the evidence that the crime had been committed but 
also that the appellant had committed it. 953, that the approver's 
evidence had to satisfy a double test i.e., he must be a reliable 
witness and his evidence must receive sufficient corroboration and 
in that the corroborating evidence was not sufficient to connect 
the appellant with the crime. Indeed, the Sessions Judge went to the length of 
telling the jury that although an approver's evidence 
is strictly admissible and a conviction is not illegal 
merely because it is based on an approver's evidence, 
it was a settled rule of practice not to convict a person 
on such evidence except under very rare and excep-
tional circumstances and usually substantial cor-
roboratLm was req



Downloaded: pdf15.pdf
Summary for pdf15: The Authority under the Payment of Wages Act 
found that only four of the respondents, who were required to do 
the work of progress timekeepers, could claim the status of 
workers within the meaning of s. 2(1) of the Factories Act and 
the rest were merely employees of the workshop, but the 
Authority accepted the alternative case made by the respond-
ents and directed the appellant to file a statement showing the 
overtime wages due to each of the respondents and ordered it to 
pay the same. 144 
SUPREM~ COURT REPORTS (1960(1)] 
z959 
The respondents' case, however, is that by virtue of 
s. 70 of the Act the provisions of the Factories Act, 
B. P~.HiY• 
including s. 59, are extended to the cases of all em-
c. M. Pradhan ployees in factories, and so they are entitled to claim 
-
wages for overtime under the said section of the 
Gajendragadkar J. Thus the validity of the said section is not 
in dispute; and so the only point which calls for our 



Downloaded: pdf16.pdf
Summary for pdf16: The order log and trade 
log revealed that there was synchronization of buying and selling orders 
by these brokers as under: 
 
Date 
Trade 
Time 
Tr 
No 
Order NO 
Order time 
Diff  
(sec)  B/S 
Mem Client Code 
Qty  
Price 
27-02-01 12:16:13 
1 
61175322 
12:16:13 
**** buy 
35 rathyatrasa 
10000 
335.00 
27-02-01 12:16:13 
1 
31317843 
12:15:37 
0:00:36 sell 
545 b110 
10000 
335.00 
27-02-01 12:16:56 
2 
61175333 
12:16:56 
**** buy 
35 rathyatrasa 
10000 
335.00 
27-02-01 12:16:56 
2 
31317859 
12:16:32 
0:00:24 sell 
545 b110 
10000 
335.00 
27-02-01 12:17:18 
3 
61175338 
12:17:18 
**** buy 
35 rathyatrasa 
10000 
335.00 
27-02-01 12:17:18 
3 
31317864 
12:17:07 
0:00:11 sell 
545 b110 
10000 
335.00 
27-02-01 12:17:27 
4 
61175342 
12:17:27 
**** buy 
35 rathyatrasa 
10000 
335.00 
27-02-01 12:17:27 
4 
31317867 
12:17:17 
0:00:10 sell 
545 b110 
10000 
335.00 
27-02-01 12:17:33 
5 
61175347 
12:17:33 
**** buy 
35 rathyatrasa 
10000 
3



Downloaded: pdf17.pdf
Summary for pdf17: …………  
Signature 
 
Appendix – 5 
izk:i la[;k 45 
 
Fkkus esa U;k;ky; ds Hkkjlk/kd vf/kdkjh ds le{k 
gkftj gksus ds fy, ca/ki= vkSj tekuri= ¼/kkjk 
436] 436d] 437] 437d] 438¼3½] vkSj 441 nsf[k,½ 
 
izsf"krh ------------------ 
 
 
eSa ¼uke½ ¼LFkku½ dk fuolh gWw -------------
--------- Fkkus ds Hkkjlk/kd vf/kdkjh }kjk] fcuk 
okj.V  fxjQrkj ;k fu:) dj fy, tkus ij ¼;k ------
------- U;k;ky; ds le{k yk, tkus ij½ vijk/k ls 
vkjksfir fd;k x;k gWw rFkk eq>ls ,sls vf/kdkjh 
;k U;k;ky; ds le{k ,sls izR;sd fnu] gkftj 
52 
 
gksÅaxk] ftlesa ,sls vkjksi ds ckcr dksbZ 
vUos"k.k 
;k 
fopkj.k 
fd;k 
tk,] 
rFkk 
ijh{k.kksijkUr 
,oa 
fu.kZ;ksijkUr 
,sls 
mPprj 
U;k;ky; ds le{k] ftlds }kjk fu.kZ; dh frfFk ls 6 
ekg dh vof/k ds Hkhrj ;fn fu.kZ; ds fo:) nkf[ky 
fdlh vihy ;k fiVh’ku esa uksfVl tkjh fd;k tkrk 
gS] mifLFkr gksÅaxk rFkk eSa vius dks vkc) djrk 
gWw fd ;fn blls pwd d:W rks esjh ---------------
-- jkf’k ljdkj dks le"kg`r gks tk;sxhA 
rk0------------------



SSL Error: [SSL: DECRYPTION_FAILED_OR_BAD_RECORD_MAC] decryption failed or bad record mac (_ssl.c:2580)


In [58]:
import json
import os
import requests
import fitz  # PyMuPDF for reading PDF files
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from collections import Counter
from pymongo import MongoClient
from concurrent.futures import ThreadPoolExecutor
import time

# Ensure NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')

# MongoDB setup
client = MongoClient('mongodb://localhost:27017/')  # Adjust your MongoDB URI as needed
db = client['pdf_processing']  # Replace with your database name
collection = db['documents']  # Replace with your collection name

# Function to read the JSON dataset
def read_json_dataset(json_path):
    with open(json_path, 'r') as file:
        data = json.load(file)
    return data

# Function to download PDFs
def download_pdf(url, pdf_name):
    print(f"Attempting to download: {url}")  # Print URL before download attempt
    try:
        response = requests.get(url, verify=False)  # Disable SSL verification
        if response.status_code == 200:
            with open(pdf_name, 'wb') as pdf_file:
                pdf_file.write(response.content)
            print(f"Downloaded: {pdf_name}")
            return pdf_name
        else:
            print(f"Failed to download {url} with status code: {response.status_code}")
            return None
    except requests.exceptions.SSLError as ssl_error:
        print(f"SSL Error: {ssl_error}")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Function to extract text from PDFs
def extract_text_from_pdf(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page in doc:
            text += page.get_text()
        return text
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return None

# Function for frequency-based summarization
def frequency_based_summary(text, summary_ratio=0.2):
    stop_words = set(stopwords.words("english"))
    words = word_tokenize(text.lower())
    words = [word for word in words if word.isalnum() and word not in stop_words]

    # Calculate word frequencies
    word_frequencies = Counter(words)

    # Sentence Tokenization
    sentences = sent_tokenize(text)
    sentence_scores = {}

    # Score each sentence based on word frequency
    for sentence in sentences:
        for word in word_tokenize(sentence.lower()):
            if word in word_frequencies:
                if sentence not in sentence_scores:
                    sentence_scores[sentence] = 0
                sentence_scores[sentence] += word_frequencies[word]

    # Sort sentences by score and select top sentences
    num_sentences = int(len(sentences) * summary_ratio)
    summary_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_sentences]

    # Join selected sentences to form the summary
    return ' '.join(summary_sentences)

# Function for keyword extraction
def keyword_extraction(text, num_keywords=5):
    stop_words = set(stopwords.words("english"))
    words = word_tokenize(text.lower())
    words = [word for word in words if word.isalnum() and word not in stop_words]

    # Calculate word frequencies
    word_frequencies = Counter(words)

    # Extract the most common keywords
    keywords = word_frequencies.most_common(num_keywords)
    return [word for word, _ in keywords]

# Function to store document metadata in MongoDB
def store_metadata(pdf_name, pdf_path):
    metadata = {
        'name': pdf_name,
        'path': pdf_path,
        'size': os.path.getsize(pdf_path)  # Get the file size
    }
    collection.insert_one(metadata)
    print(f"Stored metadata for {pdf_name}")

# Function to update MongoDB with summary and keywords
def update_document(key, summary, keywords):
    update_query = {'name': key}
    new_values = {"$set": {"summary": summary, "keywords": keywords}}
    collection.update_one(update_query, new_values)
    print(f"Updated {key} with summary and keywords.")

# Function to process a single PDF
def process_pdf(pdf_path):
    pdf_name = os.path.basename(pdf_path)  # Get the file name
    print(f"Processing {pdf_name}...")  # Log when processing starts
    store_metadata(pdf_name, pdf_path)  # Store metadata in MongoDB

    start_time = time.time()  # Start timer

    # Extract text from the downloaded PDF
    text = extract_text_from_pdf(pdf_path)
    
    if text:
        print(f"Extracted text from {pdf_name}")  # Log when text extraction is successful

        # Summarize the text
        summary = frequency_based_summary(text)
        print(f"Summary for {pdf_name}: {summary}")
        
        # Extract keywords
        keywords = keyword_extraction(text)
        print(f"Keywords for {pdf_name}: {keywords}")

        # Update MongoDB with summary and keywords
        update_document(pdf_name, summary, keywords)
    else:
        print(f"No text extracted from {pdf_name}.")  # Log if no text was extracted

    end_time = time.time()  # End timer
    print(f"Processed {pdf_name} in {end_time - start_time:.2f} seconds.")

# Function to process all PDFs in a specified folder
def process_pdfs_in_folder(folder_path):
    # List all PDF files in the specified folder
    pdf_files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]
    
    print(f"Found PDF files: {pdf_files}")  # Print the list of found PDF files
    
    with ThreadPoolExecutor() as executor:
        for pdf_file in pdf_files:
            pdf_path = os.path.join(folder_path, pdf_file)
            print(f"Submitting {pdf_path} for processing...")  # Log which PDF is being processed
            executor.submit(process_pdf, pdf_path)  # Submit each PDF for processing

# Main function to execute the PDF processing pipeline
def main():
    folder_path = r"C:\Users\Hp\Desktop\DownloadedPDFs"  # Adjust to your actual folder path
    process_pdfs_in_folder(folder_path)

if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Found PDF files: ['1292585113260.pdf', '20240716890312078.pdf', '250883_english_01042024.pdf']
Submitting C:\Users\Hp\Desktop\DownloadedPDFs\1292585113260.pdf for processing...
Processing 1292585113260.pdf...
Submitting C:\Users\Hp\Desktop\DownloadedPDFs\20240716890312078.pdf for processing...
Processing 20240716890312078.pdf...
Submitting C:\Users\Hp\Desktop\DownloadedPDFs\250883_english_01042024.pdf for processing...
Processing 250883_english_01042024.pdf...
