In [1]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from time import sleep

print('\n')
print('Writing PDF files....')

doilist = open('dois2.txt', 'r')
dois = doilist.readlines()
output_folder = 'BVac_AI_Papers/new_papers/batch11'

for doi in tqdm(dois):
    try:
        pdf_name = doi.strip().replace('/', '-')
        base_url = 'https://sci-hub.se/'
        response = requests.get(base_url + doi.strip())

        # Check if the request was successful
        if response.status_code != 200:
            print(f"Failed to retrieve page for {doi.strip()}, status code: {response.status_code}")
            raise Exception("Invalid response")

        soup = BeautifulSoup(response.content, 'html.parser')

        # Try finding the embed tag to get the PDF link
        embed = soup.find('embed')
        if embed:
            content = embed.get('src').replace('#navpanes=0&view=FitH', '').replace('//', '/')
        else:
            print(f"No embed tag found for {doi.strip()}")
            raise Exception("No embed tag found")

        # Handle different types of URL structures
        if content.startswith('/downloads'):
            pdf = 'https://sci-hub.se' + content
        elif content.startswith('/tree'):
            pdf = 'https://sci-hub.se' + content
        elif content.startswith('/uptodate'):
            pdf = 'https://sci-hub.se' + content
        else:
            pdf = 'https:/' + content  # General case

        # Attempt to download the PDF
        print(f"Downloading PDF for {doi.strip()} from {pdf}")
        r = requests.get(pdf, stream=True)
        if r.status_code == 200:
            with open(output_folder + '/' + pdf_name + '.pdf', 'wb') as file:
                file.write(r.content)
            with open('PDFs_Found.txt', 'a') as pdfs:
                pdfs.write(doi.strip() + '\t' + pdf + '\n')
        else:
            print(f"Failed to download PDF for {doi.strip()}")
            raise Exception("Failed to download PDF")

    except Exception as e:
        print(f"Error for DOI {doi.strip()}: {str(e)}")
        with open('PDFs_Not_Found.txt', 'a') as Nopdfs:
            Nopdfs.write(doi.strip() + '\n')

    # Pause to avoid rate-limiting
    sleep(3)




Writing PDF files....


  0%|          | 0/84 [00:00<?, ?it/s]

Downloading PDF for 10.1128/AAC.00714-11 from https://moscow.sci-hub.se/3669/1541f7ac90b9e4232a26146139bbe8ef/lebreton2011.pdf


  1%|          | 1/84 [00:05<08:11,  5.92s/it]

Downloading PDF for 10.1007/bf00330501 from https://moscow.sci-hub.se/1869/5e7b1fe674822af0042e1e1143a2bc33/friedrich1988.pdf


  2%|▏         | 2/84 [00:10<07:17,  5.33s/it]

Downloading PDF for 10.1016/0014-5793(92)80039-J from https://moscow.sci-hub.se/1591/9677f77cd962bc1ce455f949f004be3f/hiramatsu1992.pdf


  4%|▎         | 3/84 [00:16<07:19,  5.42s/it]

Downloading PDF for 10.1016/j.bbapap.2011.09.011 from https://dacemirror.sci-hub.se/journal-article/048a402a0a76765853a4af15888b0727/patil2011.pdf


  5%|▍         | 4/84 [00:22<07:42,  5.78s/it]

Downloading PDF for 10.1016/j.chom.2016.05.007 from https://zero.sci-hub.se/5378/ad2c05ffb24d553a6edcf3c8c362835c/nairn2016.pdf


  6%|▌         | 5/84 [00:29<07:58,  6.06s/it]

Downloading PDF for 10.1016/s0140-6736(02)08713-5 from https://moscow.sci-hub.se/1748/fa9f1ab2509d0f1c8ca486c8d428b058/baba2002.pdf


  7%|▋         | 6/84 [00:35<07:58,  6.14s/it]

Downloading PDF for 10.1016/s0378-1119(98)00060-2 from https://moscow.sci-hub.se/1775/7f0141abc2ff870974dda5e779bf9fa7/nguyen1998.pdf


  8%|▊         | 7/84 [00:40<07:21,  5.73s/it]

Downloading PDF for 10.1016/S0923-2508(97)87644-9 from https://moscow.sci-hub.se/1785/99b88e2af8dadefa754026bbbf963f66/climent1997.pdf


 10%|▉         | 8/84 [00:45<07:02,  5.56s/it]

Downloading PDF for 10.1021/bi0496945 from https://moscow.sci-hub.se/2670/d6a2c0f21d3aefeb75a095b5f925c3e4/endrizzi2004.pdf


 11%|█         | 9/84 [00:51<06:58,  5.58s/it]

Downloading PDF for 10.1038/35023079 from https://dacemirror.sci-hub.se/journal-article/36b06165fda477407a27b286a8ca1d47/olson2000.pdf


 12%|█▏        | 10/84 [00:56<06:52,  5.57s/it]

Downloading PDF for 10.1038/ncomms15638 from https://dacemirror.sci-hub.se/journal-article/711dcc5046d96cd38fd9d4cb1605f027/marie2017.pdf


 13%|█▎        | 11/84 [01:03<07:17,  6.00s/it]

Downloading PDF for 10.1073/pnas.81.9.2645 from https://zero.sci-hub.se/5515/6a3c431acedb21e7ec47cca91e8aa538/gray1984.pdf


 14%|█▍        | 12/84 [01:09<07:00,  5.84s/it]

Downloading PDF for 10.1074/jbc.273.14.8193 from https://moscow.sci-hub.se/3666/fb481504426b4ea3d2c5b7408b5464ef/sullivan1998.pdf


 15%|█▌        | 13/84 [01:15<06:57,  5.88s/it]

Downloading PDF for 10.1080/21505594.2018.1558693 from https://sci-hub.se/downloads/2019-02-22/23/weidensdorfer2019.pdf


 17%|█▋        | 14/84 [01:20<06:48,  5.84s/it]

Downloading PDF for 10.1099/mic.0.26322-0 from https://zero.sci-hub.se/4065/7240d6ea0477a5fedfbf78c2f63a5eed/broms2003.pdf


 18%|█▊        | 15/84 [01:26<06:32,  5.69s/it]

Downloading PDF for 10.1099/mic.0.26518-0 from https://moscow.sci-hub.se/3802/538f89720da8e7ad30874c7000e3e075/chandu2003.pdf


 19%|█▉        | 16/84 [01:31<06:12,  5.48s/it]

Downloading PDF for 10.1111/j.1365-2958.1993.tb02674.x from https://dacemirror.sci-hub.se/journal-article/da68009e957886d7d5c38dd37b4d1c61/akrim1993.pdf


 20%|██        | 17/84 [01:38<06:45,  6.05s/it]

Downloading PDF for 10.1111/j.1365-2958.1994.tb01046.x from https://moscow.sci-hub.se/916/26906f93e6e12c45945ebbdd035e8859/10.1111@j.1365-2958.1994.tb01046.x.pdf


 21%|██▏       | 18/84 [01:44<06:28,  5.89s/it]

Downloading PDF for 10.1111/j.1574-6968.2006.00251.x from https://moscow.sci-hub.se/983/6cdedafd7fb4654e93beeeb8c8f752c7/srinivasan2006.pdf


 23%|██▎       | 19/84 [01:49<06:06,  5.64s/it]

Downloading PDF for 10.1128/aac.00155-10 from https://zero.sci-hub.se/3702/1cc5827632a18c0e7596dd8021780d7f/coyne2010.pdf


 24%|██▍       | 20/84 [01:54<05:52,  5.50s/it]

Downloading PDF for 10.1128/aac.00534-07 from https://dacemirror.sci-hub.se/journal-article/3b9267d1db79a0c6619e440c840182ac/neoh2007.pdf


 25%|██▌       | 21/84 [01:59<05:38,  5.37s/it]

Downloading PDF for 10.1128/aac.36.8.1791 from https://dacemirror.sci-hub.se/journal-article/8dab1188448863f6b6410ae6a8b47149/yoneyama1992.pdf


 26%|██▌       | 22/84 [02:05<05:36,  5.42s/it]

No embed tag found for 10.1128/iai.63.3.903-910.1995
Error for DOI 10.1128/iai.63.3.903-910.1995: No embed tag found


 27%|██▋       | 23/84 [02:08<04:57,  4.87s/it]

Downloading PDF for 10.1128/jb.170.1.155-162.1988 from https://dacemirror.sci-hub.se/journal-article/42df050951a1cecdcea99927c3a4f753/10.1128@jb.170.1.155-162.1988.pdf


 29%|██▊       | 24/84 [02:14<05:10,  5.17s/it]

Downloading PDF for 10.1159/000076741 from https://moscow.sci-hub.se/3291/204cbc08cec079f27f6af9185a09ca9d/zalacain2004.pdf


 30%|██▉       | 25/84 [02:19<05:05,  5.18s/it]

Downloading PDF for 10.1021/bi5005188 from https://moscow.sci-hub.se/2712/d0755368164280fa8e9caf04b6cbf608/fonner2014.pdf


 31%|███       | 26/84 [02:25<05:09,  5.33s/it]

No embed tag found for 10.1021/bi5005189
Error for DOI 10.1021/bi5005189: No embed tag found


 32%|███▏      | 27/84 [02:29<04:34,  4.82s/it]

No embed tag found for 10.1021/bi5005190
Error for DOI 10.1021/bi5005190: No embed tag found


 33%|███▎      | 28/84 [02:32<04:11,  4.49s/it]

No embed tag found for 10.1021/bi5005191
Error for DOI 10.1021/bi5005191: No embed tag found


 35%|███▍      | 29/84 [02:36<03:52,  4.22s/it]

Downloading PDF for 10.1016/s0021-9258(17)46223-5 from https://sci-hub.se/downloads/2021-05-13/ab/chaudhary1990.pdf


 36%|███▌      | 30/84 [02:41<04:06,  4.57s/it]

Downloading PDF for 10.1111/j.1432-1033.1990.tb19238.x from https://moscow.sci-hub.se/1498/2c6875228f78d2296d8afc50c01b9f90/bourdenet1990.pdf


 37%|███▋      | 31/84 [02:47<04:14,  4.79s/it]

Downloading PDF for 10.1016/s0021-9258(18)42291-0 from https://sci-hub.se/downloads/2021-05-13/c7/kounnas1992.pdf


 38%|███▊      | 32/84 [02:52<04:24,  5.08s/it]

Error for DOI 10.1021/bi991308+: Exceeded 30 redirects.


 39%|███▉      | 33/84 [03:03<05:48,  6.84s/it]

Downloading PDF for 10.1074/jbc.m710008200 from https://zero.sci-hub.se/3657/77784eaf8edae9bb7088a0f9e59787fb/jorgensen2008.pdf


 40%|████      | 34/84 [03:09<05:24,  6.50s/it]

Downloading PDF for 10.1128/aac.01164-10 from https://dacemirror.sci-hub.se/journal-article/3ced160a3c891c209148aa370f5cc54b/turgeon2010.pdf


 42%|████▏     | 35/84 [03:15<05:18,  6.50s/it]

Downloading PDF for 10.1073/pnas.92.20.9308 from https://dacemirror.sci-hub.se/journal-article/5fe3d42121b66aa9af51a3efda39b3ca/li1995.pdf


 43%|████▎     | 36/84 [03:21<05:00,  6.26s/it]

Downloading PDF for 10.1073/pnas.93.14.6902 from https://dacemirror.sci-hub.se/journal-article/33b74d864413bfde88307a3b9dbc89f7/li1996.pdf


 44%|████▍     | 37/84 [03:27<04:52,  6.21s/it]

Downloading PDF for 10.1038/embor.2008.90 from https://moscow.sci-hub.se/1044/3aa757159c463ce6417cd5560acaab95/10.1038@embor.2008.90.pdf


 45%|████▌     | 38/84 [03:33<04:37,  6.02s/it]

Downloading PDF for 10.1159/000468649 from https://sci-hub.se/downloads/2020-05-06/65/10.1159@000468649.pdf


 46%|████▋     | 39/84 [03:38<04:22,  5.84s/it]

Downloading PDF for 10.1016/s0021-9258(18)55452-1 from https://sci-hub.se/downloads/2021-05-15/a2/trias1990.pdf


 48%|████▊     | 40/84 [03:43<04:06,  5.60s/it]

Downloading PDF for 10.1016/0014-5793(96)00945-3 from https://dacemirror.sci-hub.se/journal-article/9a7a8b0a3efa8341a9537ef1c2e50075/yoshihara1996.pdf


 49%|████▉     | 41/84 [03:49<03:56,  5.50s/it]

Downloading PDF for 10.1006/bbrc.1998.8745 from https://dacemirror.sci-hub.se/journal-article/d8125461b1f47f9f1c07a6f3583c5ea3/yoshihara1998.pdf


 50%|█████     | 42/84 [03:54<03:48,  5.44s/it]

Downloading PDF for 10.1016/0378-1097(92)90347-q from https://zero.sci-hub.se/2246/e8d202f0dbf337a17634620a7b21f44f/huang1992.pdf


 51%|█████     | 43/84 [03:59<03:40,  5.39s/it]

Downloading PDF for 10.1016/J.JMB.2016.03.012 from https://moscow.sci-hub.se/5023/28105c0edf3a6cfeea37cf0a1b58e4b9/damatamadeira2016.pdf


 52%|█████▏    | 44/84 [04:05<03:36,  5.42s/it]

Downloading PDF for 10.1016/s0969-2126(99)80055-0 from https://dacemirror.sci-hub.se/journal-article/962c583e54eb270df4d46ce9aff1521e/dutzler1999.pdf


 54%|█████▎    | 45/84 [04:10<03:31,  5.43s/it]

Downloading PDF for 10.1038/35023079 from https://moscow.sci-hub.se/1037/36b06165fda477407a27b286a8ca1d47/olson2000.pdf


 55%|█████▍    | 46/84 [04:15<03:23,  5.34s/it]

Downloading PDF for 10.1073/pnas.0606863103 from https://zero.sci-hub.se/3467/3ef0ed3cd283ae0ea7bce958a8674ea3/stranger-jones2006.pdf


 56%|█████▌    | 47/84 [04:21<03:20,  5.42s/it]

Downloading PDF for 10.1099/00221287-137-8-1911 from https://dacemirror.sci-hub.se/journal-article/f715197607b3cb18ac0767f64e385500/lawrence1991.pdf


 57%|█████▋    | 48/84 [04:27<03:21,  5.60s/it]

Downloading PDF for 10.1128/AAC.00747-12 from https://zero.sci-hub.se/4014/55e8e0fec13945e4a0e7bc4201ecab93/nomura2012.pdf


 58%|█████▊    | 49/84 [04:32<03:11,  5.47s/it]

Downloading PDF for 10.1128/AAC.45.5.1323-1336.2001 from https://moscow.sci-hub.se/3729/7fbab889061ce651e3544a392e6c1b01/ito2001.pdf


 60%|█████▉    | 50/84 [04:38<03:08,  5.54s/it]

Downloading PDF for 10.1128/jb.01000-07 from https://moscow.sci-hub.se/3995/ac1ace2ccd5dca9e45a7eecc9f0f9719/baba2007.pdf


 61%|██████    | 51/84 [04:43<02:58,  5.41s/it]

Downloading PDF for 10.1371/journal.pbio.1001242 from https://zero.sci-hub.se/6921/0c88aa33fcf56f526e327d7ed99ca233/eren2012.pdf


 62%|██████▏   | 52/84 [04:48<02:54,  5.46s/it]

Downloading PDF for 10.1016/0882-4010(92)90047-r from https://zero.sci-hub.se/2515/95cbeb52b69ef9ad62a7e39306a4a9bb/10.1016@0882-40109290047-r.pdf


 63%|██████▎   | 53/84 [04:54<02:48,  5.43s/it]

Downloading PDF for 10.1016/j.jmb.2008.10.021 from https://zero.sci-hub.se/1678/e9dc6ac7746160d8f6363beed838fea1/renault2009.pdf


 64%|██████▍   | 54/84 [04:59<02:41,  5.38s/it]

Downloading PDF for 10.1016/j.jmb.2015.03.016 from https://zero.sci-hub.se/3871/dc3b2c7fd9ab007419b576889ccbe235/zahn2015.pdf


 65%|██████▌   | 55/84 [05:05<02:39,  5.49s/it]

Downloading PDF for 10.1016/s0021-9258(18)47472-8 from https://sci-hub.se/downloads/2021-05-13/83/carroll1987.pdf


 67%|██████▋   | 56/84 [05:10<02:32,  5.43s/it]

Downloading PDF for 10.1016/S0966-842X(01)02175-8 from https://zero.sci-hub.se/6436/8f30215146d0c95a4bd084395f7626ad/hiramatsu2001.pdf


 68%|██████▊   | 57/84 [05:15<02:23,  5.30s/it]

Downloading PDF for 10.1074/jbc.m112.432096 from https://zero.sci-hub.se/3684/af679e063b543a482c481381a4a327b4/vandermeeren2012.pdf


 69%|██████▉   | 58/84 [05:21<02:20,  5.40s/it]

Downloading PDF for 10.4049/jimmunol.180.1.500 from https://moscow.sci-hub.se/3648/8aed82ada05dccb81f20f8626b06fb21/palazzolo-ballance2007.pdf


 70%|███████   | 59/84 [05:26<02:15,  5.42s/it]

Downloading PDF for 10.1128/mbio.01344-17 from https://zero.sci-hub.se/6555/af3bacb4e173e7cb06e6cf6dcedb86f8/hay2017.pdf


 71%|███████▏  | 60/84 [05:32<02:11,  5.50s/it]

No embed tag found for 10.1128/iai.64.12.5284-5289.1996
Error for DOI 10.1128/iai.64.12.5284-5289.1996: No embed tag found


 73%|███████▎  | 61/84 [05:36<01:54,  4.96s/it]

Downloading PDF for 10.1074/jbc.m104554200 from https://moscow.sci-hub.se/3908/d035ad53a7184594d7ca3c2c4e5e35a1/palma2001.pdf


 74%|███████▍  | 62/84 [05:41<01:52,  5.10s/it]

Downloading PDF for 10.1371/journal.ppat.1003816 from https://zero.sci-hub.se/4276/390e01a3757b5310563dbeb3fc5f57f8/ko2013.pdf


 75%|███████▌  | 63/84 [05:47<01:50,  5.28s/it]

No embed tag found for 10.1099/mic.0.000293
Error for DOI 10.1099/mic.0.000293: No embed tag found


 76%|███████▌  | 64/84 [05:50<01:35,  4.79s/it]

Downloading PDF for 10.1110/ps.036624.108 from https://moscow.sci-hub.se/2054/b8b2ab873d23a9e9d93a34563c98d690/haspel2008.pdf


 77%|███████▋  | 65/84 [05:55<01:33,  4.90s/it]

Downloading PDF for 10.1128/AAC.46.4.1147-1152.2002 from https://dacemirror.sci-hub.se/journal-article/6f9f39f174f4d80e59b0ff48c40ba7e4/ma2002.pdf


 79%|███████▊  | 66/84 [06:01<01:32,  5.15s/it]

Downloading PDF for 10.1016/S1368-7646(03)00003-7 from https://dacemirror.sci-hub.se/journal-article/1d78976b5ad6df82a36de2a07ce3d6e0/ito2003.pdf


 80%|███████▉  | 67/84 [06:07<01:29,  5.25s/it]

Downloading PDF for 10.1128/AAC.49.5.2070-2083.2005 from https://moscow.sci-hub.se/4078/0e8a6e374f71dd00e2311800ebdd3696/shore2005.pdf


 81%|████████  | 68/84 [06:12<01:24,  5.26s/it]

Downloading PDF for 10.1016/j.febslet.2006.03.049 from https://moscow.sci-hub.se/1655/a44e33a766e95845006701313a5e3c0f/taneike2006.pdf


 82%|████████▏ | 69/84 [06:17<01:19,  5.31s/it]

Downloading PDF for 10.1128/JCM.01147-07 from https://moscow.sci-hub.se/3665/22bd70f9028b72d47c2c0eef47e9600f/park2007.pdf


 83%|████████▎ | 70/84 [06:22<01:12,  5.15s/it]

Downloading PDF for 10.1093/jac/dkn186 from https://moscow.sci-hub.se/4043/9b6db8cdc8a173fc9d946ee4e2e86be1/kishii2008.pdf


 85%|████████▍ | 71/84 [06:27<01:06,  5.15s/it]

Downloading PDF for 10.1128/AAC.01118-08 from https://moscow.sci-hub.se/3668/e538c9b624ca12c68125be3f338bf41c/zhang2008.pdf


 86%|████████▌ | 72/84 [06:33<01:04,  5.38s/it]

Downloading PDF for 10.1093/jac/dkn435 from https://moscow.sci-hub.se/4176/977860e57d59138b405ddd02bf04bd42/berglund2008.pdf


 87%|████████▋ | 73/84 [06:38<00:58,  5.30s/it]

Downloading PDF for 10.1128/JCM.00766-09 from https://zero.sci-hub.se/3933/48e8f86e99f577694dadd6e1fb208032/chen2009.pdf


 88%|████████▊ | 74/84 [06:43<00:52,  5.25s/it]

Downloading PDF for 10.1099/jmm.0.009688-0 from https://zero.sci-hub.se/3664/f5007eb3131f1c3c73762dae2947d2cb/park2009.pdf


 89%|████████▉ | 75/84 [06:49<00:47,  5.26s/it]

Downloading PDF for 10.1093/jac/dkq252 from https://zero.sci-hub.se/3727/c30ff7206bd72259218874b220b70a65/jones2010.pdf


 90%|█████████ | 76/84 [06:54<00:41,  5.21s/it]

Downloading PDF for 10.1371/journal.pone.0016193 from https://dacemirror.sci-hub.se/journal-article/3925e009545cb4a9a552ef1a58f85e4b/bartels2011.pdf


 92%|█████████▏| 77/84 [06:59<00:36,  5.27s/it]

Downloading PDF for 10.1093/jac/dks069 from https://moscow.sci-hub.se/1139/4f5f5772a7138058198a681887087f7d/mendes2012.pdf


 93%|█████████▎| 78/84 [07:04<00:31,  5.24s/it]

Downloading PDF for 10.1093/jac/dks157 from https://moscow.sci-hub.se/1117/a0f288b8bfef583b6271b8f114adcb20/urushibara2012.pdf


 94%|█████████▍| 79/84 [07:09<00:25,  5.15s/it]

Downloading PDF for 10.1128/AAC.01321-13 from https://moscow.sci-hub.se/3879/7661a9a95c0f06745a60a5e539e4b7ef/sabat2013.pdf


 95%|█████████▌| 80/84 [07:15<00:20,  5.21s/it]

Downloading PDF for 10.1371/journal.pone.0101419 from https://dacemirror.sci-hub.se/journal-article/cb9e4383bc38e15e0ead16ba591de582/hill-cawthorne2014.pdf


 96%|█████████▋| 81/84 [07:21<00:16,  5.55s/it]

Downloading PDF for 10.1128/AAC.01745-15 from https://dacemirror.sci-hub.se/journal-article/659d84db2aef71af3d5ee4788e9e7434/monecke2015.pdf


 98%|█████████▊| 82/84 [07:27<00:11,  5.72s/it]

Downloading PDF for 10.1093/infdis/jiv320 from https://moscow.sci-hub.se/4106/45faf0a7a3e85c4b25919f68aef81d18/planet2015.pdf


 99%|█████████▉| 83/84 [07:33<00:05,  5.66s/it]

Downloading PDF for 10.1021/jacs.6b12565 from https://dacemirror.sci-hub.se/journal-article/ad315d911cb80d53ececc9cc645283ff/mahasenan2017.pdf


100%|██████████| 84/84 [07:38<00:00,  5.46s/it]


In [3]:
"""Python script that will help view and delete duplicates from a target text file 
by comparing it to another reference text file. The script reads both files, identifies duplicates, 
and then allows you to delete the duplicates from the target file."""

# Function to read a text file and return its lines as a set
def read_file_as_set(filename):
    with open(filename, 'r') as file:
        return set(line.strip() for line in file if line.strip())

# Function to write updated contents to the target file
def write_to_file(filename, lines):
    with open(filename, 'w') as file:
        for line in lines:
            file.write(f"{line}\n")

# Main function to handle viewing and deleting duplicates
def remove_duplicates_from_file(target_file, reference_file):
    # Read both files into sets
    target_lines = read_file_as_set(target_file)
    reference_lines = read_file_as_set(reference_file)
    
    # Identify duplicates
    duplicates = target_lines.intersection(reference_lines)
    
    # Display duplicates to the user
    if duplicates:
        print("Duplicates found:")
        for dup in duplicates:
            print(dup)
        
        # Ask user if they want to delete duplicates
        confirm = input("\nDo you want to delete these duplicates from the target file? (yes/no): ").lower()
        if confirm == 'yes':
            # Remove duplicates from target lines
            target_lines.difference_update(duplicates)
            # Write updated target lines back to the file
            write_to_file(target_file, target_lines)
            print(f"Duplicates removed from {target_file}.")
        else:
            print("No changes made to the target file.")
    else:
        print("No duplicates found.")

# Example usage
if __name__ == "__main__":
    target_file = 'DOIs without PubMed ids.txt'      # File from which to remove duplicates
    reference_file = 'PDFs_Found.txt'  # File containing lines to be checked for duplicates
    remove_duplicates_from_file(target_file, reference_file)


No duplicates found.


In [3]:
""" Get Article Details by DOIs """
import requests
import openpyxl

def get_paper_details(doi):
    url = f"https://api.crossref.org/works/{doi}"
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()
        
        # Safely extract the title
        titles = data['message'].get('title', [])
        title = titles[0] if titles else 'Title not found'
        
        # Safely extract the year
        year = data['message'].get('published-print', {}).get('date-parts', [[None]])[0][0] or \
               data['message'].get('published-online', {}).get('date-parts', [[None]])[0][0]
        
        year = year if year else 'Year not found'
        return title, year
    else:
        return "Title not found", "Year not found"

def get_details_from_dois(doi_list):
    details = []
    for doi in doi_list:
        doi = doi.strip()  # Remove any extra spaces or newlines
        if doi:  # Ensure the DOI is not empty
            title, year = get_paper_details(doi)
            details.append([doi, title, year])
    return details

def read_dois_from_file(file_path):
    with open(file_path, 'r') as file:
        dois = file.readlines()
    return dois

def save_to_excel(details, output_file):
    # Create an Excel workbook and select the active worksheet
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    
    # Write headers
    sheet.append(["DOI", "Title", "Year"])
    
    # Write the details row by row
    for detail in details:
        sheet.append(detail)
    
    # Save the workbook to the specified file
    workbook.save(output_file)

if __name__ == "__main__":
    # Path to your text file containing DOIs
    file_path = "Downloaded_DOIs_New.txt"
    
    # Output Excel file path
    output_file = "doi_titles_years.xlsx"
    
    # Read DOIs from the file
    dois = read_dois_from_file(file_path)
    
    # Fetch titles and publication years
    details = get_details_from_dois(dois)
    
    # Save the information to an Excel file
    save_to_excel(details, output_file)
    
    print(f"Details saved to {output_file}")


Details saved to doi_titles_years.xlsx


In [1]:
""" Get DOIs by Article Titles """
import requests
from tqdm import tqdm
from time import sleep

# Read the list of article titles from a file
with open('titles.txt', 'r') as file:
    titles = [line.strip() for line in file]

# Prepare output files for found and not found DOIs
output_file = 'found_dois.txt'
not_found_file = 'not_found_titles.txt'

print('\nSearching DOIs by titles...')

for title in tqdm(titles):
    try:
        # Query CrossRef API with the title
        response = requests.get(
            'https://api.crossref.org/works',
            params={'query.title': title, 'rows': 1},
            headers={'User-Agent': 'DOI Finder Script'}
        )
        
        # Check if the request was successful
        if response.status_code != 200:
            print(f"Failed to retrieve DOI for '{title}', status code: {response.status_code}")
            raise Exception("Invalid response from CrossRef API")

        data = response.json()

        # Check if any results were returned
        if data['message']['items']:
            doi = data['message']['items'][0].get('DOI', 'No DOI found')
            print(f"Found DOI for '{title}': {doi}")

            # Save the DOI and title to the output file
            with open(output_file, 'a') as found_file:
                found_file.write(f"{title}\t{doi}\n")
        else:
            print(f"No DOI found for '{title}'")
            with open(not_found_file, 'a') as not_found:
                not_found.write(f"{title}\n")

    except Exception as e:
        print(f"Error for title '{title}': {str(e)}")
        with open(not_found_file, 'a') as not_found:
            not_found.write(f"{title}\n")

    # Pause to avoid rate-limiting
    sleep(1)



Searching DOIs by titles...


  0%|          | 0/73 [00:00<?, ?it/s]

Found DOI for 'Cloning of 19kDa fibrinogen-binding protein gene from Staphylococcus aureus': 10.1111/j.1365-2958.1994.tb01046.x


  1%|▏         | 1/73 [00:03<04:31,  3.77s/it]

Found DOI for 'Toll-Like Receptor 2-Dependent Inhibition of Macrophage Class II MHC Expression and Antigen Processing by 19-kDa Lipoprotein of Mycobacterium tuberculosis9': 10.4049/jimmunol.167.2.910


  3%|▎         | 2/73 [00:07<04:17,  3.63s/it]

Found DOI for 'The 38-kDa Antigen of Mycobacterium tuberculosis Induced in Vitro Potent Activation of Human Monocytes3': 10.1016/s0962-8479(97)90019-8


  4%|▍         | 3/73 [00:09<03:41,  3.16s/it]

Found DOI for 'Two novel arginine catabolic mobile elements in CA-MRSA': 10.1093/jac/dks157


  5%|▌         | 4/73 [00:12<03:24,  2.96s/it]

Found DOI for 'Structural Basis of Drug Export by the Escherichia coli RND Efflux Pump AcrB6': 10.21203/rs.3.rs-4227351/v1


  7%|▋         | 5/73 [00:14<03:08,  2.77s/it]

Found DOI for 'Overexpression of Resistance-Nodulation-Cell Division Pump AdeFGH Confers Multidrug Resistance in Acinetobacter baumannii': 10.1128/aac.00155-10


  8%|▊         | 6/73 [00:17<02:54,  2.61s/it]

Found DOI for 'Characterization of Antivirulence Compounds Inhibiting mART Toxins (2011)': 10.1128/aac.01164-10


 10%|▉         | 7/73 [00:19<02:44,  2.50s/it]

Found DOI for 'Parallel Epidemics of Community-Associated MRSA USA300': 10.1093/ofid/ofx163.200


 11%|█         | 8/73 [00:21<02:29,  2.30s/it]

Found DOI for 'The Acinetobacter trimeric autotransporter adhesin Ata': 10.1080/21505594.2018.1558693


 12%|█▏        | 9/73 [00:23<02:23,  2.24s/it]

Found DOI for 'Small-Molecule Transport by CarO, an Abundant Eight-Stranded ÃŽÂ²-Barrel Outer Membrane Protein from Acinetobacter baumannii': 10.1016/j.jmb.2015.03.016


 14%|█▎        | 10/73 [00:28<03:05,  2.95s/it]

Found DOI for 'Ceftaroline activity against pathogens associated with complicated skin infections': 10.1093/jac/dkq252


 15%|█▌        | 11/73 [00:30<02:47,  2.69s/it]

Found DOI for 'Crystal Structure of Escherichia coli Cytidine Triphosphate Synthetase, a Nucleotide-Regulated Glutamine Amidotransferase/ATP-Dependent Amidoligase Fusion Protein': 10.1021/bi0496945


 16%|█▋        | 12/73 [00:33<02:47,  2.75s/it]

Found DOI for 'Electrostatic contributions in Staphylococcus aureus protein Efb-C': 10.1110/ps.036624.108


 18%|█▊        | 13/73 [00:35<02:33,  2.57s/it]

Found DOI for 'Biochemical and Immunochemical Studies of Proteolytic Fragments of Exotoxin A from Pseudomonas aeruginosa': 10.1111/j.1432-1033.1990.tb19238.x


 19%|█▉        | 14/73 [00:37<02:27,  2.50s/it]

Found DOI for 'Cloning, nucleotide sequence, and expression in Escherichia coli of the exotoxin A structural gene of Pseudomonas aeruginosa': 10.1073/pnas.81.9.2645


 21%|██        | 15/73 [00:40<02:27,  2.54s/it]

Found DOI for 'The nature and character of the transition state for the ADP-ribosyltransferase reaction': 10.1038/embor.2008.90


 22%|██▏       | 16/73 [00:42<02:18,  2.44s/it]

Found DOI for 'The crystal structure of Pseudomonas aeruginosa exotoxin domain III with nicotinamide and AMP': 10.2210/pdb1dma/pdb


 23%|██▎       | 17/73 [00:45<02:21,  2.53s/it]

Found DOI for 'Extracellular Fibrinogen-binding Protein from Staphylococcus aureus': 10.1128/iai.57.8.2358-2363.1989


 25%|██▍       | 18/73 [00:48<02:29,  2.72s/it]

Found DOI for 'Evidence for three different fibrinogen-binding proteins with unique properties from Staphylococcus aureus strain Newman': 10.1016/0882-4010(92)90047-r


 26%|██▌       | 19/73 [00:51<02:38,  2.93s/it]

Found DOI for 'Functional Characterization of AAA Family FtsH Protease of Mycobacterium tuberculosis': 10.1111/j.1574-6968.2006.00251.x


 27%|██▋       | 20/73 [00:53<02:23,  2.71s/it]

Found DOI for 'Molecular Cloning of Human GDP-mannose 4,6-Dehydratase': 10.1074/jbc.273.14.8193


 29%|██▉       | 21/73 [00:55<02:10,  2.51s/it]

Found DOI for 'Molecular and evolutionary relationships among enteric bacteria': 10.1099/00221287-137-8-1911


 30%|███       | 22/73 [00:58<02:04,  2.43s/it]

Found DOI for 'Mutated Response Regulator graR in Staphylococcus aureus and its Role in Vancomycin Resistance': 10.1128/aac.00534-07


 32%|███▏      | 23/73 [01:00<02:01,  2.42s/it]

Found DOI for 'Vaccine assembly from surface proteins of Staphylococcus aureus': 10.1073/pnas.0606863103


 33%|███▎      | 24/73 [01:03<02:06,  2.57s/it]

Found DOI for 'Solution Structure and Molecular Determinants of Hemoglobin Binding of the First NEAT Domain of IsdB in Staphylococcus aureus': 10.2210/pdb2moq/pdb


 34%|███▍      | 25/73 [01:06<02:08,  2.68s/it]

Found DOI for 'MdfA, an Escherichia coli Multidrug Resistance Protein with a Substrate Preference for Cationic Drugs4': 10.1038/cr.2015.94


 36%|███▌      | 26/73 [01:08<01:59,  2.53s/it]

Found DOI for 'Identification and Characterization of the BaeSR Two-Component Regulatory System Required for Multidrug Resistance in Escherichia coli5': 10.1128/aem.02271-07


 37%|███▋      | 27/73 [01:11<01:58,  2.58s/it]

Found DOI for 'Novel Pseudo-SCCmec in MRSA Strain WA-MRSA-59 (2015)': 10.1007/s10096-011-1243-9


 38%|███▊      | 28/73 [01:13<01:50,  2.47s/it]

Found DOI for 'Structure of the K-12 O Antigen and rfb Sequence1': 10.1128/jb.176.13.4144-4156.1994


 40%|███▉      | 29/73 [01:15<01:43,  2.36s/it]

Found DOI for 'Genetic Variation at the O-Antigen Biosynthetic Locus in Pseudomonas aeruginosa2': 10.1128/jb.184.13.3614-3622.2002


 41%|████      | 30/73 [01:17<01:37,  2.26s/it]

Found DOI for 'Molecular characterization of a 17-kDa outer-membrane protein from Klebsiella pneumoniae': 10.1016/s0923-2508(97)87644-9


 42%|████▏     | 31/73 [01:19<01:35,  2.27s/it]

Found DOI for 'Crystal structure and functional characterization of OmpK36, the osmoporin of Klebsiella pneumoniae': 10.1016/s0969-2126(99)80055-0


 44%|████▍     | 32/73 [01:23<01:52,  2.74s/it]

Found DOI for 'Analysis of two gene regions involved in the expression of the imipenem-specific, outer membrane porin protein OprD of Pseudomonas aeruginosa': 10.1016/0378-1097(92)90347-q


 45%|████▌     | 33/73 [01:26<01:52,  2.82s/it]

Found DOI for 'Solution State NMR Structure and Dynamics of KpOmpA, a 210 Residue Transmembrane Domain Possessing a High Potential for Immunological Applications': 10.1016/j.jmb.2008.10.021


 47%|████▋     | 34/73 [01:30<01:57,  3.02s/it]

Found DOI for 'Chromosomal sequencing using a PCR-based biotin-capture method allowed isolation of the complete gene for the outer membrane protein A of Klebsiella pneumoniae': 10.1016/s0378-1119(98)00060-2


 48%|████▊     | 35/73 [01:33<01:58,  3.13s/it]

Found DOI for 'Molecular nature of methicillin-resistant Staphylococcus aureus derived from explosive nosocomial outbreaks of the 1980s in Japan': 10.1016/j.febslet.2006.03.049


 49%|████▉     | 36/73 [01:36<01:52,  3.04s/it]

Found DOI for 'Conformational dynamics in penicillin-binding protein 2a of methicillin-resistant Staphylococcus aureus, allosteric communication network and enablement of catalysis': 10.1021/jacs.6b12565.s002


 51%|█████     | 37/73 [01:39<01:47,  3.00s/it]

Found DOI for 'Characterization of MRSA with increased MICs of ceftaroline': 10.1093/jac/dks069


 52%|█████▏    | 38/73 [01:41<01:39,  2.83s/it]

Found DOI for 'Molecular cloning and nucleotide sequence determination of the regulator region of mecA gene in methicillin-resistant Staphylococcus aureus': 10.1016/0014-5793(92)80039-j


 53%|█████▎    | 39/73 [01:44<01:36,  2.84s/it]

Found DOI for 'PepN as the major aminopeptidase in E. coli': 10.1016/0378-1119(86)90366-5


 55%|█████▍    | 40/73 [01:46<01:25,  2.60s/it]

Found DOI for 'Mycobacterium smegmatis Expressing the Species-Specific Mycobacterium tuberculosis pks15/1 Gene Produces a Novel Glycolipid8': 10.1016/j.ijmyco.2016.10.033


 56%|█████▌    | 41/73 [01:49<01:24,  2.63s/it]

Found DOI for 'Structural basis of lipid targeting and destruction by the Type V secretion system of Pseudomonas aeruginosa': 10.1016/j.jmb.2016.03.012


 58%|█████▊    | 42/73 [01:52<01:22,  2.66s/it]

Found DOI for 'Sequence and Transcriptional Start of Porin F Gene in P. aeruginosa (1988)': 10.1128/jb.170.1.155-162.1988


 59%|█████▉    | 43/73 [01:54<01:14,  2.47s/it]

Found DOI for 'Nucleotide Sequence of the Protein D2 Gene of Pseudomonas aeruginosa': 10.1128/aac.36.8.1791


 60%|██████    | 44/73 [01:56<01:09,  2.41s/it]

Found DOI for 'Identification of the Catalytic Triad of the Protein D2 Protease in Pseudomonas aeruginosa': 10.1006/bbrc.1998.8745


 62%|██████▏   | 45/73 [01:58<01:05,  2.35s/it]

Found DOI for 'Protein D2 channel of the Pseudomonas aeruginosa outer membrane has a binding site for basic amino acids and peptides': 10.1016/s0021-9258(18)55452-1


 63%|██████▎   | 46/73 [02:01<01:08,  2.55s/it]

Found DOI for 'Protein D2 porin of the Pseudomonas aeruginosa outer membrane bears the protease activity': 10.1016/0014-5793(96)00945-3


 64%|██████▍   | 47/73 [02:04<01:04,  2.49s/it]

Found DOI for 'Mutagenesis of Pseudomonas exotoxin in identification of sequences responsible for the animal toxicity': 10.1016/s0021-9258(17)46223-5


 66%|██████▌   | 48/73 [02:06<01:02,  2.51s/it]

Found DOI for 'The alpha 2-macroglobulin receptor/low density lipoprotein receptor-related protein binds and internalizes Pseudomonas exotoxin A': 10.1016/s0021-9258(18)42291-0


 67%|██████▋   | 49/73 [02:09<01:01,  2.56s/it]

Found DOI for 'Active Site of Pseudomonas aeruginosa Exotoxin A: Glutamic Acid 553 is Photolabeled by NAD and Shows Functional Homology with Glutamic Acid 148 of Diphtheria Toxin': 10.1016/s0021-9258(18)47472-8


 68%|██████▊   | 50/73 [02:11<00:56,  2.48s/it]

Found DOI for 'Crystal structure of the catalytic domain of Pseudomonas exotoxin A complexed with a NAD analog': 10.1073/pnas.93.14.6902


 70%|██████▉   | 51/73 [02:14<00:58,  2.65s/it]

Found DOI for 'Mycobacterium leprae RecA is structurally analogous but functionally distinct from Mycobacterium tuberculosis RecA protein': 10.1016/j.bbapap.2011.09.011


 71%|███████   | 52/73 [02:17<00:58,  2.80s/it]

Found DOI for 'Mycobacterium tuberculosis Rv3802c (HsaD) Is a Salicylate Synthase Which Catalyzes the Conversion of Chorismate to Salicylate7': 10.2210/pdb2o11/pdb


 73%|███████▎  | 53/73 [02:20<00:54,  2.73s/it]

Found DOI for 'Structural Comparison of Staphylococcal Cassette Chromosome mec in Methicillin-Resistant Staphylococcus aureus': 10.15395/mkb.v42n4.28


 74%|███████▍  | 54/73 [02:24<00:58,  3.07s/it]

Found DOI for 'The emergence and evolution of methicillin-resistant Staphylococcus aureus': 10.1016/s0966-842x(01)02175-8


 75%|███████▌  | 55/73 [02:26<00:50,  2.80s/it]

Found DOI for 'Insights on antibiotic resistance of Staphylococcus aureus from its whole genome: genomic island SCC': 10.1016/s1368-7646(03)00003-7


 77%|███████▋  | 56/73 [02:29<00:50,  2.95s/it]

Found DOI for 'CA-MRSA SCCmec Type IVA in South Korea (2007)': 10.1136/vr.161.1.35


 78%|███████▊  | 57/73 [02:31<00:43,  2.74s/it]

Found DOI for 'Genetic diversity of MRSA carrying type IV SCCmec in Sweden': 10.1093/jac/dkn435


 79%|███████▉  | 58/73 [02:34<00:39,  2.65s/it]

Found DOI for 'Multiplex PCR for SCCmec Typing in MRSA (2009)': 10.1016/s0924-8579(07)71142-1


 81%|████████  | 59/73 [02:36<00:35,  2.53s/it]

Found DOI for 'Novel SCCmec Composite Island with Arginine Catabolic Mobile Element in MRSA (2013)': 10.1093/jac/dky399


 82%|████████▏ | 60/73 [02:39<00:33,  2.55s/it]

Found DOI for 'Novel Staphylococcal Cassette Chromosome mec Type VIII in MRSA (2009)': 10.1385/1-59745-468-0:87


 84%|████████▎ | 61/73 [02:41<00:29,  2.45s/it]

Found DOI for 'The Staphylococcal Superantigen-Like Protein 7 Binds IgA and Complement C5 and Inhibits IgA-FcÎ±RI Binding and Serum Killing of Bacteria10': 10.4049/jimmunol.174.5.2926


 85%|████████▍ | 62/73 [02:44<00:28,  2.55s/it]

Found DOI for 'Two variants of SCC mec type IVA in CA-MRSA strains in South Korea': 10.1099/jmm.0.009688-0


 86%|████████▋ | 63/73 [02:46<00:24,  2.46s/it]

Found DOI for 'Identification, cloning and sequence of the Streptococcus faecium infB (translational initiation factor IF2) gene': 10.1007/bf00330501


 88%|████████▊ | 64/73 [02:48<00:21,  2.41s/it]

Found DOI for 'Structural Basis of Type 2 Secretion System in P. aeruginosa (2017)': 10.1016/j.jmb.2016.03.012


 89%|████████▉ | 65/73 [02:51<00:19,  2.39s/it]

Found DOI for 'Novel Type of Staphylococcal Cassette Chromosome mec in Community-Acquired Methicillin-Resistant Staphylococcus aureus': 10.21101/cejph.a4979


 90%|█████████ | 66/73 [02:54<00:17,  2.54s/it]

Found DOI for 'Genome and virulence determinants of high virulence community-acquired MRSA': 10.1016/s0140-6736(02)08713-5


 92%|█████████▏| 67/73 [02:56<00:15,  2.53s/it]

Found DOI for 'VanN-Type Transferable Vancomycin Resistance in Enterococcus faecium': 10.1128/aac.00714-11


 93%|█████████▎| 68/73 [02:58<00:12,  2.44s/it]

Found DOI for 'Identification of VanN-Type Vancomycin Resistance in Enterococcus faecium from Chicken Meat in Japan': 10.1128/aac.00747-12


 95%|█████████▍| 69/73 [03:01<00:10,  2.66s/it]

Found DOI for 'Xcp-mediated protein secretion in Pseudomonas aeruginosa': 10.1007/978-3-642-73184-6_29


 96%|█████████▌| 70/73 [03:03<00:07,  2.48s/it]

Found DOI for 'New Insights into the Assembly of Bacterial Secretins': 10.1074/jbc.m112.432096


 97%|█████████▋| 71/73 [03:06<00:05,  2.53s/it]

Found DOI for 'Dissection of homologous translocon operons reveals role for YopD in Yersinia pseudotuberculosis': 10.1099/mic.0.26322-0


 99%|█████████▊| 72/73 [03:08<00:02,  2.45s/it]

Found DOI for 'The Response of Acinetobacter baumannii to Zinc Starvation': 10.1016/j.chom.2016.05.007


100%|██████████| 73/73 [03:11<00:00,  2.62s/it]
