In [9]:
from pymongo import MongoClient
from semanticscholar import SemanticScholar
from tqdm import tqdm

In [10]:
client = MongoClient("mongodb://127.0.0.1:27017/")
db = client["semantic_scholar"]

In [11]:
collection = db["papers"]

In [13]:
params = {
    'query': '',
    'fields': [], # [] if all
    'fields_of_study': [], # [] if all
    'year': None, # None if all
    'limit': 100
}

In [14]:
def get_paper_overalls(sch, query, year, limit=100, fields_of_study=None, fields=None):
    local_params = {
        'query': query,
        'limit': limit
    }
    if limit >= 100:
        local_params['limit'] = 100
    if year is not None:
        local_params['year'] = year
    if fields_of_study is not None:
        local_params['fields_of_study'] = fields_of_study
    if fields is not None:
        local_params['fields'] = fields
    
    results = sch.search_paper(**local_params)
    return [results, len(results)]

In [15]:
def main(local_params, how_many=250):
    paper_overalls = list()
    sch = SemanticScholar()
    sch.timeout = round(how_many*0.0625 + 1) + 2
    package = get_paper_overalls(sch=sch, **local_params)
    sch_iter = package[0]
    content = sch_iter.items
    paper_overalls.append(content)
    count = package[1]
    with tqdm(total=how_many) as pbar:
        pbar.update(count)
        if count < how_many:
            while count < how_many:
                try:
                    sch_iter.next_page()
                    content = sch_iter.items
                    if type(content[0]) != list:
                        paper_overalls.append(content[0])
                    else:
                        paper_overalls += content[0]
                    count += package[1]
                    pbar.update(package[1])
                except Exception as e:
                    print(e)
                    print(package[0])
                    break
    return papers_to_dicts(paper_overalls[0])

In [16]:
def papers_to_dicts(p_list):
    d_list = list()
    for p in p_list:
        p_dict =p.__dict__['_data']
        d_list.append(p_dict)

    return d_list

In [19]:
ai_topics = [
    "Genetics and AI", "Physics and AI", "Neuroscience and AI", "Astronomy and AI",
    "Chemistry and AI", "Ecology and AI", "Mathematics and AI", "Geology and AI",
    "Bioinformatics and AI", "Psychology and AI", "Biology and AI", "Anthropology and AI",
    "Medicine and AI", "Sociology and AI", "Economics and AI", "Linguistics and AI",
    "History and AI", "Geography and AI", "Engineering and AI", "Cybersecurity and AI",
    "Robotics and AI", "Genomics and AI", "Climatology and AI", "Finance and AI",
    "Literary Studies and AI", "Philosophy and AI", "Archaeology and AI", "Nanotechnology and AI",
    "Political Science and AI", "Music and AI", "Gender Studies and AI", "Cosmology and AI",
    "Pedagogy and AI", "Informatics and AI", "Ethology and AI", "Marketing and AI",
    "Pharmacology and AI", "Rhetoric and AI", "Theology and AI", "Design and AI",
    "Psychiatry and AI", "Architecture and AI", "Jurisprudence and AI", "Genetic Engineering and AI",
    "Psychophysiology and AI", "Astrophysics and AI", "Econometrics and AI", "Cultural Studies and AI",
    "Biochemistry and AI", "Sports Sciences and AI"
]

In [20]:
result_all = list()

In [21]:
for topic in ai_topics:
    params['query'] = topic
    result = main(params, how_many=1000)
    result_all += result

100%|██████████| 1000/1000 [02:35<00:00,  6.45it/s]
100%|██████████| 1000/1000 [02:19<00:00,  7.18it/s]
100%|██████████| 1000/1000 [03:25<00:00,  4.86it/s]
 30%|███       | 300/1000 [00:04<00:09, 74.94it/s]


No more pages to fetch.
<semanticscholar.PaginatedResults.PaginatedResults object at 0x0000024E46B8F9A0>


100%|██████████| 1000/1000 [01:43<00:00,  9.66it/s]
 90%|█████████ | 900/1000 [02:39<00:17,  5.64it/s]


No more pages to fetch.
<semanticscholar.PaginatedResults.PaginatedResults object at 0x0000024E46A04A90>


100%|██████████| 1000/1000 [04:10<00:00,  3.99it/s]
100%|██████████| 1000/1000 [01:57<00:00,  8.50it/s]
 90%|█████████ | 900/1000 [01:54<00:12,  7.87it/s]


No more pages to fetch.
<semanticscholar.PaginatedResults.PaginatedResults object at 0x0000024E46CB3FA0>


100%|██████████| 1000/1000 [04:12<00:00,  3.96it/s]
100%|██████████| 1000/1000 [02:32<00:00,  6.57it/s]
 50%|█████     | 500/1000 [02:44<02:44,  3.04it/s]


No more pages to fetch.
<semanticscholar.PaginatedResults.PaginatedResults object at 0x0000024E472A9490>


100%|██████████| 1000/1000 [02:40<00:00,  6.24it/s]
 80%|████████  | 800/1000 [01:19<00:19, 10.01it/s]


No more pages to fetch.
<semanticscholar.PaginatedResults.PaginatedResults object at 0x0000024E495E5460>


100%|██████████| 1000/1000 [06:45<00:00,  2.46it/s]
100%|██████████| 1000/1000 [03:02<00:00,  5.48it/s]
100%|██████████| 1000/1000 [06:05<00:00,  2.74it/s]
100%|██████████| 1000/1000 [02:58<00:00,  5.60it/s]
100%|██████████| 1000/1000 [02:56<00:00,  5.66it/s]
100%|██████████| 1000/1000 [04:29<00:00,  3.70it/s]
100%|██████████| 1000/1000 [03:27<00:00,  4.82it/s]
100%|██████████| 1000/1000 [03:35<00:00,  4.64it/s]
 20%|██        | 200/1000 [00:32<02:09,  6.19it/s]


No more pages to fetch.
<semanticscholar.PaginatedResults.PaginatedResults object at 0x0000024E495E5AC0>


100%|██████████| 1000/1000 [02:52<00:00,  5.79it/s]
 40%|████      | 400/1000 [00:36<00:54, 10.94it/s]


No more pages to fetch.
<semanticscholar.PaginatedResults.PaginatedResults object at 0x0000024E46CB3D00>


100%|██████████| 1000/1000 [02:02<00:00,  8.17it/s]
 50%|█████     | 500/1000 [01:09<01:09,  7.16it/s]


No more pages to fetch.
<semanticscholar.PaginatedResults.PaginatedResults object at 0x0000024E4982C1C0>


 50%|█████     | 500/1000 [00:39<00:39, 12.66it/s] 


No more pages to fetch.
<semanticscholar.PaginatedResults.PaginatedResults object at 0x0000024E490FE7F0>


 70%|███████   | 700/1000 [01:28<00:38,  7.89it/s]


No more pages to fetch.
<semanticscholar.PaginatedResults.PaginatedResults object at 0x0000024E495E5160>


100%|██████████| 1000/1000 [01:28<00:00, 11.25it/s]
100%|██████████| 1000/1000 [02:22<00:00,  7.02it/s]
 20%|██        | 200/1000 [01:04<04:18,  3.10it/s]


No more pages to fetch.
<semanticscholar.PaginatedResults.PaginatedResults object at 0x0000024E469F7820>


100%|██████████| 1000/1000 [04:11<00:00,  3.97it/s]
 20%|██        | 200/1000 [00:34<02:17,  5.82it/s]


No more pages to fetch.
<semanticscholar.PaginatedResults.PaginatedResults object at 0x0000024E4C204460>


  5%|▍         | 49/1000 [00:00<?, ?it/s]


No more pages to fetch.
<semanticscholar.PaginatedResults.PaginatedResults object at 0x0000024E490FE4F0>


100%|██████████| 1000/1000 [02:26<00:00,  6.81it/s]
100%|██████████| 1000/1000 [04:03<00:00,  4.10it/s]
 30%|███       | 300/1000 [00:35<01:21,  8.55it/s]


No more pages to fetch.
<semanticscholar.PaginatedResults.PaginatedResults object at 0x0000024E52D58B80>


 30%|███       | 300/1000 [00:04<00:09, 72.57it/s]


No more pages to fetch.
<semanticscholar.PaginatedResults.PaginatedResults object at 0x0000024E51518730>


100%|██████████| 1000/1000 [01:59<00:00,  8.35it/s]
 50%|█████     | 500/1000 [02:16<02:16,  3.67it/s]


No more pages to fetch.
<semanticscholar.PaginatedResults.PaginatedResults object at 0x0000024E4BF50A00>


100%|██████████| 1000/1000 [01:28<00:00, 11.34it/s]
 30%|███       | 300/1000 [01:36<03:44,  3.11it/s]


No more pages to fetch.
<semanticscholar.PaginatedResults.PaginatedResults object at 0x0000024E52D58970>


 60%|██████    | 600/1000 [01:13<00:48,  8.21it/s] 


No more pages to fetch.
<semanticscholar.PaginatedResults.PaginatedResults object at 0x0000024E472A99A0>


  1%|          | 11/1000 [00:00<?, ?it/s]


No more pages to fetch.
<semanticscholar.PaginatedResults.PaginatedResults object at 0x0000024E46B8F520>


 20%|██        | 200/1000 [00:01<00:05, 141.78it/s]


No more pages to fetch.
<semanticscholar.PaginatedResults.PaginatedResults object at 0x0000024E490FE190>


 20%|██        | 200/1000 [00:01<00:07, 108.87it/s]


No more pages to fetch.
<semanticscholar.PaginatedResults.PaginatedResults object at 0x0000024E46DAF190>


100%|██████████| 1000/1000 [01:30<00:00, 11.07it/s]
 60%|██████    | 600/1000 [04:19<02:53,  2.31it/s]


No more pages to fetch.
<semanticscholar.PaginatedResults.PaginatedResults object at 0x0000024E495E5AF0>


 20%|██        | 200/1000 [00:32<02:11,  6.10it/s]

No more pages to fetch.
<semanticscholar.PaginatedResults.PaginatedResults object at 0x0000024E46CB3FA0>





In [22]:
len(result_all)

35246

In [23]:
collection.insert_many(result_all)

InsertManyResult([ObjectId('664cc750e916764cf4cc72aa'), ObjectId('664cc750e916764cf4cc72ab'), ObjectId('664cc750e916764cf4cc72ac'), ObjectId('664cc750e916764cf4cc72ad'), ObjectId('664cc750e916764cf4cc72ae'), ObjectId('664cc750e916764cf4cc72af'), ObjectId('664cc750e916764cf4cc72b0'), ObjectId('664cc750e916764cf4cc72b1'), ObjectId('664cc750e916764cf4cc72b2'), ObjectId('664cc750e916764cf4cc72b3'), ObjectId('664cc750e916764cf4cc72b4'), ObjectId('664cc750e916764cf4cc72b5'), ObjectId('664cc750e916764cf4cc72b6'), ObjectId('664cc750e916764cf4cc72b7'), ObjectId('664cc750e916764cf4cc72b8'), ObjectId('664cc750e916764cf4cc72b9'), ObjectId('664cc750e916764cf4cc72ba'), ObjectId('664cc750e916764cf4cc72bb'), ObjectId('664cc750e916764cf4cc72bc'), ObjectId('664cc750e916764cf4cc72bd'), ObjectId('664cc750e916764cf4cc72be'), ObjectId('664cc750e916764cf4cc72bf'), ObjectId('664cc750e916764cf4cc72c0'), ObjectId('664cc750e916764cf4cc72c1'), ObjectId('664cc750e916764cf4cc72c2'), ObjectId('664cc750e916764cf4cc72