## Mounting Google Drive

The best way to get the input text for training into the Colaboratory VM, and to get the trained model *out* of Colaboratory, is to route it through Google Drive *first*.

Running this cell (which will only work in Colaboratory) will mount your personal Google Drive in the VM, which later cells can use to get data in and out. (It will ask for an auth code; that authorization is not saved anywhere.)

In [4]:
from google.colab import drive
import os

drive.mount('/content/gdrive')  # Mount Google Drive at /content/gdrive inside the Colab VM


Mounted at /content/gdrive


Change Working Directory

In [5]:
# Use the absolute path of the Drive mount point (/content/gdrive) so this cell
# can be re-run: the relative form only resolves while the working directory
# is still /content.
os.chdir('/content/gdrive/MyDrive/NLP_scientific-text-generation/')

Prepare Data for a Table With Journal Title and Paper ID

In [4]:
import os
import json

# Folder holding the metadata JSON files. Named `json_dir` rather than `dir`
# so the built-in dir() stays usable in later cells.
# NOTE(review): this is an absolute path at the filesystem root, unlike the
# relative 'text/' and 'prep/' folders used elsewhere in this notebook -- confirm.
json_dir = "/json"

titles = []  # one journal title per matching citation entry
ids = []     # file name without ".json", aligned index-for-index with `titles`

for f_name in os.listdir(json_dir):
  if not f_name.endswith(".json"):
    continue
  path = os.path.join(json_dir, f_name)
  # JSON text is UTF-8; do not depend on the platform default encoding.
  with open(path, "r", encoding="utf-8") as f:
    data = json.load(f)
  for obj in data["metadata"]:
    if obj["key"] == "dc.identifier.citation":
      # The title is taken from the second '|'-separated field of the citation;
      # [1:-1] drops its first and last character (presumably padding -- TODO confirm).
      titles.append(obj["value"].split('|')[1][1:-1])
      ids.append(f_name[:-5])  # strip the ".json" suffix

print(titles)
print(ids)

['CES Working Papers', 'Foresight and STI Governance', 'CES Working Papers', 'Historical Social Research', 'CES Working Papers', 'Economics: The Open-Access, Open-Assessment E-Journal', 'International journal for equity in health', 'CES Working Papers', 'CES Working Papers', 'CES Working Papers', 'Foresight and STI Governance', 'Economics: The Open-Access, Open-Assessment E-Journal', 'CES Working Papers', 'Adolescent Research Review', 'Foresight-Russia', 'DANUBE: Law, Economics and Social Issues Review', 'CES Working Papers', 'CES Working Papers', 'CES Working Papers', 'Foresight and STI Governance', 'CES Working Papers', 'CES Working Papers', 'CES Working Papers', 'CES Working Papers', 'CES Working Papers', 'CES Working Papers', 'Foresight-Russia', 'CES Working Papers', 'Foresight and STI Governance', 'IASSIST Quarterly ', 'CES Working Papers', 'Economics: The Open-Access, Open-Assessment E-Journal', 'CES Working Papers', 'CES Working Papers', 'Geographica Helvetica', 'CES Working Pap

Save Results in a Numpy Table and Store on GDrive

In [66]:
import numpy as np
# ndarray.tofile writes only the raw bytes: neither dtype nor shape is stored.
# The import cell of this notebook reads the file back as two '<U130' fields
# per row, so the array is pinned to exactly that width here instead of
# whatever width NumPy infers from the longest string in the current data.
journal_table = np.column_stack((titles, ids))
if journal_table.dtype.itemsize > np.dtype('<U130').itemsize:
  # Refuse to truncate silently; the reader's field width must be widened too.
  raise ValueError("journal title or id longer than 130 characters; "
                   "it does not fit the '<U130' layout used to read the table back")
journal_table = journal_table.astype('<U130')
journal_table.tofile('prep/journal_table.npd')
journal_table

array([['CES Working Papers', '10419-198372'],
       ['Foresight and STI Governance', '10419-210554'],
       ['CES Working Papers', '10419-198520'],
       ...,
       ['International Journal of Financial Studies', '10419-195657'],
       ['Journal of Economic Structures', '10419-194884'],
       ['Logistics Research', '10419-157728']], dtype='<U130')

Import Journal Table

In [8]:
import numpy as np
# Rebuild the table from the raw dump: every record consists of two
# fixed-width 130-character unicode fields, 'journal' first and 'id' second.
journal_record = [('journal', '<U130'), ('id', '<U130')]
journal_table = np.fromfile('prep/journal_table.npd', dtype=journal_record)

journal_table

array([('CES Working Papers', '10419-198372'),
       ('Foresight and STI Governance', '10419-210554'),
       ('CES Working Papers', '10419-198520'), ...,
       ('International Journal of Financial Studies', '10419-195657'),
       ('Journal of Economic Structures', '10419-194884'),
       ('Logistics Research', '10419-157728')],
      dtype=[('journal', '<U130'), ('id', '<U130')])

Prepare Frequency Table

In [73]:
import pandas as pd

# Take the titles from the loaded journal_table rather than from the `titles`
# list, so this cell also runs when the table was imported from disk and the
# JSON preparation cell was skipped in this session.
t_tab = journal_table['journal']
my_tab = pd.crosstab(index=t_tab,      # Make a crosstab
                     columns="count")  # Name the count column

# Number of papers per journal, most frequent journal first
my_tab['count'].sort_values(ascending=False)

row_0
Amfiteatru Economic Journal                                                                                                           541
CES Working Papers                                                                                                                    516
Cogent Business & Management                                                                                                          427
Cogent Economics & Finance                                                                                                            265
Swiss Journal of Economics and Statistics                                                                                             215
                                                                                                                                     ... 
Journal of Youth Studies                                                                                                                1
Journal of Studies in Intern

In [11]:
# The three journals with the most papers in the frequency table:
#Amfiteatru Economic Journal    541
#CES Working Papers             516
#Cogent Business & Management   427

AEJ, CES, CBM = [], [], []

# Map each selected journal title to the list that collects its paper ids
ids_by_journal = {
  "Amfiteatru Economic Journal": AEJ,
  "CES Working Papers": CES,
  "Cogent Business & Management": CBM,
}

for entry in journal_table:
  bucket = ids_by_journal.get(entry[0])
  if bucket is not None:  # papers of all other journals are ignored
    bucket.append(entry[1])

print(AEJ)
print(CES)
print(CBM)

['10419-196454', '10419-196474', '10419-169092', '10419-196404', '10419-196470', '10419-169067', '10419-196475', '10419-196481', '10419-196450', '10419-169098', '10419-169093', '10419-196461', '10419-169103', '10419-169076', '10419-196436', '10419-169099', '10419-196429', '10419-196411', '10419-196427', '10419-196473', '10419-196418', '10419-169075', '10419-196403', '10419-196408', '10419-196453', '10419-196414', '10419-169111', '10419-196420', '10419-196440', '10419-169100', '10419-169095', '10419-169074', '10419-196465', '10419-169097', '10419-196417', '10419-196431', '10419-196459', '10419-196441', '10419-196446', '10419-196467', '10419-196430', '10419-196464', '10419-169088', '10419-169083', '10419-169110', '10419-196428', '10419-196422', '10419-196468', '10419-169104', '10419-196405', '10419-196410', '10419-196425', '10419-196449', '10419-196423', '10419-169070', '10419-196413', '10419-196407', '10419-196421', '10419-169101', '10419-169109', '10419-196442', '10419-196480', '10419-

In [73]:
# One output folder per selected journal below prep/
paper_dir = ['AEJ', 'CES', 'CBM']
for p_dir in paper_dir:
  prep_dir = 'prep/'+p_dir
  print(prep_dir)
  # exist_ok=True creates the folder only when it is missing, without a
  # separate check-then-create step
  os.makedirs(prep_dir, exist_ok=True)

prep/AEJ
prep/CES
prep/CBM


In [68]:
import re

def remove_title_page(lines):
  """Cut the leading part of a paper up to its "Corresponding author" note.

  Everything from the start of `lines` through the line that follows the
  first line containing "Corresponding author" is deleted. The list is
  modified in place and also returned; without such a line it is unchanged.
  """
  marker_rows = [row for row, text in enumerate(lines)
                 if "Corresponding author" in text]
  if marker_rows:
    first_marker = marker_rows[0]
    # the marker line itself plus one following line are removed as well
    del lines[:first_marker + 2]
  return lines

def remove_pagebreaks(lines, before=4, after=4):
  """Remove every page break together with the lines surrounding it.

  A page break is a line containing a form feed ("\\x0c"). For each one, the
  line itself, the `before` lines above it and the `after` lines below it are
  dropped.

  The windows are clamped to the bounds of the list, so a page break within
  the first `before` lines no longer produces a negative slice start that
  wraps around to the end of the list. All windows are computed on the
  original line numbers, so page breaks that lie close together remove the
  union of their windows instead of extra lines after the list has shifted.

  Args:
    lines: list of text lines; modified in place.
    before: number of lines to drop above each page break (default 4).
    after: number of lines to drop below each page break (default 4).

  Returns:
    The same list object, with the page-break regions removed.
  """
  doomed = set()
  for row, text in enumerate(lines):
    if "\x0c" in text:
      first = max(row - before, 0)
      last = min(row + after, len(lines) - 1)
      doomed.update(range(first, last + 1))
  if doomed:
    # slice assignment keeps the caller's list object
    lines[:] = [text for row, text in enumerate(lines) if row not in doomed]
  return lines

def remove_bibliography(lines):
  """Cut a paper off at its bibliography heading.

  The heading is the first line containing "References\\n" or
  "Bibliography\\n". That line and everything after it, including the very
  last line of the text, are deleted. The earlier `[start:-1]` slice always
  left the final line behind.

  The list is modified in place and also returned; without a heading it is
  left unchanged.
  """
  for row, text in enumerate(lines):
    if "References\n" in text or "Bibliography\n" in text:
      del lines[row:]
      break
  return lines

In [74]:
count = 3  # number of AEJ papers to preprocess in this run

for paper_id in AEJ[:count]:
  # read text
  with open('text/'+paper_id+'.txt') as f:
    lines = f.readlines()
  # prepare text
  prep_lines = remove_title_page(lines)
  prep_lines = remove_pagebreaks(prep_lines)
  prep_lines = remove_bibliography(prep_lines)
  # save text to prep folder; 'w' replaces the output of an earlier run
  # (append mode duplicated the text every time the cell was re-run)
  with open('prep/'+'AEJ/'+paper_id+'.txt', 'w') as f:
    f.writelines(prep_lines)