## Mounting Google Drive

The best way to get the training text into the Colaboratory VM, and to get the trained model *out* of Colaboratory, is to route both through Google Drive *first*.

Running this cell (which will only work in Colaboratory) will mount your personal Google Drive in the VM, which later cells can use to get data in and out. It will ask for an auth code; that authorization is not saved anywhere.

In [5]:
from google.colab import drive
import os

# Colab only: make the Google Drive available below /content/gdrive.
drive.mount('/content/gdrive')  # Mounting GoogleDrive to the content folder


Mounted at /content/gdrive


Change Working Directory

In [6]:
# Absolute path below the mount point, so the cell can be re-run safely
# (a relative path only resolves while the working directory is still /content).
os.chdir('/content/gdrive/MyDrive/NLP_scientific-text-generation/')

### Preparation of Data Table With Journal Title and Paper ID

(You can skip this part if you ran it already once; just jump to the next part to simply import the table again.)



In [None]:
import os
import json

# Directory that is scanned for the metadata JSON files.
json_dir = "./json"

titles = []
ids = []
lang = []

for f_name in os.listdir(json_dir):
  if f_name.endswith(".json"):
    # Build the path from the scanned directory (not from the builtin `dir`, which raises a TypeError).
    path = os.path.join(json_dir, f_name)
    with open(path, "r") as f:
      data = json.load(f)
    # NOTE(review): titles/ids and lang are filled under independent conditions, so the
    # three lists only stay aligned if every file contains both metadata entries.
    for obj in data["metadata"]:
      if obj["key"] == "dc.identifier.citation":
        # Journal title: second '|'-separated part, without its first and last character.
        titles.append(obj["value"].split('|')[1][1:-1])
        # Paper ID: file name without the ".json" suffix.
        ids.append(f_name[:-5])
      if obj["key"] == "dc.type":
        lang.append(obj["language"])

print(titles)
print(ids)
print(lang)

Save Results in a Numpy Table and Store on GDrive

In [None]:
import numpy as np

# ndarray.tofile writes raw bytes only (no dtype, no shape), so the record layout is fixed
# explicitly: two '<U130' fields named 'journal' and 'id'. This is the layout that has to be
# passed to np.fromfile('prep/journal_table.npd', ...) to decode the file again.
# `lang` is not part of this layout and is therefore not written to the file.
# Values longer than 130 characters would be truncated by the fixed field width.
table_dtype = np.dtype([('journal', '<U130'), ('id', '<U130')])

if len(titles) != len(ids):
  raise ValueError("titles and ids must have the same length")

journal_table = np.array(list(zip(titles, ids)), dtype=table_dtype)
journal_table.tofile('prep/journal_table.npd')
journal_table

## Read and Prepare Journal Data Table

Read Data Table

In [11]:
import numpy as np

# The binary file carries no type information, so the record layout is given here:
# one record = journal title and paper ID, each a 130-character unicode field.
record_layout = np.dtype([('journal', '<U130'), ('id', '<U130')])
journal_table = np.fromfile('prep/journal_table.npd', dtype=record_layout)

journal_table

array([('CES Working Papers', '10419-198372'),
       ('Foresight and STI Governance', '10419-210554'),
       ('CES Working Papers', '10419-198520'), ...,
       ('International Journal of Financial Studies', '10419-195657'),
       ('Journal of Economic Structures', '10419-194884'),
       ('Logistics Research', '10419-157728')],
      dtype=[('journal', '<U130'), ('id', '<U130')])

Print Frequency Table

In [None]:
import pandas as pd

# Allow up to 1000 rows to be printed so the table is not abbreviated
pd.set_option('display.max_rows', 1000)

# Cross-tabulate the journal titles against a single column named "count"
journal_column = journal_table[:]['journal']
my_tab = pd.crosstab(index=journal_column, columns="count")

# Most frequent journals first
tab = my_tab['count'].sort_values(ascending=False)

print(tab)


Create ID Lists of the Fulltexts for Each Journal

In [12]:
#Amfiteatru Economic Journal    541
#CES Working Papers             516

# Each record is (journal title, paper ID); collect the IDs per journal.
AEJ = [entry[1] for entry in journal_table if entry[0] == "Amfiteatru Economic Journal"]
CES = [entry[1] for entry in journal_table if entry[0] == "CES Working Papers"]

print(AEJ)
print(CES)

['10419-196454', '10419-196474', '10419-169092', '10419-196404', '10419-196470', '10419-169067', '10419-196475', '10419-196481', '10419-196450', '10419-169098', '10419-169093', '10419-196461', '10419-169103', '10419-169076', '10419-196436', '10419-169099', '10419-196429', '10419-196411', '10419-196427', '10419-196473', '10419-196418', '10419-169075', '10419-196403', '10419-196408', '10419-196453', '10419-196414', '10419-169111', '10419-196420', '10419-196440', '10419-169100', '10419-169095', '10419-169074', '10419-196465', '10419-169097', '10419-196417', '10419-196431', '10419-196459', '10419-196441', '10419-196446', '10419-196467', '10419-196430', '10419-196464', '10419-169088', '10419-169083', '10419-169110', '10419-196428', '10419-196422', '10419-196468', '10419-169104', '10419-196405', '10419-196410', '10419-196425', '10419-196449', '10419-196423', '10419-169070', '10419-196413', '10419-196407', '10419-196421', '10419-169101', '10419-169109', '10419-196442', '10419-196480', '10419-

## Create Directories to Save the Prepared Fulltexts



In [13]:
paper_dir = ['AEJ', 'CES']
for p_dir in paper_dir:
  prep_dir = 'prep/'+p_dir
  print(prep_dir)
  # exist_ok=True: create the folder (and missing parents) without failing if it is already there
  os.makedirs(prep_dir, exist_ok=True)

prep/AEJ
prep/CES


In [7]:
# If necessary the following code can be used to delete specific folders and the files included
# USE WITH CARE

#import shutil
#shutil.rmtree('prep/'+'AEJ')

## Helper Functions to Prepare the Fulltexts

In [24]:
import re

def CES_remove_pagebreaks (lines):
  """Remove every page break line together with the four lines preceding it.

  A page break line is any line that contains a form feed character.
  The list is modified in place and also returned.

  All windows are determined on the unmodified list first, so page breaks
  close to the start of the text or close to each other never cause
  unrelated lines to be removed.

  Args:
    lines: list of text lines.

  Returns:
    The same list object with the page break windows removed.
  """
  lines_before = 4
  doomed = set()
  for i, text in enumerate(lines):
    if "\f" in text:
      # Clamp at 0: a negative slice start would count from the end of the list
      doomed.update(range(max(i - lines_before, 0), i + 1))
  if doomed:
    lines[:] = [text for i, text in enumerate(lines) if i not in doomed]
  return lines

def CES_remove_authors_pagebreak (lines):
  """Remove the author (and license) info block from a CES fulltext.

  The lines are scanned from the top (the last line is not inspected).
  Start and end positions of the block are collected in `line_deletions`:

  * start: one line above a line beginning with a single '*', or two lines
    above a line consisting only of the character U+F02A (used when the '*'
    was not recognized as such) if no start has been recorded yet;
  * end: six lines below the Creative Commons license line, or otherwise the
    first line containing a form feed (moved one line down if an empty line
    follows it). In the form feed case the start is moved up by two more lines.

  The scan stops at the first end marker. The block is only deleted if both
  a start and an end position were recorded; otherwise the text is left as is.
  The list is modified in place and also returned.

  Args:
    lines: list of text lines.

  Returns:
    The same list object, without the author info block if it was identified.
  """
  line_deletions = []
  for line in range(0, len(lines)-1, 1):
    # Searching for the beginning of the author info (marked by a '*')
    if re.search(r"^\*[^\*]", lines[line]) is not None:
      line_deletions.append(line-1)
    # In case the '*' was not recognized as is but as an unknown character
    if re.search("^\uf02a\n$", lines[line]) is not None:
      if len(line_deletions) == 0:
        line_deletions.append(line-2)
    # At the end of the author info often follows the license info
    if re.search("This work is licensed under a Creative Commons Attribution License\n", lines[line]) is not None:
      line_deletions.append(line+6)
      break
    # If no license info is found, the next form feed character is used as the end of the author info
    if "\f" in lines[line]:
      if len(line_deletions) == 0:  # Older version: without license and author info (nothing will be added to line_deletions)
        break
      line_deletions[0] += -2
      if lines[line+1] == "\n":
        line += 1
      line_deletions.append(line)
      break
  # Deletion of the lines including the author (and license) info.
  # Both a start and an end are required; with a single entry there is no range to delete.
  if len(line_deletions) >= 2:
    # Clamp at 0: a negative start would be interpreted as counting from the end of the list
    start = max(line_deletions[0], 0)
    end = line_deletions[1]
    del lines[start:end]
  return lines

def CES_remove_title_page (lines):
  """Drop the title page of a CES fulltext.

  Looks for the first line starting with "Keywords: " (the very last line
  of the text is not inspected) and removes everything from the top of the
  text up to and including the line after it. Without such a line the text
  is left untouched. The list is modified in place and also returned.
  """
  keywords_at = None
  for position, text in enumerate(lines[:-1]):
    if re.search("^Keywords: ", text) is not None:
      keywords_at = position
      break
  if keywords_at is None:
    return lines
  del lines[:keywords_at + 2]
  return lines

def AEJ_remove_title_page (lines):
  """Drop the title page of an AEJ fulltext.

  The title page ends at the first line containing "Corresponding author";
  everything from the top of the text up to and including the line after
  that one is removed. Without such a line the text is left untouched.
  The list is modified in place and also returned.
  """
  first_hit = None
  for position, text in enumerate(lines):
    if re.search("Corresponding author", text) is not None:
      first_hit = position
      break
  if first_hit is not None:
    del lines[:first_hit + 2]
  return lines

def AEJ_remove_pagebreaks (lines):
  """Remove every page break line with the four lines before and after it.

  A page break line is any line that contains a form feed character.
  The list is modified in place and also returned.

  All windows are determined on the unmodified list first, so page breaks
  close to the start of the text or close to each other never cause
  unrelated lines to be removed.

  Args:
    lines: list of text lines.

  Returns:
    The same list object with the page break windows removed.
  """
  lines_before = 4
  lines_after = 4
  doomed = set()
  for i, text in enumerate(lines):
    if "\f" in text:
      # Clamp at 0: a negative slice start would count from the end of the list
      first = max(i - lines_before, 0)
      last = min(i + lines_after, len(lines) - 1)
      doomed.update(range(first, last + 1))
  if doomed:
    lines[:] = [text for i, text in enumerate(lines) if i not in doomed]
  return lines

def remove_bibliography (lines):
  """Cut off the bibliography at the end of a fulltext.

  Everything from the first line that contains "References" or
  "Bibliography" directly before its line break up to the end of the text
  is removed, including the last line. Without such a line the text is
  left untouched. The list is modified in place and also returned.

  Args:
    lines: list of text lines.

  Returns:
    The same list object without the bibliography.
  """
  heading = re.compile("References\n|Bibliography\n")
  for i, text in enumerate(lines):
    if heading.search(text) is not None:
      # Open-ended slice: the final line belongs to the bibliography as well
      del lines[i:]
      break
  return lines

## Preparation of Fulltexts of Amfiteatru Economic Journal (AEJ)

In [30]:
for paper_id in AEJ:

  # read text
  with open('text/'+paper_id+'.txt') as f:
    lines = f.readlines()

  # prepare text
  prep_lines = AEJ_remove_title_page(lines)
  prep_lines = AEJ_remove_pagebreaks(prep_lines)
  prep_lines = remove_bibliography(prep_lines)

  # save text to prep folder
  # 'w' replaces the output of an earlier run; append mode would add the whole
  # text a second time whenever this cell is executed again
  with open('prep/'+'AEJ/'+paper_id+'.txt', 'w') as f:
    f.writelines(prep_lines)

In [28]:
for paper_id in CES:

  # read text
  with open('text/'+paper_id+'.txt') as f:
    lines = f.readlines()

  # prepare text
  prep_lines = CES_remove_authors_pagebreak(lines)
  prep_lines = CES_remove_title_page(prep_lines)
  prep_lines = CES_remove_pagebreaks(prep_lines)
  prep_lines = remove_bibliography(prep_lines)

  # save text to prep folder
  # 'w' replaces the output of an earlier run; append mode would add the whole
  # text a second time whenever this cell is executed again
  with open('prep/'+'CES/'+paper_id+'.txt', 'w') as f:
    f.writelines(prep_lines)

## Selection of Fulltexts on Climate Change

Import ID List

In [41]:
# Read the ID list of the selected papers from the prep folder into a DataFrame and display it.
# Requires pandas to be imported as `pd` in an earlier cell.
id_list = pd.read_csv("./prep/IDs_climate-change-papers.csv") 
id_list

Unnamed: 0,Journal,ID
0,Agricultural and Food Economics,10419-179045
1,Agricultural and Food Economics,10419-179051
2,Agricultural and Food Economics,10419-186035
3,Amfiteatru Economic Journal,10419-168672
4,Amfiteatru Economic Journal,10419-167689
...,...,...
188,,10419-174591
189,,10419-179670
190,,10419-182388
191,,10419-197071


Create Separate Folder for Selected Fulltexts

In [61]:
d_name = "prep/climate-change-papers"
# exist_ok=True: create the folder without failing if it already exists
os.makedirs(d_name, exist_ok=True)

Select and Save Fulltexts from the ID List 

In [59]:
# If necessary the following code can be used to delete specific folders and the files included
# USE WITH CARE

#import shutil
#shutil.rmtree('prep/climate-change-papers/')

In [62]:
from shutil import copyfile

d_journal = ["prep/AEJ/", "prep/CES/"]
dst = "prep/climate-change-papers/"

# IDs of the selected papers, collected once for exact lookups.
# (A substring/regex match would also accept an ID that is merely contained in a longer one.)
selected_ids = set(id_list['ID'].dropna().astype(str).str.strip())

# Check all fulltexts of the directories for the corresponding journals
for src in d_journal:
  for f_name in os.listdir(src):
    # Check whether the fulltext ID (file name without extension) is one of the IDs in the list
    if os.path.splitext(f_name)[0] in selected_ids:
      copyfile(src+f_name, dst+f_name)

Zip Folder with Selected Files

In [63]:
from shutil import make_archive

# Pack the contents of prep/climate-change-papers/ into the zip archive prep/climate-change-papers.zip
make_archive("prep/climate-change-papers", 'zip', "prep/climate-change-papers/")

'/content/gdrive/My Drive/NLP_scientific-text-generation/prep/climate-change-papers.zip'