In [1]:
from google.colab import drive
drive.mount('/content/drive')
data_path = '/content/drive/My Drive/data'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
pip install transformers



In [3]:
pip install textstat



In [4]:
pip install tqdm



In [5]:
import pandas as pd
import numpy as np
import json
from pandas.io.json import json_normalize
import os
import pandas.io.json as pd_json
from collections import Counter,defaultdict
import string
import textstat
import statistics
import pickle
import torch
from transformers import *
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from tqdm import tqdm
import difflib

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
def build_affiliation_dictionary(author_university_df,university_score_df):

    interim_mapper,mapper = {},{}
    for index, row in author_university_df.iterrows():
        #name,affiliation,homepage,scholarid
        link = row['homepage']
        try:
            if "http" in link:
                base = link.split("/")[2]
            else:
                base = link.split("/")[0]
      
            if "www" in base:
                base = ".".join(base.split(".")[1:])
              
            if base in interim_mapper:
                if not row["affiliation"] in interim_mapper[base]:
                    interim_mapper[base][row["affiliation"]] = 1
                else:
                    interim_mapper[base][row["affiliation"]] += 1
            else:
                interim_mapper[base] = {}
                interim_mapper[base][row["affiliation"]] = 1
              
        except Exception as e:
            print("Homepage : ",row['homepage']," defaulted!")

    # for base in interim_mapper:
    #     if len( interim_mapper[base].keys()) > 1:
    #         print(base, interim_mapper[base])

    #interim_mapper now has the email ids, and number of times, they were refferred to as an affiliations.
    #we ignore if number of affliations are more than 4 as then they are generic ids like gmail.com and shoudn't be mapped
    #else we take the maximum

    for base in interim_mapper:
        if len(interim_mapper[base].keys()) < 2:
            #calculate the maximum referred affiliation
            max_count = 0
            max_ff = ""
            for aff in interim_mapper[base]:
                if interim_mapper[base][aff] > max_count:
                    max_count = interim_mapper[base][aff]
                    max_ff = aff
            mapper[base] = aff

    '''with open("processed_data/affiliation_dict", "wb") as output_file:
        pickle.dump(mapper, output_file)'''

    return mapper

In [0]:
def build_and_save_tfidf_model():

    #collect corpus
    corpus = []
    directory_in_string = data_path + '/iclr_2017/train/parsed_pdfs'
    directory_content = os.fsencode(directory_in_string)
    for file in tqdm(os.listdir(directory_content)):
        filename = os.fsdecode(file)
        with open(os.path.join(directory_in_string, filename),encoding="utf8") as file:
            paper_metadata = json.load(file)['metadata']
            abstract_text = paper_metadata['abstractText'].translate(punct_removal_table)
            corpus.append(abstract_text)

    vectorizer = TfidfVectorizer(analyzer='word', stop_words = stop_words)
    X = vectorizer.fit_transform(corpus)
    return X

In [8]:
#HOUSEKEEPING
stop_words = stopwords.words('english')
punct_removal_table = {ord(char): None for char in string.punctuation}

MODELS = {"BertModel" : (BertModel,       BertTokenizer,       'bert-base-uncased'),
          "OpenAIGPTModel" : (OpenAIGPTModel,  OpenAIGPTTokenizer,  'openai-gpt'),
          "GPT2Model" : (GPT2Model,       GPT2Tokenizer,       'gpt2'),
          "TransfoXLModel" : (TransfoXLModel,  TransfoXLTokenizer,  'transfo-xl-wt103'),
          "XLNetModel" : (XLNetModel,      XLNetTokenizer,      'xlnet-base-cased'),
          "XLMModel" : (XLMModel,        XLMTokenizer,        'xlm-mlm-enfr-1024'),
          "DistilBertModel" : (DistilBertModel, DistilBertTokenizer, 'distilbert-base-cased'),
          "RobertaModel" : (RobertaModel,    RobertaTokenizer,    'roberta-base'),
          "XLMRobertaModel" : (XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-base')}
model_class, tokenizer_class, pretrained_weights = MODELS['BertModel']
#Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)
nlp = pipeline('feature-extraction') 

#read author-affiliation file from csrankings hidden page 
author_university_df = pd.read_csv(os.path.join(data_path,"author_university_list.csv"))
#read university-research score file from csrankings main page
university_score_df = pd.read_csv(os.path.join(data_path,"csrankings.csv")).drop_duplicates(subset=['institute'])

with open(os.path.join(data_path,'WordsFromTitleofTop200Papers.txt'),encoding="utf8") as f:
    top_200_titles_words_counter = Counter([word for word in f.read().translate(punct_removal_table).lower().split() if word not in stop_words])
    #TAKE TOP 5%
    top_200_titles_vocab = [key for key,value in top_200_titles_words_counter.most_common(int(0.05*len(top_200_titles_words_counter)))]


HBox(children=(IntProgress(value=0, description='Downloading', max=230, style=ProgressStyle(description_width=…




In [9]:
email_institute_affiliation_mapper = build_affiliation_dictionary(author_university_df,university_score_df)
tfidf_matrix = build_and_save_tfidf_model()

Homepage :  http:/roseyu.com  defaulted!


100%|██████████| 349/349 [00:00<00:00, 502.59it/s]


In [0]:
def load_data_files_into_raw_df():

    directory_in_string = data_path + '/iclr_2017/train/parsed_pdfs'
    directory_content = os.fsencode(directory_in_string)
    list_of_file_dicts,paper_data_df = [],pd.DataFrame()
    file_number = 0
    for file in tqdm(os.listdir(directory_content)):
        filename = os.fsdecode(file)
        file_dict = defaultdict()
        #print("\nOpening file : ",filename)
        with open(os.path.join(directory_in_string, filename),encoding="utf8") as file:
            data = json.load(file)
            file_dict['paper_id'] = data['name']
            paper_metadata = data['metadata']

            ################################ ABSTRACT ###################################################
            #print("Extracting abstract features....")
            abstract_text = paper_metadata['abstractText'].translate(punct_removal_table).lower()
            
            #1. BERT et al encoding
            '''input_ids = torch.tensor([tokenizer.encode(abstract_text)]).unsqueeze(0) 
            outputs = model(input_ids)
            last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
            file_dict['attention_based_encoding'] = last_hidden_states[0][0]'''
            file_dict['feature_extraction_encoding'] = nlp(abstract_text)[0][0] #DistilBERT
            
            #2. TFIDF ENCODING
            file_dict['tfidf_encoding'] = tfidf_matrix.toarray()[file_number]
            file_number = file_number + 1

            #title = paper_metadata['title'].lower()
            #3. IF ABSTRACT CONTAINS ATLEAST 2 WORDS FROM TOP 200 TITLES
            if len(set(abstract_text.split()).intersection(set(top_200_titles_vocab))) > 2:
                file_dict['words_from_top_200_title'] = True
            else:
                file_dict['words_from_top_200_title'] = False
            
            #4. ABSTRACT LENTGTH
            file_dict['abstract_length'] = textstat.lexicon_count(abstract_text, removepunct=True)
            
            #5. ABSTRACT COMPLEXITY
            flesch = 1/textstat.flesch_reading_ease(abstract_text)
            dale_chall = textstat.dale_chall_readability_score(abstract_text)
            file_dict['abstract_complexity'] = (flesch + dale_chall)/2

            #6. ABSTRACT NOVELTY
            if len({'outperforms','state-of-the-art','state of the art'}.intersection(set(abstract_text.split()))) > 0:
                file_dict['abstract_novelty'] = True
            else:
                file_dict['abstract_novelty'] = False
            ################################ TITLE ###################################################

            ################################ AUTHORS ###################################################
            #print("Extracting authors features.....")
            #1. NUMBER OF AUTHORS
            reported_num_of_authors = len(paper_metadata['authors'])
            if reported_num_of_authors == 0:
                file_dict['number_of_authors'] = 2 #AVG?
            else:
                file_dict['number_of_authors'] = reported_num_of_authors
            
            #2. AUTHOR AFFILIATION SCORE
            author_emails = paper_metadata['emails']
            author_institutes = [email.split('@')[1] for email in author_emails]
            research_strength_score = 0
            if len(author_institutes) != 0:
                for institute in author_institutes:
                    closest_matches = difflib.get_close_matches(institute, email_institute_affiliation_mapper.keys())
                    if len(closest_matches) == 0:
                        #if there are no closest matches it's not from a university from our list, 
                        #which means its either a strange university or a corporate company. 
                        #either way its safe to give it a HIGH SCORE - even if the paper is not from a good source
                        #other parameters will take care of it 
                        research_strength_score = research_strength_score + 60 #DECISION!
                    else:
                        affiliated_unis_from_string_match = [email_institute_affiliation_mapper[match] for match in closest_matches]
                        sub_score = 0
                        for affiliated_uni in affiliated_unis_from_string_match:
                            try:
                                sub_score = sub_score + university_score_df.loc[university_score_df.institute ==  affiliated_uni, 'count'].values[0]
                            except Exception as e:
                                #if it's an excpetion this is a university alright, 
                                #but doesnt fall in our 100 unis of csraniking scores
                                #this means its mostly not a top uni, so it is safe to give it a LOW SCORE
                                sub_score = sub_score + 6 #DECISION!
                        closest_matches_avg_score = sub_score/len(affiliated_unis_from_string_match)
                        research_strength_score = research_strength_score + closest_matches_avg_score
                file_dict['research_strength_score'] = research_strength_score/len(author_institutes)                    
            else:
                file_dict['research_strength_score'] = 0
            ################################ AUTHORS ###################################################

            ################################ REFERENCES ###################################################
            #print("Extracting references features.....")
            references_list,ref_mentions_list = paper_metadata['references'],paper_metadata['referenceMentions']
            #1. NUM OF REFRERENCES
            file_dict['num_of_references'] = len(references_list)
            
            #2. MOST RECENT REFERENCE YEAR
            ref_years_list = [ref_dict['year'] for ref_dict in references_list]
            file_dict['most_recent_ref_year'] = max(ref_years_list)
            
            #3. AVG LENGTH OF REF MENTION
            if len(ref_mentions_list) != 0:
                file_dict['avg_len_of_ref_mention'] = statistics.mean([ref_dict['endOffset'] - ref_dict['startOffset'] for ref_dict in ref_mentions_list])
            else:
                file_dict['avg_len_of_ref_mention'] = 0
            
            #4. NUMBER OF RECENT REFERENCES (current recent ref behnchmark = 4)
            file_dict['num_of_recent_references'] = sum([1 for year in ref_years_list if paper_metadata['year']-year<4])
            ################################ REFERENCES ###################################################
                
            ################################ CONTENT ###################################################
            #print("Extracting content features.....")
            #content housekeeping
            sections = paper_metadata['sections']
            section_content = ''
            for section in sections:
                section_content = section_content + " " + section['text'].translate(punct_removal_table).lower()
            file_dict['contains_githib_link'],file_dict['contains_appendix'] = False,False
            
            #1. NUMBER OF SECTIONS
            file_dict['number_of_sections'] = len(sections)
            
            #2. CONTAINS GITHUB LINK
            for section in sections:
                if 'github' in section['text'].lower():
                    file_dict['contains_githib_link'] = True
                    break
            
            #3. READABILITY
            flesch_score,dale_chall_score = 0,0
            for section in sections:
                flesch_score = flesch_score + textstat.flesch_reading_ease(section['text'])
                dale_chall_score = dale_chall_score + textstat.dale_chall_readability_score(section['text'])
            flesch_score,dale_chall_score = flesch_score/file_dict['number_of_sections'],dale_chall_score/file_dict['number_of_sections']  
            file_dict['content_complexity'] = ((1/flesch_score) + dale_chall_score)/2
            
            #4. CONTAINS APPENDIX
            for section in sections:
                if section['heading'] is not None:
                    if 'APPENDIX' in section['heading'] or section['heading'].split()[0] in set(string.ascii_uppercase):
                        file_dict['contains_appendix'] = True
                        break
            
            #5. NUMBER OF UNIQUE WORDS
            file_dict['number_of_unique_words'] = len(Counter(section_content))
            ################################ CONTENT ###################################################

        list_of_file_dicts.append(file_dict)
        #print("Closing file : ",filename)  
    paper_data_df = pd.DataFrame(list_of_file_dicts)     
    return paper_data_df

In [11]:
paper_data_df = load_data_files_into_raw_df()

100%|██████████| 349/349 [03:08<00:00,  1.85it/s]


In [12]:
paper_data_df

Unnamed: 0,paper_id,feature_extraction_encoding,tfidf_encoding,words_from_top_200_title,abstract_length,abstract_complexity,abstract_novelty,number_of_authors,research_strength_score,num_of_references,most_recent_ref_year,avg_len_of_ref_mention,num_of_recent_references,contains_githib_link,contains_appendix,number_of_sections,content_complexity,number_of_unique_words
0,305.pdf,"[0.3867202699184418, -0.061806850135326385, -0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True,198,9.676772,False,2,6.000000,26,2016,0.000000,9,False,True,11,3.984653,69
1,306.pdf,"[0.4459645450115204, -0.025260714814066887, 0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True,140,7.654289,False,2,32.066667,30,2016,109.228571,15,True,False,17,4.157808,52
2,304.pdf,"[0.44415536522865295, -0.003114901017397642, 0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True,136,8.265020,False,3,16.300000,6,2016,90.833333,6,False,True,30,3.634356,55
3,307.pdf,"[0.39931002259254456, 0.00502351950854063, -0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",False,163,8.715491,False,0,6.933333,34,2016,42.326531,22,False,True,21,4.365226,49
4,308.pdf,"[0.3834475576877594, -0.032056376338005066, -0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True,110,7.268245,False,2,7.666667,13,2016,59.066667,9,False,True,10,3.705192,74
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
344,785.pdf,"[0.41694310307502747, -0.045325443148612976, -...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True,123,7.237888,False,4,6.666667,19,2016,478.444444,8,False,False,11,3.582217,46
345,787.pdf,"[0.43488961458206177, -0.06781546026468277, -0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True,172,9.110834,True,3,14.500000,21,2016,88.555556,11,False,False,16,4.333800,66
346,789.pdf,"[0.40879935026168823, -0.025200681760907173, -...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True,156,7.455183,False,4,7.133333,20,2016,154.459459,14,True,True,19,3.870994,64
347,790.pdf,"[0.36916759610176086, -0.04860365390777588, -0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True,119,6.816346,False,1,0.000000,26,2016,55.647059,20,False,True,15,4.281689,52
