In [10]:
import pandas as pd
import spacy
# spaCy based imports
from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import en_core_web_sm
import string
import re
from langdetect import detect

In [11]:
# Download and install en_core_web_sm
# !python -m spacy download en_core_web_sm

In [12]:
table = pd.read_pickle("files/2.1 selected row data.pkl")
table = table.reset_index(drop=True)
col_title = "Title"
col_abstract = "Abstract"
col_publication_year= "Publication Year"
col_num_author = "Number of Authors"
col_geographic_term = "Geographic Term"
col_publisher= "Publisher"
col_published_on = "Published On"
col_conference = "Conference"
col_conference_location = "Conference Location"
col_volume = "Volume"
col_issue = "Issue"
col_type = "Type"
col_abstract_clean = "Abstract Clean"
col_abstract_lemma = "Abstract Lemma"
col_geo_clean = col_geographic_term+"clean"
col_geo_sanitized = "Geographic term Sanitized"
all_papers_folder = 'files/'

In [13]:
table

Unnamed: 0,Abstract,Conference,Conference Location,EISSN,Geographic Term,ISSN,Issue,Language,Number of Authors,Publication Year,Published On,Publisher,Record ID,Title,Volume,Type
0,Aviation provides productivity in the form of ...,26th International Congress of the Aeronautica...,United States,,[],,,English,4,20080000.0,,,1515293,Payload Fuel Energy Efficiency as a Metric for...,,Conference
1,Steam traction was never fully developed befor...,"CORE 2008, Rail; the core of integrated transp...",Australia,,[],,,,1,20080000.0,,,1301414,Feasibility of steam traction for coal transpo...,,Conference
2,The Transport Infrastructure Development Corpo...,"CORE 2008, Rail; the core of integrated transp...",Australia,,[Australia],,,,1,20080000.0,,,1301413,Operational readiness - making it happen,,Conference
3,At CORE 2004 the authors presented a first pap...,"CORE 2008, Rail; the core of integrated transp...",Australia,,[Australia],,,,2,20080000.0,,,1301412,Noise reducing slab track for the Epping to Ch...,,Conference
4,"Due to the daily congestion of highways, railw...","CORE 2008, Rail; the core of integrated transp...",Australia,,[Australia],,,,1,20080000.0,,,1301411,Investigation into some design aspects of ball...,,Conference
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146967,This study investigates the deadline satisfact...,,,,[],0742-597X,1,English,3,20180100.0,Journal of Management in Engineering,American Society of Civil Engineers,1486668,Modeling and Solving the Deadline Satisfaction...,34,Journal
146968,The toll-adjustment mechanism (TAM) is a hybri...,,,,[Hong Kong (China)],0742-597X,1,English,4,20180100.0,Journal of Management in Engineering,American Society of Civil Engineers,1484046,Real Options Model of Toll-Adjustment Mechanis...,34,Journal
146969,The systems approach is increasingly used as a...,,,1872-9126,[],0003-6870,,English,5,20180100.0,Applied Ergonomics,Elsevier,1483769,A Sociotechnical Systems Approach to Enhance S...,66,Journal
146970,The purpose of this study was to develop a met...,,,1872-9126,[],0003-6870,,English,5,20180100.0,Applied Ergonomics,Elsevier,1483768,A Sociotechnical Systems Approach to Enhance S...,66,Journal


In [14]:
# Creating a spaCy object
nlp = en_core_web_sm.load()

In [15]:
# list of allowed parts of speech
allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN']

In [24]:
#Lemmatization (or less commonly lemmatization) in linguistics is the process of grouping together the inflected forms of a word so they can be analysed as a single item, identified by the word's lemma, or dictionary form.
# For example, the word "walk" is the base form for the word "walking", and hence this is matched in both stemming and lemmatization

def lemmatize_with_spacy(sentence):
    doc = nlp(sentence)
    mytokens = [token.lemma_ for token in doc if token.pos_ in allowed_postags and token.lemma_ != "-PRON-"]
    mytokens = " ".join([i for i in mytokens])
    return mytokens

In [25]:
# converting special characters to standard
def clean_abstract(row):
    text = row[col_abstract]
    text = text.replace("\x91", "'")
    text = text.replace("\x92", "'")
    text = text.replace("\x93", '"')
    text = text.replace("\x94", '"')
    text = text.replace("\x96", '-')
    text = text.replace("\x97", '--')
    text = text.replace("%", ' ')
    text = re.sub(r"[^a-zA-Z -'""]"," ",text)
    
    for i in range(9):
        text= text.replace("  ", " ")
    
    return text

In [18]:
table[col_abstract_clean] = table.apply(clean_abstract, axis=1)

In [19]:
table.head()

Unnamed: 0,Abstract,Conference,Conference Location,EISSN,Geographic Term,ISSN,Issue,Language,Number of Authors,Publication Year,Published On,Publisher,Record ID,Title,Volume,Type,Abstract Clean
0,Aviation provides productivity in the form of ...,26th International Congress of the Aeronautica...,United States,,[],,,English,4,20080000.0,,,1515293,Payload Fuel Energy Efficiency as a Metric for...,,Conference,Aviation provides productivity in the form of ...
1,Steam traction was never fully developed befor...,"CORE 2008, Rail; the core of integrated transp...",Australia,,[],,,,1,20080000.0,,,1301414,Feasibility of steam traction for coal transpo...,,Conference,Steam traction was never fully developed befor...
2,The Transport Infrastructure Development Corpo...,"CORE 2008, Rail; the core of integrated transp...",Australia,,[Australia],,,,1,20080000.0,,,1301413,Operational readiness - making it happen,,Conference,The Transport Infrastructure Development Corpo...
3,At CORE 2004 the authors presented a first pap...,"CORE 2008, Rail; the core of integrated transp...",Australia,,[Australia],,,,2,20080000.0,,,1301412,Noise reducing slab track for the Epping to Ch...,,Conference,At CORE the authors presented a first paper on...
4,"Due to the daily congestion of highways, railw...","CORE 2008, Rail; the core of integrated transp...",Australia,,[Australia],,,,1,20080000.0,,,1301411,Investigation into some design aspects of ball...,,Conference,Due to the daily congestion of highways railwa...


In [26]:
lemma_list = []
for i, r in table.iterrows():
    if i%10000 ==0: 
        print(i)
    lemma_list.append(lemmatize_with_spacy(r[col_abstract_clean]))
table[col_abstract_lemma] = lemma_list

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000


In [27]:
i = 0
print(table.iloc[i][col_abstract])
print()
print(table.iloc[i][col_abstract_clean])
print()
print(table.iloc[i][col_abstract_lemma])


Aviation provides productivity in the form of transporting passengers and cargo long distances in a shorter period of time than is available via land or sea.  Given the recent rise in fuel prices and environmental concerns, a consistent metric is needed for the assessment of commercial aviation fuel efficiency, or equivalently the productivity delivered per unit of fuel consumption or environmental cost. This work presents an assessment of payload fuel energy efficiency (PFEE) as a means to quantify how efficiently the energy consumed by aviation is being used on a fleet-wide basis.

Aviation provides productivity in the form of transporting passengers and cargo long distances in a shorter period of time than is available via land or sea Given the recent rise in fuel prices and environmental concerns a consistent metric is needed for the assessment of commercial aviation fuel efficiency or equivalently the productivity delivered per unit of fuel consumption or environmental cost This w

In [28]:
table.sample(7)

Unnamed: 0,Abstract,Conference,Conference Location,EISSN,Geographic Term,ISSN,Issue,Language,Number of Authors,Publication Year,Published On,Publisher,Record ID,Title,Volume,Type,Abstract Clean,Abstract Lemma
137824,A series of studies conducted in the 1970s we...,Transportation Research Board 97th Annual Meeting,United States,,[Davis (California)],,,English,1,20180000.0,,,1494756,"Bikeway Engineering in the 70s, a Turning Point",,Conference,A series of studies conducted in the 's were v...,series study conduct very influential set futu...
3085,The increasing number of accidents involving u...,,,,[Italy],1358-8265,4.0,English,5,20080000.0,International Journal of Crashworthiness,Taylor & Francis,868576,C IV class tram crashworthiness assessment,13.0,Journal,The increasing number of accidents involving u...,increase number accident involve urban light r...
63015,"Since Latvia joined the European Union, the na...",,,1822-4288,[Latvia],1822-427X,4.0,English,4,20130000.0,Baltic Journal of Road and Bridge Engineering,Vilnius Gediminas Technical University,1307108,Assessment of the Effectiveness of the Road Tr...,8.0,Journal,Since Latvia joined the European Union the nat...,Latvia join European Union nature traffic chan...
85024,Significant pro-competitive changes were made ...,,,1464-5254,[],0308-8839,1.0,English,1,20140100.0,Maritime Policy & Management,Taylor & Francis,1299344,Ocean shipping deregulation restructures the l...,41.0,Journal,Significant pro competitive changes were made ...,significant pro competitive change make Shippi...
31895,The Durham Fatal Landslide Database (DFLD) sho...,,,,[Asia],,4.0,English,1,20101100.0,QUARTERLY JOURNAL OF ENGINEERING GEOLOGY & HYD...,GEOLOGICAL SOCIETY (UK),1100227,The impact of climate change and population gr...,43.0,Journal,The Durham Fatal Landslide Database DFLD shows...,Durham Fatal Landslide Database DFLD show grea...
50881,This article explores the various ways that tr...,2012 Rail Conference,United States,,[],,,English,2,20120000.0,,,1224977,Transit Systems Use Recycling to Reduce Mainte...,,Conference,This article explores the various ways that tr...,article explore various way transit system use...
13334,The main objective of this paper is to present...,Proceedings of the Symposium on Pavement Mecha...,United States,,[],,,English,1,20090000.0,,,902020,State of the Art: Anisotropic Characterization...,,Conference,The main objective of this paper is to present...,main objective paper present state art anisotr...


# Clean Geographic information

In [33]:
import pycountry

In [34]:
states = set(['Alabama','Alaska','Arizona','Arkansas','California','Colorado','Connecticut','Delaware', 'Florida','Georgia','Hawaii','Idaho','Illinois','Indiana','Iowa','Kansas','Kentucky','Louisiana','Maine','Maryland','Massachusetts','Michigan','Minnesota','Mississippi','Missouri','Montana','Nebraska','Nevada','New Hampshire','New Jersey','New Mexico','New York','North Carolina','North Dakota','Ohio','Oklahoma','Oregon','Pennsylvania','Rhode Island','South Carolina','South Dakota','Tennessee','Texas','Utah','Vermont','Virginia','Washington','West Virginia','Wisconsin','Wyoming', "District of Columbia"])
print(len(states))
countries = set([country.name for country in pycountry.countries])
countries = countries.union(set(['Bolivia', 'Korea', 'Palestine', 'Russia', 'Taiwan', 'Tanzania', 'Vietnam', 'Iran', "Scotland", "England", "Czech Republic"]))
len(countries)

51


260

In [ ]:
def detect_geographic_region(row):
    if len(row[col_geographic_term]) == 0 and not pd.isnull(row[col_abstract]):
        doc = nlp(row[col_abstract_clean])
        locations = set()
        for x in doc.ents:
            if x.label_ == "GPE":
                locations.add(x.text)
        return list(locations)
    else:
        return row[col_geographic_term]

In [ ]:
table[col_geo_clean] = table.apply(detect_geographic_region, axis=1)

In [ ]:
def sanitize_country_name(row):
    saved_list = row[col_geo_clean]
    new_list = []
    for c in countries:
        new_list.extend([c for saved_country in saved_list if c.lower() in saved_country.lower()])
    
    for s in states:
        new_list.extend(["United States" for saved_country in saved_list if s.lower() in saved_country.lower()])
    
    if "England" in new_list:
        new_list.remove("England")
        new_list.append("United Kingdom")
    
    c_set = set(new_list)
    
    if "Oman" in c_set and "Romania" in c_set:
        c_set.remove("Oman")
    
    if "United States" in c_set and "Jersey" in c_set:
        c_set.remove("Jersey")
    return list(c_set)

In [ ]:
def sanitize_state_name(row):
    saved_list = row[col_geo_clean]
    new_list = []
    if "United States" in row["Sanitize"]:
        for s in states:
            new_list.extend([s for saved_country in saved_list if s.lower() in saved_country.lower()])
    
    c_set = set(new_list)
    
    if "Washington" in c_set and "District of Columbia" in c_set:
        c_set.remove("Washington")
    
    return list(c_set)

In [ ]:
table["Sanitize"] = table.apply(sanitize_country_name, axis=1)
table["Sanitize State"] = table.apply(sanitize_state_name, axis=1)

In [ ]:
table.to_pickle("files/3.0 data_clean_text_region.parquet")
table.to_csv("files/3.0 data_clean_text_region.csv")