In [2]:
# setting up libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm 
import statistics
from sklearn.metrics import matthews_corrcoef
from scipy.stats import chi2_contingency
import math
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import IsolationForest
from scipy.stats import zscore

import re
import nltk
import spacy
import string

# Display settings: never truncate rows, columns or line width so that whole
# abstracts can be inspected in cell outputs.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# None means "do not truncate cell text". The former value -1 has been
# deprecated since pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

%matplotlib inline

  pd.set_option('display.max_colwidth', -1)


In [3]:
# Load the abstracts workbook into a DataFrame.
# Non-English abstracts were already removed from this file.
df = pd.read_excel(io="list_of_abstracts.cleaned.xlsx")
# Report (rows, columns), then preview the first five records.
print((len(df.index), len(df.columns)))
df.head(5)

(209, 2)


Unnamed: 0,abstract,size
0,"industry. As the aviation industry grows rapidly to meet the requirements of the increased applications in the aviation sector has been increased sharply in recent years. Among the various clean power sources, energy obtained from hydrogen is considered the future for energy generation in the aviation industry due to its cleanness and abundance. This paper aims to give an overview of the potential aviation applications where hydrogen and fuel cell",451
1,STATUTO Interpretation Act 1889{b) applies to lhe interpretation of these fluidised char binderless briquettes manufactured by the National Coal Government on 22nd July 1963. (1..5.) their .f!enerai purport.) 11 of the Clean Air Act 1956 makes it an offence to emit smoke any building in a smoke control area unless it can be shown that the TheseRegulations declare the fuel mentioned therein to be TIIREEPENCB NeT,414
2,"in the aviation sector has been increased sharply in recent years. Among the various clean power sources, energy obtained from hydrogen is considered the future for energy generation in the aviation industry due to its cleanness and abundance. This paper aims to give an overview of the potential aviation applications where hydrogen and fuel cell technology can be used. Also, the major challenges that limit the wide adoption of hydrogen technology in aviation are highlighted and future research prospects are identified.",524
3,"Worker movements played a crucial role in making workplaces safer. Workplace safety is costly for firms but increases labour supply. A laissez-faire approach leaving safety of workplaces unknown is suboptimal. Safety standards set by better-informed trade unions are output and welfare increasing. Trade between a country with trade unions (the North) and a union-free country (the South) can imply a reduction in work standards in the North. When trade unions are established in the South, the North, including northern unions, tend to lose. Quantitatively, these effects are small and overcompensated by gains in the South.",625
4,"-Cloud computing and its pay-as-you-go model continue to provide significant cost benefits and a seamless service delivery model for cloud consumers. The evolution of small-scale and large-scale geo-distributed datacenters operated and managed by individual cloud service providers raises new challenges in terms of effective global resource sharing and management of autonomously-controlled individual datacenter resources. Earlier solutions for geo-distributed clouds have focused primarily on achieving global efficiency in resource sharing that results in significant inefficiencies in local resource allocation for individual datacenters leading to unfairness in revenue and profit earned. In this paper, we propose a new contracts-based resource sharing model for federated geo-distributed clouds that allows cloud service providers to establish resource sharing contracts with individual datacenters apriori for defined time intervals during a 24 hour time period. Based on the established contracts, individual cloud service providers employ a cost-aware job scheduling and provisioning algorithm that enables tasks to complete and meet their response time requirements. The proposed techniques are evaluated through extensive experiments using realistic workloads and the results demonstrate the effectiveness, scalability and resource sharing efficiency of the proposed model.",1386


In [4]:
## (1) Lower-case every whitespace-separated token
# Splitting and re-joining also collapses any run of whitespace to one space.
df["abstract_new"] = df["abstract"].apply(lambda text: ' '.join(map(str.lower, text.split())))

In [6]:
# (2) Keep only tokens that are at least 3 characters long
df["abstract_new"] = df["abstract_new"].apply(
    lambda text: ' '.join(token for token in text.split() if len(token) >= 3)
)

In [8]:
## (3) Removal of URLs
def remove_urls(text):
    """Return `text` with URLs deleted.

    A URL is any run of non-whitespace characters that starts with
    ``http://`` / ``https://`` or with ``www.``. Matches are replaced by the
    empty string, so the whitespace around them is left as it was.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

# Strip URLs from every abstract, then store the result back on the frame.
abstracts_without_urls = df['abstract_new'].apply(remove_urls)
df['abstract_new'] = abstracts_without_urls

In [10]:
# (4) Collapse every run of whitespace into a single space

df["abstract_new"] = df["abstract_new"].apply(lambda text: ' '.join(text.split()))

In [12]:
# (5) Drop tokens made up only of digits
# Open question from the author: how do we judge whether a number is relevant?

def _drop_digit_tokens(text):
    """Return `text` without the whitespace-separated tokens for which str.isdigit() is true."""
    kept = []
    for token in text.split():
        if token.isdigit():
            continue
        kept.append(token)
    return ' '.join(kept)

df["abstract_new"] = df["abstract_new"].apply(_drop_digit_tokens)

In [14]:
# (6) Remove words that contain a digit
# regex=True is required: pandas 2.0 changed the default of Series.str.replace
# to regex=False, which would treat this pattern as a literal string and
# silently leave every such token in place.
# Each match is replaced by '', so the spaces around a removed token remain.
df['abstract_new'] = df['abstract_new'].str.replace(r'\S*\d\S*', '', regex=True)

  df['abstract_new'] = df.abstract_new.str.replace(r'\S*\d\S*','')


In [15]:
# (7) Remove all punctuation (for example, parentheses, commas, periods, etc.)
# The pattern deletes every character that is neither a word character (\w,
# which includes the underscore) nor whitespace.
# regex=True is required: with the pandas >= 2.0 default (regex=False) the
# pattern would be matched literally and nothing would be removed. The raw
# string avoids invalid-escape warnings for \w and \s.
df['abstract_new'] = df['abstract_new'].str.replace(r'[^\w\s]', '', regex=True)

  df['abstract_new'] = df['abstract_new'].str.replace('[^\w\s]','')


In [17]:
# (8) Remove e-mail addresses
# The pattern is applied to the whole abstract. Iterating over the string
# (as before) yields single characters, so the pattern could only ever delete
# a lone '@' and never the address around it.
# NOTE(review): this runs after the punctuation step, which also deletes '@';
# to actually catch addresses this step would have to run before step (7).
df['abstract_new'] = df["abstract_new"].apply(lambda x: re.sub(r'\S*@\S*\s?', '', x))

In [18]:
# (9) Replace new lines / runs of whitespace with a single space
# The pattern is applied to the whole abstract so that '\s+' can match a run.
# Applied character by character (as before) it only turned each individual
# whitespace character into a space and never collapsed the double spaces
# left behind by the removal steps above.
df['abstract_new'] = df['abstract_new'].apply(lambda x: re.sub(r'\s+', ' ', x))

In [19]:
# (10) Delete every single-quote character from the abstracts
df["abstract_new"] = df["abstract_new"].apply(lambda text: re.sub("\'","", text))

In [21]:
# (11) Spelling correction -- this step takes a long time to run
# Spell check using TextBlob
from textblob import TextBlob

def _spell_correct(text):
    """Run TextBlob's correct() on `text` and return the result as a plain str."""
    corrected = TextBlob(text).correct()
    return str(corrected)

df['abstract_new'] = df['abstract_new'].apply(_spell_correct)

In [23]:
# (12) Expand contractions 
import contractions

# Expand contractions token by token, then re-join with single spaces.
df["abstract_new"] = df["abstract_new"].apply(
    lambda text: ' '.join(map(contractions.fix, text.split()))
)

In [25]:
# (13) Remove stop words (the, to be, etc.)
# Fetch the NLTK stop-word corpus before it is read below.
nltk.download('stopwords')

from nltk.corpus import stopwords

# English stop-word list; later cells extend it and filter the abstracts with it.
STOPWORDS = stopwords.words('english')

[nltk_data] Downloading package stopwords to /Users/djoko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
# Add more stop words, as suggested by the domain expert.
# 'author' replaces the misspelled 'auther' from the earlier list; a misspelled
# stop word does not filter the real word.
DOMAIN_STOPWORDS = [
    'from', 'subject', 're', 'edu', 'use',
    'author', 'paper', 'review', 'datum', 'output', 'input', 'result', 'analysis', 'case',
]

# Append only the words that are not in the list yet, so re-running this cell
# does not keep growing STOPWORDS with duplicates.
STOPWORDS.extend(word for word in DOMAIN_STOPWORDS if word not in STOPWORDS)

In [29]:
# Function to remove the stopwords
def stopwords(text):
    """Return `text` (coerced to str) without the tokens listed in STOPWORDS.

    Tokens are whitespace-separated and the survivors are re-joined with
    single spaces.

    NOTE(review): this function reuses the name of the `stopwords` corpus
    imported from nltk.corpus, so once this cell has run that name refers to
    this function instead.
    """
    kept = []
    for token in str(text).split():
        if token in STOPWORDS:
            continue
        kept.append(token)
    return " ".join(kept)

# Filter the stop words out of every cleaned abstract
df["abstract_new"] = df["abstract_new"].apply(stopwords)

In [31]:
# (14) Lemmatization (convert each word into its base form)
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer

lmtzr = WordNetLemmatizer()

def _lemmatize_as_verbs(text):
    """Lemmatize each whitespace-separated token of `text` with POS tag 'v'."""
    return ' '.join(lmtzr.lemmatize(token, 'v') for token in text.split())

df["abstract_new"] = df["abstract_new"].apply(_lemmatize_as_verbs)

[nltk_data] Downloading package wordnet to /Users/djoko/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [33]:
# (15) Stemming 

from nltk.stem import PorterStemmer

ps = PorterStemmer()

def _stem_tokens(text):
    """Apply the Porter stemmer to each whitespace-separated token of `text`."""
    return ' '.join(map(ps.stem, text.split()))

df["abstract_new"] = df["abstract_new"].apply(_stem_tokens)

In [35]:
# Sanity check: display (rows, columns) of the frame after the cleaning steps so far.
df.shape

(209, 3)

In [38]:
# (16) Common word removal

# Inspect the 10 most frequent words; alternatively, use a cut-off of 90% of the documents (df.shape[0] * 0.9)
from collections import Counter

# Count how often each token occurs across all cleaned abstracts.
cnt = Counter(
    token
    for abstract in df["abstract_new"].values
    for token in abstract.split()
)

In [39]:
# Show the 10 most frequent tokens together with their counts.
cnt.most_common(10)
# Review this list first, then decide which words to drop as too common.

[('growth', 234),
 ('economi', 233),
 ('develop', 223),
 ('inclus', 210),
 ('green', 193),
 ('countri', 184),
 ('fuel', 172),
 ('econom', 170),
 ('educ', 149),
 ('studi', 144)]

In [42]:
# Removing the frequent words
# The 8 most frequent tokens, kept in a set for membership tests.
freq = {token for token, _count in cnt.most_common(8)}

# Helper that strips those frequent tokens from one abstract
def freqwords(text):
    """Return `text` (coerced to str) without the tokens contained in `freq`."""
    remaining = [token for token in str(text).split() if token not in freq]
    return " ".join(remaining)

# Apply the helper to every cleaned abstract
df["abstract_new"] = df["abstract_new"].apply(freqwords)

In [44]:
# Write the whole frame to CSV, leaving out the index column.
df.to_csv(path_or_buf='cleaned_abstracts_update.csv', index=False)