# This notebook preprocesses the data using Pandas instead of Pyspark
### This code is used to compare the runtimes of the distributed and non-distributed approaches

In [1]:
import pandas as pd

import re
import nltk
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import STOPWORDS as gensim_words
import spacy
sp = spacy.load('en_core_web_sm')

import time

In [2]:
nltk_stopwords = set(stopwords.words('english')) \
                    .union(set(stopwords.words('german'))) \
                    .union(set(stopwords.words('french')))
gensim_stopwords = set(gensim_words)
spacy_stopwords = sp.Defaults.stop_words
# https://countwordsfree.com/stopwords
cwf_stopwords = set(line.strip() for line in open('stop_words.txt'))

all_stopwords = list( nltk_stopwords \
                        .union(gensim_stopwords) \
                        .union(spacy_stopwords) \
                        .union(cwf_stopwords) )

In [3]:
df = pd.read_csv('/home/rikz/Documents/Master/Semester2/SDDM/data/data.csv', index_col=0)
df.head(20)

Unnamed: 0,paper_id,title,list_authors,full_text
0,question0,-,-,How does temperature and humidity affect the t...
1,question1,-,-,Seasonality of transmission
2,question2,-,-,Effectiveness of inter inner travel restriction
3,question3,-,-,Effectiveness of personal protective equipment...
4,question4,-,-,Effectiveness of school distancing
5,question5,-,-,Effectiveness of case isolation isolation of e...
6,question6,-,-,Effectiveness of a multifactorial strategy pre...
7,question7,-,-,Effectiveness of community contact reduction
8,question8,-,-,Significant changes in transmissibility in cha...
9,question9,-,-,Effectiveness of workplace distancing


In [4]:
time_before = time.time()

In [5]:
# Filter out empty papers and duplicates
df = df.dropna(subset=['full_text'])
print("Removed empty papers")
df = df.drop_duplicates(subset=['full_text'])
print("Removed duplicates")

Removed empty papers
Removed duplicates


In [6]:
# Preprocess data
def preprocess(text):
    tokenizer = nltk.tokenize.WhitespaceTokenizer()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # Make text lower case
    preprocessed = text.lower()
    # Remove numbers and special characters
    preprocessed = re.sub('[^a-z ]+', '', preprocessed)
    # Remove white spaces
    preprocessed = preprocessed.strip()
    # Lemmatize and tokenize text
    preprocessed = [lemmatizer.lemmatize(w) for w in tokenizer.tokenize(preprocessed) \
                    if lemmatizer.lemmatize(w) not in all_stopwords]
    # Remove stop words
#     preprocessed = [w for w in preprocessed if w not in all_stopwords]
    return preprocessed
    
df['preprocessed'] = df.full_text.apply(preprocess)

In [7]:
df.head(20)

Unnamed: 0,paper_id,title,list_authors,full_text,preprocessed
0,question0,-,-,How does temperature and humidity affect the t...,"[doe, temperature, humidity, affect, transmiss..."
1,question1,-,-,Seasonality of transmission,"[seasonality, transmission]"
2,question2,-,-,Effectiveness of inter inner travel restriction,"[effectiveness, inter, travel, restriction]"
3,question3,-,-,Effectiveness of personal protective equipment...,"[effectiveness, personal, protective, equipmen..."
4,question4,-,-,Effectiveness of school distancing,"[effectiveness, school, distancing]"
5,question5,-,-,Effectiveness of case isolation isolation of e...,"[effectiveness, case, isolation, isolation, ex..."
6,question6,-,-,Effectiveness of a multifactorial strategy pre...,"[effectiveness, multifactorial, strategy, prev..."
7,question7,-,-,Effectiveness of community contact reduction,"[effectiveness, community, contact, reduction]"
8,question8,-,-,Significant changes in transmissibility in cha...,"[change, transmissibility, changing, season]"
9,question9,-,-,Effectiveness of workplace distancing,"[effectiveness, workplace, distancing]"


In [8]:
time_after = time.time()

In [9]:
print('Preprocessing time: {} sec'.format(time_after-time_before) )
# Small dataset: ~25 sec 

Preprocessing time: 24.492695331573486 sec
