In [None]:
# Import libraries
import sys, re, glob
import numpy as np, pandas as pd
from cleantext import clean
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary, MmCorpus
from gensim.utils import simple_preprocess
from gensim.models.phrases import Phrases, Phraser
from nltk.corpus import stopwords
from pprint import pprint
import gensim, spacy, logging, warnings
import preprocessor as p

# Select the URL and EMOJI options of the `preprocessor` module; these
# settings apply to the p.clean() call made inside clean_text().
p.set_options(p.OPT.URL, p.OPT.EMOJI)

# English stop-word list from NLTK (the NLTK 'stopwords' corpus must be
# available locally); it is the default `stop_words` of process_words().
stop_words = stopwords.words('english')

# Silence DeprecationWarning messages and only emit log records at ERROR
# level or above, so library chatter does not flood the cell output.
warnings.filterwarnings("ignore",category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# Load the merged dataset. The 'utf-8-sig' codec decodes UTF-8 and drops a
# leading byte-order mark if the file has one.
df = pd.read_csv('../data/merged.csv', encoding="utf-8-sig")
# Preview the first rows as the cell output.
df.head()

In [None]:
def clean_text(row):
    """Return the normalized text of one DataFrame row.

    The value of ``row['merged']`` is first passed through ``p.clean`` (the
    ``preprocessor`` module, configured at module level with
    ``p.set_options``) and then through ``cleantext.clean`` with the options
    listed below: lower-casing, ASCII transliteration, and removal of line
    breaks, URLs, e-mail addresses, phone numbers, numbers, digits, currency
    symbols and punctuation, each replaced with an empty string.

    Parameters
    ----------
    row : mapping
        A DataFrame row (or any mapping) with a ``'merged'`` entry.

    Returns
    -------
    str
        The cleaned text, or ``""`` when the ``'merged'`` value is missing
        (``None`` / NaN / ``pd.NA``).
    """
    text = row['merged']
    if not isinstance(text, str):
        # Empty CSV cells are loaded by pandas as NaN (a float). The cleaners
        # below work on strings, so map missing values to an empty document
        # instead of handing them a non-string object.
        if pd.isna(text):
            return ""
        # Any other scalar (e.g. a number parsed by read_csv) is cleaned as
        # its text form.
        text = str(text)
    text = p.clean(text)
    text = clean(text, fix_unicode=True, to_ascii=True, lower=True, no_line_breaks=True,
                 no_urls=True, no_emails=True, no_phone_numbers=True, no_numbers=True,
                 no_digits=True, no_currency_symbols=True, no_punct=True, lang="en",
                 replace_with_punct="", replace_with_url="", replace_with_email="",
                 replace_with_phone_number="", replace_with_number="", replace_with_digit="",
                 replace_with_currency_symbol="")
    return text

# Run clean_text on every row (axis=1 passes each row to the function) and
# store the cleaned strings in a new 'clean' column.
df['clean'] = df.apply(clean_text, axis=1)

In [None]:
def sent_words(sentences):
    """Lazily tokenize each item of ``sentences``.

    Every item is converted with ``str()`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True`` and a minimum
    token length of 2.

    Yields
    ------
    list of str
        The tokens of one input item, in input order.
    """
    for raw_text in sentences:
        doc_tokens = gensim.utils.simple_preprocess(str(raw_text), deacc=True, min_len=2)
        yield doc_tokens

# Collect the cleaned documents into a plain list and tokenize each one
data = df['clean'].tolist()
words = [doc_tokens for doc_tokens in sent_words(data)]

In [None]:
# Bigram detector: trained on the raw token lists, then frozen for lookup
bigram = Phrases(words, min_count=5, threshold=1, delimiter='_')
bigram_mod = Phraser(bigram)

# Trigram detector: trained on the bigram-merged token lists, then frozen
trigram = Phrases(bigram[words], threshold=1, delimiter='_')
trigram_mod = Phraser(trigram)

In [None]:
def process_words(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Remove stop words, merge detected phrases and lemmatize documents.

    Parameters
    ----------
    texts : iterable
        Documents to process. Each one is converted with ``str()`` and
        re-tokenized with ``simple_preprocess``, so both token lists and
        plain strings are accepted.
    stop_words : iterable of str
        Words dropped before phrase detection and again after lemmatization.
    allowed_postags : iterable of str
        Values of spaCy's ``token.pos_`` to keep; tokens with any other tag
        are discarded.

    Returns
    -------
    list of list of str
        One token list per input document, in input order.
    """
    # Set membership avoids scanning the whole list for every single token.
    stop_set = set(stop_words)
    allowed = set(allowed_postags)

    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]
    if not texts:
        # Nothing to process: skip loading the spaCy model altogether.
        return []

    # Apply the bigram model once and the trigram model on top of it. This
    # mirrors how the trigram model was trained (on single-pass bigram
    # output); the previous code ran the bigram model a second time.
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]

    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    # nlp.pipe processes the documents as a stream instead of one call each.
    joined_docs = (" ".join(sent) for sent in texts)
    texts_out = [
        [token.lemma_ for token in doc if token.pos_ in allowed]
        for doc in nlp.pipe(joined_docs)
    ]

    # Re-tokenize the lemmas (max_len=20 here, versus the default limit used
    # in the first pass) and drop any stop words that lemmatization produced.
    return [
        [word for word in simple_preprocess(str(doc), max_len=20) if word not in stop_set]
        for doc in texts_out
    ]

# Preprocess the tokenized documents: stop-word removal, phrase merging and
# lemmatization (see process_words)
tokens = process_words(words)

# Attach the resulting token lists to the DataFrame, one list per row, and
# preview the result
df['tokens'] = tokens
df.head()

In [None]:
# Save the DataFrame, including the 'clean' and 'tokens' columns, as
# 'cleaned-dataset.csv' in the current working directory (no index column).
# NOTE(review): the 'tokens' values are Python lists, so CSV stores them as
# their string representation; presumably they must be parsed back (e.g.
# with ast.literal_eval) when this file is reloaded -- confirm downstream.
df.to_csv('cleaned-dataset.csv', index=False, encoding='utf-8-sig')