In [1]:
import nltk
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

from gensim.models import Phrases
from gensim.models.phrases import Phraser
from nltk.corpus import stopwords

In [2]:
def sent_to_words(sentences):
    """Lazily tokenize every item of ``sentences``.

    Each item is converted with ``str()`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``. Note that
    ``deacc=True`` strips accent marks from the tokens; dropping punctuation
    is part of gensim's tokenization itself, not of the ``deacc`` flag.

    Yields:
        One token list per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        for raw_text in sentences
    )
        
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Tokenize each document and drop every token found in ``stop_words``.

    Args:
        texts: Iterable of documents. Each document is converted with
            ``str()`` and tokenized by ``simple_preprocess`` before filtering.

    Returns:
        A list holding, for every input document (in order), the list of
        tokens that are not stop words.

    Note:
        The module-level ``stop_words`` collection is read at call time.
    """
    # Build the lookup set once per call: set membership is O(1) per token,
    # whereas testing against the stop-word list scans it for every token.
    blocked = frozenset(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]

def make_bigrams(texts):
    """Run every document in ``texts`` through the module-level ``bigram_mod``.

    Returns:
        A list with ``bigram_mod[doc]`` for each document, in input order.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    Both models are module-level names looked up when the function runs.

    Returns:
        A list with ``trigram_mod[bigram_mod[doc]]`` for each document,
        in input order.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    POS tag reference: https://spacy.io/api/annotation

    Args:
        texts: Iterable of documents, each an iterable of token strings.
            The tokens of a document are joined with single spaces before
            being processed by the module-level spaCy pipeline ``nlp``.
        allowed_postags: Container of POS tags; a token is kept only when
            its ``pos_`` is in this container. The default is an immutable
            tuple so it cannot be altered between calls.

    Returns:
        A list with one list of lemmas (``token.lemma_``) per input document,
        in input order.
    """
    # nlp.pipe processes the texts as a stream in batches and yields the
    # resulting docs in input order, avoiding one full pipeline call per text.
    joined_texts = (" ".join(sent) for sent in texts)
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_texts)
    ]

In [3]:
# Start from NLTK's English stop-word list, then append extra words
# that should also be filtered out of this corpus.
stop_words = stopwords.words('english')
stop_words += ['from', 'subject', 're', 'edu', 'use']

In [4]:
# Load the raw corpus; the first row of the CSV supplies the column names.
corpus = pd.read_csv(
    'datasheets/corpus.csv',
    header=0,
)

In [10]:
# Materialize the tokenized texts; this list is the training input for the
# phrase (bigram/trigram) models.
grams_corpus = [tokens for tokens in sent_to_words(corpus['text'])]

In [12]:
# Train the phrase-detection models on the tokenized corpus.
# A higher threshold yields fewer detected phrases (values used here:
# min_count=5 / threshold=100 for bigrams, threshold=100 for trigrams).
bigram = Phrases(grams_corpus, min_count=5, threshold=100)
# The trigram model is trained on the corpus after bigram merging.
trigram = Phrases(bigram[grams_corpus], threshold=100)

# Phraser wrappers: a faster way to get a sentence clubbed as a bigram/trigram.
bigram_mod, trigram_mod = Phraser(bigram), Phraser(trigram)



In [16]:
# Remove stop words: tokenize every text and drop the tokens listed in stop_words,
# leaving one token list per row in the 'text' column.
# NOTE(review): this overwrites corpus['text'] in place, so the cell is not
# idempotent -- re-running it feeds the already-tokenized lists back through str().
corpus['text'] = remove_stopwords(corpus['text'])

In [19]:
# Form bigrams: pass every row's token list through the bigram model (bigram_mod).
# NOTE(review): overwrites corpus['text'] in place; re-running the cell applies
# the bigram model a second time to its own output.
corpus['text'] = make_bigrams(corpus['text'])

In [21]:
# Initialize the spaCy English model with the parser and NER components
# disabled (for efficiency).
# Install the model first with: python3 -m spacy download en_core_web_sm
# The full package name is used instead of the 'en' shortcut link, because
# shortcut links are not supported by spaCy v3+, while the package name loads
# on older versions as well.
# NOTE(review): assumes 'en' was linked to the small English model -- confirm.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=

In [22]:
# Lemmatize every row's tokens, keeping only nouns, adjectives, verbs and adverbs.
# Requires the spaCy pipeline `nlp` to be loaded beforehand.
# NOTE(review): overwrites corpus['text'] in place, so the cell is not idempotent.
corpus['text'] = lemmatization(corpus['text'], allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [24]:
# Persist the prepared corpus as a tab-separated, UTF-8 encoded file.
corpus.to_csv(
    'datasheets/corpus_prepared.csv',
    sep='\t',
    encoding='utf-8',
)