In [None]:
# Punctuation characters that clean_text replaces with a space. The value is
# spliced into a regex character class ('[' + my_punctuation + ']+'), so the
# two backslashes it contains are deliberate: inside that class they escape
# the ']' and the '`' that follow them. Both backslashes are written as '\\'
# so the literal contains no invalid string escape sequence.
my_punctuation = '-!"$%^&*()_+=~:;,<.>?/[\\]\\`{|}@'
def remove_links(text):
    """Delete every link-like token from *text*.

    A link is any occurrence of ``http`` together with all non-whitespace
    characters that follow it; the surrounding whitespace is left in place.

    Args:
        text: The string to scrub.

    Returns:
        A copy of *text* with each match removed.
    """
    link_pattern = re.compile(r'http\S+')
    return link_pattern.sub('', text)

def remove_unicode_chars(text):
    """Drop every character of *text* that cannot be encoded as ASCII.

    Args:
        text: The string to filter.

    Returns:
        The ASCII-only remainder of *text*; non-ASCII characters are
        discarded, not transliterated.
    """
    # Encoding with errors="ignore" silently skips unencodable characters.
    return text.encode("ascii", "ignore").decode()

def clean_text(text):
    """Normalize a raw text string.

    Steps, in order: remove links, drop non-ASCII characters, lowercase,
    replace runs of punctuation with one space, delete backslashes, delete
    runs of digits, and finally collapse every run of whitespace into a
    single space.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string. Leading and trailing whitespace is collapsed
        like any other whitespace run but is not stripped.
    """
    text = remove_links(text)
    text = remove_unicode_chars(text)
    text = text.lower()
    text = re.sub('[' + my_punctuation + ']+', ' ', text)
    # Backslashes are deleted outright rather than replaced by a space.
    text = text.replace('\\', '')
    # Digits are removed BEFORE whitespace is collapsed; doing it afterwards
    # left a double space wherever a standalone number had been
    # ("a 12 b" -> "a  b").
    text = re.sub(r'[0-9]+', '', text)
    # Raw string: '\s' in a plain string literal is an invalid escape.
    text = re.sub(r'\s+', ' ', text)

    return text

def define_stop_words():
    """Build the stop-word list used when filtering tokens.

    Returns:
        The list produced by ``nltk_SW.words('english')`` with the custom
        words below appended to its end.
    """
    # Extra words filtered out on top of the English stop words.
    domain_words = ['come','order','try','go','get','make','drink','plate','dish','restaurant','place']

    # NLTK stop words, extended in place with the custom additions.
    words = nltk_SW.words('english')
    words += domain_words
    return words


def remove_stopwords(text, tokenizer):
    """Tokenize documents and drop stop words, lowercasing what remains.

    The stop-word set is the union of ``nlp.Defaults.stop_words``, the custom
    words below, ``SW`` and ``stopwords``.

    NOTE: a later cell defines ``remove_stopwords(texts)`` with a different
    signature; once that cell has run, it replaces this function.

    Args:
        text: Documents handed to ``tokenizer.pipe`` (presumably an iterable
            of strings -- confirm against the caller).
        tokenizer: Object whose ``pipe(text, batch_size=500)`` yields
            iterables of tokens exposing a ``.text`` attribute.

    Returns:
        A list with one entry per document; each entry is the list of
        lowercased token texts that are not in the stop-word set.
    """
    # Custom stop words added to the library-provided ones.
    extra_words = ['come','order','try','go','get','make','drink','plate','dish','restaurant','place']

    # spaCy defaults + custom words + SW + stopwords
    blocked = nlp.Defaults.stop_words.union(extra_words).union(SW).union(stopwords)

    def _kept(doc):
        # Lowercase each token once, then filter on the lowercased form.
        lowered = (tok.text.lower() for tok in doc)
        return [word for word in lowered if word not in blocked]

    return [_kept(doc) for doc in tokenizer.pipe(text, batch_size=500)]

# Disabled code: the string literal below holds two earlier helpers
# (get_lemmas and generate_lemma_tokens). It is an inert string expression;
# its contents are not run as code.
'''

def get_lemmas(text):
    lemmas = []
    
    doc = nlp(text)
    
    for token in doc:
        if ((token.is_stop == False) and (token.is_punct == False) and (token.pos_ != 'PRON')):
            lemmas.append(token.lemma_)
    return lemmas


def generate_lemma_tokens(df):
    tokenizer = Tokenizer(nlp.vocab)
    
    # remove stop words and generate tokens
    df['tokens'] = remove_stopwords(df['clean_text'], tokenizer)
    
    df['tokens_to_text'] = [' '.join(map(str, l)) for l in df['tokens']]
    
    df['lemmas'] = df['tokens_to_text'].apply(get_lemmas)
    
    return df
'''

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize each sentence with gensim's ``simple_preprocess``.

    Args:
        sentences: Iterable of sentences; each item is passed through
            ``str()`` before tokenizing.

    Yields:
        One token list per input sentence, in input order.
    """
    for raw in sentences:
        # deacc=True asks simple_preprocess to strip accent marks.
        tokens = gensim.utils.simple_preprocess(str(raw), deacc=True)
        yield tokens


In [None]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenize each document and drop the words from define_stop_words().

    This definition has the same name as the earlier
    ``remove_stopwords(text, tokenizer)`` and replaces it once executed.

    Args:
        texts: Iterable of documents. Each one is converted with ``str()``
            and tokenized with ``simple_preprocess`` before filtering.

    Returns:
        A list with one entry per document: the tokens that are not stop
        words, in their original order.
    """
    # Build the lookup once as a set: membership tests are then O(1) per
    # word instead of a linear scan of the stop-word list.
    stop_words = set(define_stop_words())
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_words]
        for doc in texts
    ]

def make_bigrams(texts, bigram_mod):
    """Apply a bigram model to every document.

    Args:
        texts: Iterable of documents.
        bigram_mod: Object indexed with a document (``bigram_mod[doc]``) to
            obtain its transformed form.

    Returns:
        A list holding ``bigram_mod[doc]`` for each document, in order.
    """
    merged = []
    for document in texts:
        merged.append(bigram_mod[document])
    return merged

def make_trigrams(texts, trigram_mod, bigram_mod=None):
    """Apply the bigram model and then the trigram model to every document.

    Args:
        texts: Iterable of documents.
        trigram_mod: Object indexed with a bigram-transformed document.
        bigram_mod: Object indexed with a raw document. Previously this was
            read from an undeclared global; it is now an explicit, optional
            argument. When omitted, a module-level ``bigram_mod`` is used if
            one exists, so existing two-argument calls keep working.

    Returns:
        A list holding ``trigram_mod[bigram_mod[doc]]`` for each document.

    Raises:
        NameError: If a document has to be processed but no bigram model was
            passed and no module-level ``bigram_mod`` exists.
    """
    if bigram_mod is None:
        # Backward-compatible fallback to the global the old code relied on.
        bigram_mod = globals().get("bigram_mod")

    result = []
    for doc in texts:
        # Checked lazily so that empty input still returns [] without a
        # model, exactly as before.
        if bigram_mod is None:
            raise NameError(
                "name 'bigram_mod' is not defined; pass it as the third "
                "argument to make_trigrams"
            )
        result.append(trigram_mod[bigram_mod[doc]])
    return result

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize documents, keeping only tokens with an allowed POS tag.

    Each document's tokens are joined with single spaces, run through
    ``nlp`` (presumably a loaded spaCy pipeline -- confirm), and the
    ``lemma_`` of every resulting token whose ``pos_`` is in
    *allowed_postags* is kept.

    Args:
        texts: Iterable of documents, each an iterable of string tokens.
        allowed_postags: Collection of POS tags to keep. The default is an
            immutable tuple, so it cannot be mutated across calls.

    Returns:
        A list with one list of lemmas per input document.
    """
    return [
        [token.lemma_ for token in nlp(" ".join(sent)) if token.pos_ in allowed_postags]
        for sent in texts
    ]

In [None]:
def generate_lemma_text(clean_text):
    """Turn cleaned documents into lists of lemmas with bigrams merged.

    Pipeline: tokenize, train a bigram phrase model on the tokens, remove
    stop words, apply the bigram model, then lemmatize keeping only nouns,
    adjectives, verbs and adverbs.

    Args:
        clean_text: Object exposing ``.values.tolist()`` (presumably a pandas
            Series of cleaned strings -- confirm against the caller). Inside
            this function the parameter name hides the module-level
            ``clean_text`` function.

    Returns:
        The value returned by ``lemmatization``: one list of lemmas per
        input document.
    """
    data = clean_text.values.tolist()

    # Tokenization
    data_words = list(sent_to_words(data))

    # Build the bigram model (a higher threshold yields fewer phrases) and
    # wrap it for faster application. The trigram model that used to be
    # trained here was never applied to the output, so that dead (and
    # costly) training step has been removed.
    bigram = gensim.models.Phrases(data_words, min_count=15, threshold=100)
    bigram_mod = gensim.models.phrases.Phraser(bigram)

    # Remove stop words
    data_words_nostops = remove_stopwords(data_words)

    # Form bigrams
    data_words_bigrams = make_bigrams(data_words_nostops, bigram_mod)

    # Lemmatize, keeping only noun, adj, verb, adv
    return lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])