In [None]:
import nltk
# Fetch the NLTK 'stopwords' corpus; stopwords.words('english') is read from it
# further down in this notebook.
nltk.download('stopwords')

In [None]:
import pandas as pd
import numpy as np


import string
import re
from pprint import pprint

# NLTK
# NOTE(review): `stopwords` is imported twice below; the second import is redundant.
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer  # NOTE(review): not used in the visible cells
from nltk.corpus import stopwords
# English stop-word list; choose the corpus language that matches the dataset.
# A later cell rebuilds this list and extends it with corpus-specific words.
stop_words = stopwords.words('english')

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional.
# level=logging.ERROR means only records at ERROR level or above are emitted.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
# Seed NumPy's global random number generator.
np.random.seed(2020)
import warnings
# Silence DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [None]:
import codecs
import os

In [None]:
# Only two columns are named here. Column names that were listed earlier but are
# currently disabled: 'question_type', 'question_description', 'question_title',
# 'answer', 'answer_date', 'ministry'.
col_names = ['question_description', 'primary_question']

# skiprows=1 discards the first line of the file (presumably its own header row)
# so that col_names above is used for the column labels instead.
df = pd.read_csv(
    'input_csv/train.csv',
    skiprows=1,
    names=col_names,
)

In [None]:
# Literal marker text that identifies rows to drop; presumably the residue of
# text that was mis-encoded upstream - TODO confirm.
BAD_CHARS = ['(?) ?????']

# Plain alternation of the escaped literals. No capture groups are used:
# pandas emits a UserWarning from str.contains() when the pattern has match groups.
pat = '|'.join(re.escape(c) for c in BAD_CHARS)

# na=True flags rows with a missing question as "bad" so they are dropped too;
# without it a missing question leaves NaN in the mask and the filter raises.
# The text-cleaning cells further down expect every remaining question to be a string.
df = df[~df['primary_question'].str.contains(pat, na=True)]
len(df)

In [None]:
from nltk.corpus import stopwords

# Start again from NLTK's English stop words and append terms that are treated
# as noise for this corpus.
stop_words = stopwords.words('english') + [
    'whether',
    'government',
    'governments',
    'fact',
    'aware',
    'ministry',
    'ministries',
]

In [None]:
# Pull the question text out of the frame as a plain Python list.
data = df.primary_question.values.tolist()

# Collapse every run of whitespace to a single space, then drop apostrophes.
# The pattern is a raw string: '\s' inside a normal string literal is an invalid
# escape sequence, which newer Python versions warn about.
data = [re.sub(r'\s+', ' ', sent).replace("'", "") for sent in data]

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize each sentence with gensim's ``simple_preprocess``.

    Every item is converted with ``str()`` first, so non-string entries are
    tokenized from their string form. ``deacc=True`` is forwarded to gensim,
    where it requests accent stripping (punctuation is dropped by the
    tokenizer itself).

    Yields:
        One list of tokens per input sentence, in input order.
    """
    for raw in sentences:
        text = str(raw)
        yield gensim.utils.simple_preprocess(text, deacc=True)

# Materialize the generator so the tokenized corpus can be reused by later cells.
data_words = [tokens for tokens in sent_to_words(data)]

# Peek at the first tokenized question.
print(data_words[:1])

In [None]:
# Build the bigram and trigram models.
# `bigram` is trained on the tokenized questions; `trigram` is trained on the
# bigram-transformed corpus (bigram[data_words]).
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=1000) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=1000)  # min_count is not passed here, so gensim's default applies

# Wrap the trained models in Phraser objects; these are what make_bigrams /
# make_trigrams apply. (Original note: faster way to get a sentence clubbed as
# a trigram/bigram.)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example: the first question after both phrasers are applied.
print(trigram_mod[bigram_mod[data_words[0]]])

In [None]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Return ``texts`` with every stop word removed.

    Each document is converted with ``str()`` and tokenized again with
    ``simple_preprocess`` before filtering against the module-level
    ``stop_words`` list.

    NOTE(review): when the documents are already token lists, ``str(doc)`` is the
    list's repr and the re-tokenization looks redundant - confirm before
    simplifying.

    Args:
        texts: Iterable of documents (token lists or strings).

    Returns:
        A list with one list of kept tokens per document, in input order.
    """
    # Build the lookup set once per call: set membership is O(1), whereas the
    # stop_words list would be scanned linearly for every single token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` phraser to every document.

    Returns a list holding the transformed version of each document, in input
    order.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    Both phrasers are module-level objects. Returns a list holding the
    transformed version of each document, in input order.
    """
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize every document with the module-level spaCy pipeline ``nlp``.

    Each token list is joined with single spaces, run through ``nlp``, and
    replaced by the ``lemma_`` of every resulting token.

    Note: ``allowed_postags`` is accepted but currently ignored - the
    part-of-speech filter (``token.pos_ in allowed_postags``) is disabled, so
    all tokens are kept. See https://spacy.io/api/annotation for the tag set.

    Returns:
        A list with one list of lemmas per document, in input order.
    """
    return [[token.lemma_ for token in nlp(" ".join(sent))] for sent in texts]

In [None]:
# Strip stop words from the tokenized questions.
data_words_nostops = remove_stopwords(data_words)

# Apply the phrase models. NOTE: despite the "bigrams" in the variable name,
# this calls make_trigrams, so both the bigram and the trigram phraser run.
data_words_bigrams = make_trigrams(data_words_nostops)

# Lemmatization is currently disabled. To re-enable it, first initialize the
# spaCy 'en' model, keeping only the tagger component (for efficiency):
# python3 -m spacy download en
#nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
#data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ','VERB', 'ADV'])

In [None]:
# One space-joined line per preprocessed question.
string_list = list(map(' '.join, data_words_bigrams))

# Write the corpus as UTF-8 text, one question per line.
with codecs.open('train.txt', 'w', 'utf-8') as f:
    for item in string_list:
        f.write(item + '\n')

In [None]:
class Sentences(object):
    """Re-iterable corpus that streams a text file one line at a time.

    Each iteration opens ``filename`` afresh and yields every line split on
    whitespace, so the object can be iterated more than once (for example for
    several training passes).

    Args:
        filename: Path of a text file with one sentence per line.
    """

    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        # Read explicitly as UTF-8 (the corpus files in this notebook are written
        # as UTF-8) instead of relying on the platform default encoding, and use
        # a context manager so the handle is closed when iteration ends.
        with open(self.filename, 'r', encoding='utf-8') as handle:
            for line in handle:
                yield line.split()

In [None]:
# Draw up to 20000 random rows. Capping at len(df) avoids the ValueError that
# DataFrame.sample raises when asked for more rows than exist (without replacement).
# No random_state is passed; presumably the draw is governed by the earlier
# np.random.seed call - verify if exact reproducibility matters.
df_read = df.sample(n=min(20000, len(df)))

In [None]:
# Work on an independent copy holding only the question column of the sample.
df_test = df_read[['primary_question']].copy()

# Same marker text that was already used to filter `df` earlier in the notebook.
BAD_CHARS = ['(?) ?????']

# Plain alternation of the escaped literals. No capture groups are used:
# pandas emits a UserWarning from str.contains() when the pattern has match groups.
pat = '|'.join(re.escape(c) for c in BAD_CHARS)
df_test = df_test[~df_test['primary_question'].str.contains(pat)]
len(df_test)

In [None]:
# Pull the sampled question text out of the frame as a plain Python list.
data_test = df_test.primary_question.values.tolist()

# Collapse every run of whitespace to a single space, then drop apostrophes.
# The pattern is a raw string: '\s' inside a normal string literal is an invalid
# escape sequence, which newer Python versions warn about.
data_test = [re.sub(r'\s+', ' ', sent).replace("'", "") for sent in data_test]

In [None]:
# Tokenize the sampled questions and drop stop words, using the same helpers as
# for the training text.
data_test_words = list(sent_to_words(data_test))
data_words_nostops_test = remove_stopwords(data_test_words)

# Apply the phrase models. NOTE: despite the "bigrams" in the variable name,
# this calls make_trigrams, so both the bigram and the trigram phraser run.
data_words_bigrams_test = make_trigrams(data_words_nostops_test)
# Lemmatization is disabled here as well:
#data_lemmatized_test = lemmatization(data_words_bigrams_test, allowed_postags=['NOUN', 'ADJ','VERB', 'ADV'])

In [None]:
# One space-joined line per preprocessed question of the sample.
string_list = list(map(' '.join, data_words_bigrams_test))

# Write the sample as UTF-8 text, one question per line.
with codecs.open('test.txt', 'w', 'utf-8') as f:
    for item in string_list:
        f.write(item + '\n')

In [None]:
# Train a Word2Vec model on the lines of train.txt and save it in the working directory.
source = 'train.txt'
model_file = 'w2v_embedding'
# Sentences reopens the file each time it is iterated.
sentences = Sentences(source)
# sg=1 selects skip-gram; min_count=15 ignores words seen fewer than 15 times.
# NOTE(review): `size` and `iter` look like the gensim 3.x parameter names
# (gensim 4 uses `vector_size` and `epochs`) - confirm the installed version.
model = gensim.models.Word2Vec(sentences, size=200, window=5, min_count=15, workers=4, sg=1, iter=10)#,ns_exponent=-0.50)
model.save(model_file)

In [None]:
# Second training run for experiments: same settings as the previous cell except
# min_count=1 (no word is dropped for being rare), saved under experiments/.
# NOTE: this rebinds `model_file`, `sentences` and `model` from the previous cell.
source = 'train.txt'
model_file = 'experiments/w2v_embedding'

# Create the output directory up front so that a missing folder cannot make
# save() fail after the whole training run has already completed.
os.makedirs(os.path.dirname(model_file), exist_ok=True)

sentences = Sentences(source)
# NOTE(review): `size` and `iter` look like the gensim 3.x parameter names
# (gensim 4 uses `vector_size` and `epochs`) - confirm the installed version.
model = gensim.models.Word2Vec(sentences, size=200, window=5, min_count=1, workers=4, sg=1, iter=10)#,ns_exponent=-0.50)
model.save(model_file)