In [None]:
import nltk
nltk.download('stopwords')

In [None]:
import pandas as pd
import numpy as np


import string
import re
from pprint import pprint

# NLTK 
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = stopwords.words('english') #this depends on each language

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
np.random.seed(2020)
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [None]:
import argparse
import logging
import numpy as np
from time import time
import import_ipynb
import utils as U
import codecs

# Configure root logging at INFO and create this notebook's module logger.
# NOTE(review): logging.basicConfig() does nothing once the root logger has
# handlers. If the earlier cell that calls basicConfig(level=logging.ERROR)
# has already run, this call will not switch the level to INFO -- confirm
# which level is actually intended.
logging.basicConfig(
    # filename='out.log',
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(message)s')
logger = logging.getLogger(__name__)

Build the bigram/trigram phrase models in the same way as the ones used for training.

In [None]:
# Load the training questions. skiprows=1 skips the first line of the CSV and
# the names listed below are used as the column labels instead.
col_names = ['question_description','primary_question']#,'question_type','question_description','question_title','answer','answer_date','ministry']
df = pd.read_csv('input_csv/train.csv',names=col_names,skiprows = 1)

In [None]:
# Rows whose question contains any of these literal strings are dropped.
BAD_CHARS = ['(?) ?????']
# Escape each marker so it is matched literally. No capture groups are used:
# pandas warns when a `str.contains` pattern has match groups, and a plain
# alternation of escaped literals selects exactly the same rows.
pat = '|'.join(re.escape(c) for c in BAD_CHARS)
df = df[~df['primary_question'].str.contains(pat)]

In [None]:
from nltk.corpus import stopwords

# NLTK's English stop words followed by the extra terms that are also
# filtered out of the questions in this notebook.
stop_words = stopwords.words('english') + [
    'whether',
    'government',
    'governments',
    'fact',
    'aware',
    'ministry',
    'ministries',
]

In [None]:
# Flatten the question column to a plain list of strings.
data = df.primary_question.values.tolist()
#print(data)
# Collapse every run of whitespace into a single space. The pattern is a raw
# string because '\s' in an ordinary string literal is an invalid escape
# sequence.
data = [re.sub(r'\s+', ' ', sent) for sent in data]
# Remove single quotes/apostrophes.
data = [re.sub("'", "", sent) for sent in data]

In [None]:
def sent_to_words(sentences):
    """Yield one token list per item of `sentences`.

    Each item is converted with str() and tokenized by
    gensim.utils.simple_preprocess with deacc=True (gensim documents `deacc`
    as removing accent marks from the tokens).
    """
    for sentence in sentences:
        tokens = gensim.utils.simple_preprocess(str(sentence), deacc=True)
        yield tokens

# Materialise the generator so the tokenized questions can be reused.
data_words = list(sent_to_words(data))


In [None]:
# Phrase detection: `bigram` is fitted on the tokenized questions, `trigram`
# on the same questions after the bigram model has been applied to them.
bigram = gensim.models.Phrases(
    data_words,
    min_count=5,
    threshold=1000,  # higher threshold fewer phrases.
)
trigram = gensim.models.Phrases(
    bigram[data_words],
    threshold=1000,
)

# Phraser wrappers: a faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
#print(trigram_mod[bigram_mod[data_words[0]]])

In [None]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Return, for each document in `texts`, its tokens that are not stop words.

    Every document is passed through str() and simple_preprocess() again
    before filtering, exactly as before. The module-level `stop_words` list is
    converted to a set once per call so each membership test is a hash lookup
    instead of a scan of the whole list.
    """
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

def make_bigrams(texts):
    """Apply the bigram phraser `bigram_mod` to every document in `texts`."""
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply `bigram_mod` and then `trigram_mod` to every document in `texts`."""
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Return the lemma of every token of every document in `texts`.

    Each document (a list of tokens) is joined with spaces and run through
    the global `nlp` callable; the `lemma_` of every resulting token is kept.

    `allowed_postags` is accepted but currently unused: the part-of-speech
    filter (`if token.pos_ in allowed_postags`) is disabled.
    NOTE(review): `nlp` is not defined in the visible cells -- presumably a
    loaded spaCy pipeline; confirm before calling this function.

    See https://spacy.io/api/annotation
    """
    return [[token.lemma_ for token in nlp(" ".join(sent))] for sent in texts]

<b> For top 20 aspect terms </b>

In [None]:
# Load the question/answer set that the rest of the notebook labels.
df_top20 = pd.read_csv('output_dir/top20_QA_set.csv',engine='python')
# df_top20.head(5)

In [None]:
# Clean and tokenize the questions with the same steps used for the training
# data. The whitespace pattern is a raw string because '\s' in an ordinary
# string literal is an invalid escape sequence.
data_test = df_top20.primary_question.values.tolist()
data_test = [re.sub(r'\s+', ' ', sent) for sent in data_test]
data_test = [re.sub("'", "", sent) for sent in data_test]
data_test_words = list(sent_to_words(data_test))
data_words_nostops_test = remove_stopwords(data_test_words)

# Form phrases (despite the variable name, make_trigrams applies the bigram
# phraser followed by the trigram phraser).
data_words_bigrams_test = make_trigrams(data_words_nostops_test)
string_list = [' '.join(word) for word in data_words_bigrams_test]

# Write one space-joined document per line to test.txt.
with codecs.open('test.txt', 'w','utf-8') as f:
    for item in string_list:
        print(item, file=f)

In [None]:
parser = argparse.ArgumentParser()

# (option strings, keyword arguments) for every command-line option, listed in
# the order in which they are registered on the parser.
_OPTION_SPECS = (
    (("-o", "--out-dir"),
     dict(dest="out_dir_path", type=str, metavar='<str>', required=True,
          help="The path to the output directory")),
    (("-e", "--embdim"),
     dict(dest="emb_dim", type=int, metavar='<int>', default=200,
          help="Embeddings dimension (default=200)")),
    (("-b", "--batch-size"),
     dict(dest="batch_size", type=int, metavar='<int>', default=8,
          help="Batch size (default=8)")),
    (("-v", "--vocab-size"),
     dict(dest="vocab_size", type=int, metavar='<int>', default=9000,
          help="Vocab size. '0' means no limit (default=9000)")),
    (("-as", "--aspect-size"),
     dict(dest="aspect_size", type=int, metavar='<int>', default=14,
          help="The number of aspects specified by users (default=14)")),
    (("--emb",),
     dict(dest="emb_path", type=str, metavar='<str>',
          help="The path to the word embeddings file")),
    (("--epochs",),
     dict(dest="epochs", type=int, metavar='<int>', default=10,
          help="Number of epochs (default=10)")),
    (("-n", "--neg-size"),
     dict(dest="neg_size", type=int, metavar='<int>', default=4,
          help="Number of negative instances (default=4)")),
    (("--maxlen",),
     dict(dest="maxlen", type=int, metavar='<int>', default=0,
          help="Maximum allowed number of words during training. '0' means no limit (default=0)")),
    (("--seed",),
     dict(dest="seed", type=int, metavar='<int>', default=1234,
          help="Random seed (default=1234)")),
    (("-a", "--algorithm"),
     dict(dest="algorithm", type=str, metavar='<str>', default='adam',
          help="Optimization algorithm (rmsprop|sgd|adagrad|adadelta|adam|adamax) (default=adam)")),
    (("--ortho-reg",),
     dict(dest="ortho_reg", type=float, metavar='<float>', default=0.1,
          help="The weight of orthogonol regularizaiton (default=0.1)")),
)

for _flags, _kwargs in _OPTION_SPECS:
    parser.add_argument(*_flags, **_kwargs)

In [None]:
# The notebook has no real command line, so the options are supplied as a
# hard-coded string that is split on whitespace and parsed.
args = parser.parse_args("--emb w2v_embedding --aspect-size 18 -o output_dir --epochs 50 --batch-size 512 --neg-size 1 --algorithm adam --vocab-size 0".split())
out_dir = args.out_dir_path
# Helpers from the local `utils` module; presumably they create the output
# directory and log the parsed arguments -- confirm in utils.
U.mkdir_p(out_dir)
U.print_args(args)

In [None]:
from keras.preprocessing import sequence
import reader as dataset

In [None]:
# Load vocabulary and index sequences from the local `reader` module.
# train_x is returned but not used in the visible cells of this notebook.
vocab, train_x, test_x, overall_maxlen = dataset.fetch_data(vocab_size=args.vocab_size, maxlen=args.maxlen)
# Pad every test sequence to overall_maxlen. No `padding` argument is given,
# so Keras' default applies (pre-padding per the Keras docs); the later
# attention-weight slicing takes the right-hand end of each row accordingly.
test_x = sequence.pad_sequences(test_x, maxlen=overall_maxlen)

In [None]:
from model import create_model
import keras.backend as K
from optimizers import get_optimizer

In [None]:
def max_margin_loss(y_true, y_pred):
    """Return the mean of `y_pred`; `y_true` is ignored.

    NOTE(review): presumably the model's output is already the per-sample
    loss value -- confirm in model.create_model.
    """
    return K.mean(y_pred)

# Build the optimizer and the network from the parsed arguments.
optimizer = get_optimizer(args)
model = create_model(args, overall_maxlen, vocab)

## Load the saved model parameters from the output directory, then compile
## with the loss above (also reported as the metric).
model.load_weights(out_dir + '/model_param_adamlr1')
model.compile(optimizer=optimizer, loss=max_margin_loss, metrics=[max_margin_loss])

In [None]:
# Reverse lookup table: vocabulary index -> word.
vocab_inv = {ind: w for w, ind in vocab.items()}

# Backend function taking the sentence input and the learning-phase flag and
# returning the outputs of the 'att_weights' and 'p_t' layers.
test_fn = K.function(
    [model.get_layer('sentence_input').input, K.learning_phase()],
    [model.get_layer('att_weights').output, model.get_layer('p_t').output],
)

# Run on the padded test sequences with the learning-phase flag set to 0.
att_weights, aspect_probs = test_fn([test_x, 0])


In [None]:
def Sort_Tuple(tup):
    """Return a new list with the items of `tup` ordered by their second
    element, largest first. Items with equal second elements keep their
    original relative order; `tup` itself is not modified.
    """
    ordered = list(tup)
    ordered.sort(key=lambda pair: pair[1], reverse=True)
    return ordered

In [None]:
# For every test sentence: write its words and their attention weights to
# <out_dir>/att_weights_top20, and collect the words ordered by weight.
aspect_word = []      # per sentence: its words, highest attention weight first
list_of_words = []    # per sentence: its words in original order

# `with` closes (and flushes) the output file even if the loop raises; the
# handle was previously left open.
with codecs.open(out_dir + '/att_weights_top20', 'w', 'utf-8') as att_write:
    for c in range(len(test_x)):

        att_write.write('----------------------------------------\n')
        att_write.write(str(c) + '\n')

        # Index 0 is treated as padding and skipped.
        word_index = [i for i in test_x[c] if i != 0]
        line_len = len(word_index)
        # Keep only the last `line_len` weights of the row, i.e. the positions
        # that correspond to the non-padding words.
        weights = att_weights[c]
        weights = weights[(overall_maxlen - line_len):]
        words = [vocab_inv[i] for i in word_index]
        att_write.write(' '.join(words) + '\n')
        list_of_words.append(words)

        set_words = []
        for j in range(len(words)):
            weight = round(weights[j], 3)
            att_write.write(words[j] + ' ' + str(weight) + '\n')
            set_words.append((words[j], weight))

        # Sort once per sentence instead of after every appended word (the
        # stable sort gives the same order). Building the result here also
        # means a sentence without words contributes an empty list rather
        # than reusing the previous sentence's result.
        set_words = Sort_Tuple(set_words)
        aspect_word.append([lis[0] for lis in set_words])

In [None]:
# print(len(aspect_word))
# print(len(df_top20))

In [None]:
def aspectList_toString(Asplist):
    """Return the first element of `Asplist` converted to a string, or an
    empty string when `Asplist` is empty.
    """
    return ' '.join(str(elem) for elem in Asplist[:1])
    

In [None]:
# print(aspect_word[0][0:2])
#sample_list = ['government','prices','gold','aware','spurt','country','last','months']
# print(aspectList_toString(sample_list))


In [None]:
# Label for each aspect index; the position in the list is the index.
cluster_map = dict(enumerate([
    'Banking and Finance',
    'Violence',
    'Power',
    'Information governance',
    'Telecommunication',
    'Education',
    'Railways',
    'Foreign Affairs',
    'Ore',
    'Misc',
    'Commodities',
    'Aviation',
    'Disabled, Children and Women welfare',
    'SME and Tourism',
    'Schemes',
    'Road Infrastructure',
    'Organisation and Companies',
    'Water',
]))

In [None]:
# For each row take the last column of the ascending argsort, i.e. the index
# of the largest value in aspect_probs, then translate it to its label.
label_ids = np.argsort(aspect_probs, axis=1)[:, -1]
predict_labels = [cluster_map[label_id] for label_id in label_ids]

In [None]:
# Attach the predicted label of each row; predict_labels must have one entry
# per row of df_top20.
df_top20['predicted_labels'] = predict_labels

In [None]:
# Drop the first column by position, in place.
# NOTE(review): this is not idempotent -- every re-run of this cell removes
# another column. Presumably the first column is a saved index from the CSV;
# confirm, and consider dropping it by name instead.
df_top20.drop(df_top20.columns[0],axis=1,inplace=True)
#df.to_csv('output_dir/top20_QA_set_with_aspect_labels.csv')
#df_top20

In [None]:
# Remove the 'keywords' column in place (re-running this cell raises KeyError
# because the column is then already gone) and attach, per row, the sentence
# words ordered by attention weight.
df_top20.drop(['keywords'],axis=1,inplace=True)
df_top20['temp_asp_words'] = aspect_word
#df_top20['aspect_words'] = df_top20.temp_asp_words.map(aspectList_toString)
# df_top20

In [None]:
aspectTerms_list = [['banks','psbs','borrowers','write','defaulters','npa','nationalised','nationalized','npas','performing_assets_npas','defaulted','rbi','borrowing','lenders','performing_assets','loans','bank','waived','loan','sbi','lending','nbfcs','corporates','debts','borrowings','rrbs','atms','deposits','credit','waiver',],['naxalites','encounters','civilians','insurgency','naxalite','terrorists','encounter','militants','maoists','firing','killings','bsf','extremists','fighting','crpf','naxals','jawans','militant','naxalism','paramilitary','maoist','troops','innocent','itbp','rifles','killing','attacks','policemen','forces','strikes',],['power','mw','thermal','energy','megawatt','megawatts','ntpc','solar','hydro','reactors','plants','generation','electricity','reactor','generating','nuclear','renewable','ultra_mega','nhpc','umpps','wind','kudankulam','rooftop','hydel','grid','npcil','atomic','fuel','jaitapur','ramagundam',],['constitutional','lokpal','petition','judgement','hearing','petitions','appeal','rajya_sabha','honble','cic','cpc','observations','anomalies','ceo','judgment','legislations','commissions','verdict','reply','constitution','composition','legislative','appeals','court','ordinance','lok_sabha','filing','passed','hon_ble','writing',],['phone','calls','sim','mobile','unsolicited','customers','roaming','signals','phones','sms','networks','mobiles','vodafone','telephones','telephony','portability_mnp','landline','internet','messages','voice','providers','gsm','telecom','cable','towers','operators','subscribers','cellular','misleading_advertisements','computers',],['learning','graduation','taught','students','iits','teacher','universities','iims','affiliated','courses','admissions','textbooks','campuses','cbse','studying','sanskrit','academic','admission','schools','colleges','coaching','nits','degrees','navodaya','pursue','institutes','secondary','exams','teachers','undergraduate',],['railway','railways','trains','rail','tracks','platforms','emu',
'suburban','railwaysa','broad_gauge','junction','train','passenger','shatabdi','divisions','narrow_gauge','rangia','station','gauge_conversion','ticketing','stalls','duronto','compartments','rajdhani','irctc','superfast','coach','coaches','ticket','secunderabad',],['washington','brazil','malaysia','israel','germany','maldives','canada','afghanistan','brics','pact','saarc','russia','asean','uk','korea','kingdom','iran','mauritius','america','negotiations','fta','vietnam','paris','dialogue','bilateral','africa','counterpart','indonesia','ties','summit',],['lignite','bauxite','ore','mines','leases','coal','manganese','mcl','mining','gmdc','mahanadi_coalfields','iron_ore','vedanta','mine','miners','coalfields','minerals','extraction','excavation','mineral','posco','drilling','lease','bccl','captive','stone','sand','blocks','hills','forests',],['years','last','three','year','yearwise','five','two','wise','programmefor','current','months','past','mohlai','rests','fdr','mtcr','saurashtraand','chichira','postgraduates','ucil','inpossession','shatab','ucbs','naicker','sovereignity','corresponding','kalam','tangdar','magwani','interested',],['onion','onions','wheat','rice','pulses','potato','rabi','produces','bumper','soyabean','quintal','paddy','cereals','mustard','edible','tomato','vegetables','edible_oils','dal','sugarcane','maize','horticultural','oilseeds','fruits','grains','kharif','arhar','acreage','sowing','vegetable',],['flights','air','flight','ai','airlines','aircrafts','runway','carrier','carriers','landing','fleet','airports','kingfisher','airline','aircraft','airport','cargo','aai','jet_airways','pilots','jet','flying','minute','privatise','fares','dreamliner','trivandrum','planes','baggage','boarding',],['infants','children','kids','aged','anaemia','destitute','malnutrition','mothers','malnourished','marriage','child','beggars','abused','begging','underweight','married','leprosy','juveniles','prevalence','elderly','female','abuse','trafficked','orphanages','juv
enile','marriages','mortality','adult','hiv','rape',],['handicraft','apparel','handicrafts','leather','textiles','competitiveness','cottage','msme','tourism','textile','promoting','intensive','industry','thrust','attracting','oriented','handloom','agro','cluster','innovation','handlooms','cooperatives','inclusive','fpis','cruise','powerloom','processing','dairy','clusters','exclusive',],['yojana','bima','swarozgar','rsby','ujjwala','pmjdy','sgsy','deen_dayal_upadhyaya','jyoti','nirman','rashtriya_swasthya_bima','iay','pmksy','grameen','jan_dhan','awaas','swarojgar','rajiv','grameen_vidyutikaran','nirmal','awas','swarnjayanti','gramin','saansad_adarsh','sagy','aay','deendayal','sabla','abhiyan','rggvy',],['highway','highways','lane','nh','roads','road','laning','stretches','expressways','nhs','stretch','ring','expressway','lanes','km','bypass','kms','nhai','nhdp','flyovers','length','bridges','widening','toll','golden_quadrilateral','kilometers','toll_plazas','rampur','bridges_robs','bro',],['doctors','specialists','engineers','contractual','unani','specialist','esi','dispensaries','safdarjung_hospital','nursing','empanelled','sciences_aiims','cghs','ayurvedic','deputation','employee','hospital','retired','retirement','employeesa','dispensary','resigned','employees','inducted','ayurveda','esic','hands','quit','homoeopathy','cabin_crew',],['water','groundwater','lakes','contamination','river','ponds','ground','polluted','arsenic','harvesting','effluents','rivers','drinking','rain','tributaries','yamuna','potable','ganga','depletion','fluoride','reservoirs','flood','surface','flows','depleting','soil','toxic','flowing','narmada','glaciers',],]

In [None]:
# Parallel lists of cluster_map's keys and values, used for reverse lookup
# (label -> aspect index).
key_list = list(cluster_map.keys())
val_list = list(cluster_map.values())

def find_aspectTerm_in_aspect(asp_list,pred_lab):
    """Return the first word of `asp_list` that appears in the term list of
    the aspect labelled `pred_lab`, or the string 'NA' when none does.

    `pred_lab` must be one of the values of `cluster_map`; its key is used as
    the index into `aspectTerms_list`.
    """
    aspect_index = key_list[val_list.index(pred_lab)]
    aspect_terms = aspectTerms_list[aspect_index]
    return next((item for item in asp_list if item in aspect_terms), 'NA')
    
# l = ['trade', 'us', 'american', 'countries', 'trade', 'american', 'countries', 'investors', 'order', 'capitalize', 'lull', 'latin', 'proposed', 'open', 'four', 'five', 'centres', 'latin', 'permanent', 'basis', 'create', 'awareness', 'among', 'business', 'people', 'alike', 'indian', 'products']
# val = find_aspectTerm_in_aspect(l,'Foreign Affairs')
# print(val)
# l1 = ['sezs', 'approval', 'principle', 'details', 'notified', 'given', 'country', 'till', 'october', 'state', 'wise']
# val = find_aspectTerm_in_aspect(l1,'Information governance')
# print(val)
# l2 = ['export', 'year', 'envisaged', 'earning', 'targets', 'fiscal']
# val=find_aspectTerm_in_aspect(l2,'Misc')
# print(val)
# # 
# aspect_index
#df_top20.apply(lambda x: find_aspectTerm_in_aspect(x.temp_asp_words, x.predicted_labels), axis=1)

In [None]:
# Pick, for every row, the highest-weighted word that belongs to the term
# list of the row's predicted aspect ('NA' when there is none).
df_top20['aspect_word'] = df_top20.apply(lambda x: find_aspectTerm_in_aspect(x.temp_asp_words, x.predicted_labels), axis=1)
# Drop rows without an aspect word. The sentinel is compared exactly rather
# than with a substring/regex `str.contains('NA')`, which would also discard
# any term that merely contains "NA". dropna() is chained on the filtered
# result instead of being applied in place to it, and the explicit copy makes
# the result an independent frame for the column assignments that follow.
df_top20 = df_top20[df_top20.aspect_word != 'NA'].dropna().copy()
len(df_top20)

<b> Adding polarity labels </b>

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 
def get_questionPolarity(question):
    """Classify `question` as 'positive', 'negative' or 'neutral'.

    The label is derived from VADER's compound score: above 0.2 is
    'positive', below -0.2 is 'negative', anything else is 'neutral'.
    """
    # Build the analyser once and keep it on the function object. Creating a
    # SentimentIntensityAnalyzer reloads its lexicon, which is wasteful when
    # this function is mapped over every row of a DataFrame.
    analyser = getattr(get_questionPolarity, '_analyser', None)
    if analyser is None:
        analyser = get_questionPolarity._analyser = SentimentIntensityAnalyzer()
    score = analyser.polarity_scores(question)['compound']
    if score > 0.2:
        return 'positive'
    if score < -0.2:
        return 'negative'
    return 'neutral'

In [None]:
import regex as re
# from nltk.tokenize import sent_tokenize
# nltk.download('punkt')
# def clean_string(sent):
#     to_remove='&*:;'
#     sent = sent.translate(str.maketrans('','',to_remove))
#     list_of = ['(a)', '(b)', '(c)', '(d)', '(e)','(f)','(i)','(ii)','(iii)']
#     for word in list_of:
#         sent = sent.replace(word,"")
#     _RE_COMBINE_WHITESPACE = re.compile(r"(?a:\s+)")
#     _RE_STRIP_WHITESPACE = re.compile(r"(?a:^\s+|\s+$)")
#     sent = _RE_COMBINE_WHITESPACE.sub(" ", sent)
#     sent = _RE_STRIP_WHITESPACE.sub("", sent)
#     return sent

# def get_polarity(answer):
#     answer = clean_string(answer)
#     article_tokenized = sent_tokenize(answer)
#     analyser = SentimentIntensityAnalyzer()
#     comp_scores=[]
#     neg_score=0
#     pos_score=0
#     total_count=0
#     for i in article_tokenized:
#         #print(analyser.polarity_scores(i))
#         comp_scores.append(analyser.polarity_scores(i)['compound'])
#     #print(comp_scores)
#     for score in comp_scores:
#         if (score > 0.5):
#             pos_score = pos_score+1
#         elif (score < -0.5):
#             neg_score=neg_score+1

#         total_count = total_count+1

#     if(neg_score!=0):
#         return "negative"
#     elif (pos_score!=0 and pos_score/total_count >= 0.15):
#         return "positive"
#     else:
#         return "neutral"
def get_answerPolarity(answer):
    """Classify `answer` as 'positive', 'negative' or 'neutral'.

    All runs of digits are removed from the text first. The label is then
    derived from VADER's compound score: above 0.2 is 'positive', below -0.2
    is 'negative', anything else is 'neutral'.
    """
    answer_text = re.sub(r'\d+', '', answer)
    # Build the analyser once and keep it on the function object. Creating a
    # SentimentIntensityAnalyzer reloads its lexicon, which is wasteful when
    # this function is mapped over every row of a DataFrame.
    analyser = getattr(get_answerPolarity, '_analyser', None)
    if analyser is None:
        analyser = get_answerPolarity._analyser = SentimentIntensityAnalyzer()
    score = analyser.polarity_scores(answer_text)['compound']
    if score > 0.2:
        return 'positive'
    if score < -0.2:
        return 'negative'
    return 'neutral'

In [None]:
# Label every question and every answer as 'positive', 'negative' or
# 'neutral' with the two polarity helpers.
df_top20['question_polarity'] = df_top20.primary_question.map(get_questionPolarity)
df_top20['answer_polarity'] = df_top20.primary_answer.map(get_answerPolarity)

In [None]:
 df_top20

In [None]:
# Save the labelled set. With to_csv's defaults the DataFrame index is
# written as well, as an unnamed first column.
df_top20.to_csv('output_dir/top20_QA_set_with_aspect_labels.csv')

In [None]:
# Independent copy holding only the columns needed for the supervised input.
df_out = df_top20[['primary_question','predicted_labels','aspect_word','question_polarity']].copy()

# Lower-case the three text columns.
for column in ('primary_question', 'predicted_labels', 'aspect_word'):
    df_out[column] = df_out[column].str.lower()

# Encode the polarity label as an integer class id.
sentiment = {'negative': 0,'neutral': 1, 'positive':2}
df_out['question_polarity'] = [sentiment[item] for item in df_out.question_polarity]
# df_out
# df_out

In [None]:
from unicodedata import normalize
# ASCII-only whitespace patterns: collapse runs, and trim both ends.
_RE_COMBINE_WHITESPACE = re.compile(r"(?a:\s+)")
_RE_STRIP_WHITESPACE = re.compile(r"(?a:^\s+|\s+$)")
def aspect_position(question,aspect_word):
    """Normalise `question` and locate the aspect term inside it.

    Parameters:
        question: the question text.
        aspect_word: the aspect term; for a phrase joined with '_' only the
            first word is searched for.

    Returns:
        (question, word, start, end): the normalised question, the word that
        was searched for, and the character span [start, end) of its first
        whole-word occurrence in the normalised question.

    Raises:
        ValueError: if the word does not occur as a whole word in the
            normalised question (previously this surfaced as an
            AttributeError on None).
    """
    # Punctuation that separates words becomes a space ...
    for separator in ('-', '/', '(', ')', ',', '.'):
        question = question.replace(separator, ' ')
    # ... while apostrophes are removed outright.
    question = question.replace('\'','')
    # Detach unit suffixes glued to numbers so they match as whole words.
    question = question.replace('mw',' mw')
    question = question.replace('kms',' kms')
    # Textual residue of a mis-decoded character sequence.
    question = question.replace('<u+0080><u+0099>',' ')
    # Decompose accented characters and drop everything that is not ASCII.
    question = normalize('NFKD', question).encode('ascii','ignore').decode('utf-8')
    question = _RE_COMBINE_WHITESPACE.sub(" ", question)
    question = _RE_STRIP_WHITESPACE.sub("", question)
    # Specific tokens with digits fused to the word are reduced to the word.
    question = question.replace('road20','road')
    question = question.replace('nh200','nh')
    question = question.replace('nh1','nh')

    word = aspect_word.replace('_',' ').split(' ')[0]
    # Escape the term so regex metacharacters in it are matched literally.
    match = re.search(r'\b{0}\b'.format(re.escape(word)), question)
    if match is None:
        raise ValueError(
            'aspect term {!r} not found as a whole word in question {!r}'.format(word, question))
    start = match.start()
    return question, word, start, start + len(word)

In [None]:
# aspect_position returns a 4-tuple per row; zip(*...) transposes those tuples
# into four column-wise sequences that are assigned to the new columns.
df_out['question'],df_out['aspect_term'],df_out['from'],df_out['to'] = zip(*df_out.apply(lambda x: aspect_position(x.primary_question, x.aspect_word), axis=1))

In [None]:
# Remove the intermediate columns and give the remaining ones their final
# names.
df_out = (
    df_out
    .drop(columns=['primary_question', 'aspect_word', 'predicted_labels'])
    .rename(columns={'aspect_term': 'aspect', 'question_polarity': 'sentiment'})
)
# df_out
# df_out

In [None]:
# Write the final table to the supervised model's input directory. With
# to_csv's defaults the DataFrame index is written as an unnamed first column.
df_out.to_csv('../supervised/input_data/input.csv')