In [0]:
#####################################################################################
######################### IMPORTING THE LIBRARIES AND DATASET #######################
#####################################################################################

In [0]:
import pandas as pd
import numpy as np
import codecs
from tqdm import tqdm

# FOR PLOTTING GRAPHS
import matplotlib.pyplot as plt
pd.set_option('display.max_colwidth',300)

# FOR REMOVING SPECIAL CHARACTERS, LINKS, AND EXPANDING WORDS,TAGS
import re
import unicodedata
from bs4 import BeautifulSoup

# FOR STEMMING AND REMOVING STOP WORDS
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer   
from nltk.corpus import wordnet
from nltk import pos_tag

# FOR BUILDING THE EMBEDDING MATRIX AND GENERATING THE SEQUENCES
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence

# FOR THE GOOGLE WORD TO VECTOR WEIGHTS
import gensim

In [0]:
#####################################################################################
##########################     DOWNLOADING AND LOADING     ##########################
#####################################################################################

In [0]:
# IMPORTING THE DATASET
# One CSV per split, read in the order train / test / validation.
train_data, test_data, valid_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./TrainingData.csv",
        "./SubtaskA_Trial_Test_Labeled.csv",
        "./SubtaskA_EvaluationData_labeled.csv",
    )
)

In [4]:
# DOWNLOADING THE PRETRAINED GLOVE WORD TO VECTOR REPRESENTATIONS
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2020-04-07 17:17:27--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-04-07 17:17:27--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-04-07 17:17:28--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2020-0

In [5]:
# Fetch the NLTK resources used below; the stop-word corpus must be downloaded
# before `stopwords.words` can read it.
nltk.download('stopwords')
#### CACHING THE STOP WORDS ONCE AVOIDS RE-READING THE CORPUS FOR EVERY SENTENCE
#### NOTE(review): the only use of this list in cleanData is currently commented out.
cachedStopWords = stopwords.words("english")
nltk.download('wordnet')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')

# Shared by cleanData: the lemmatizer reduces words to their base form and
# `corpus_words` is the set used to keep only known English words.
lemmatizer=WordNetLemmatizer()
corpus_words = set(nltk.corpus.words.words())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [12]:
# PEEKING INTO THE TRAIN DATA: (rows, columns) FOLLOWED BY THE FIRST ROWS
print(train_data.shape)
train_data.head()

(8500, 3)


Unnamed: 0,id,sentence,label
0,663_3,"""Please enable removing language code from the Dev Center ""language history"" For example if you ever selected ""ru"" and ""ru-ru"" laguages and you published this xap to the Store then it causes Tile localization to show the en-us(default) tile localization which is bad.""",1
1,663_4,"""Note: in your .csproj file, there is a SupportedCultures entry like this: <SupportedCultures>de-DE;ru;ru-RU </SupportedCultures> When I removed the ""ru"" language code and published my new xap version, the old xap version still remains in the Store with ""Replaced and unpublished"".""",0
2,664_1,"""Wich means the new version not fully replaced the old version and this causes me very serious problems: 1.""",0
3,664_2,"""Some of my users will still receive the old xap version of my app.""",0
4,664_3,"""The store randomly gives the old xap or the new xap version of my app.""",0


In [13]:
# PEEKING INTO THE TEST DATA: (rows, columns) FOLLOWED BY THE FIRST ROWS
print(test_data.shape)
test_data.head()

(592, 3)


Unnamed: 0,id,sentence,label
0,1310_1,"I'm not asking Microsoft to Gives permission like Android so any app can take my data, but don't keep it restricted like iPhone.",1
1,1312_1,somewhere between Android and iPhone.,0
2,1313_1,And in the Windows Store you can flag the App [Requires Trust] for example.,0
3,1313_2,"Many thanks Sameh Hi, As we know, there is a lot of limitations is WP8 OS due the high security in the OS itself which is very good, but some time we need to allow some apps to do extra works, apps which we trust i.e: hotmail app, facebook app, skype app ....",0
4,1313_3,"The idea is that we can develop a regular app and we request our permissions in the manifest, OR the app can ASK FOR TRUST_�_ more",1


In [14]:
# PEEKING INTO THE VALIDATION DATA: (rows, columns) FOLLOWED BY THE FIRST ROWS
print(valid_data.shape)
valid_data.head()

(833, 3)


Unnamed: 0,id,sentence,label
0,9566,This would enable live traffic aware apps.,0
1,9569,Please try other formatting like bold italics shadow to distinguish titles/subtitles from content.,1
2,9576,Since computers were invented to save time I suggest we be allowed to upload them all in one zip file - using numbering for the file names and the portal could place them in the right order.,1
3,9577,Allow rearranging if the user wants to change them!,1
4,9579,Add SIMD instructions for better use of ARM NEON instructions for math and games.,1


In [0]:
#####################################################################################
##########################        CLEANING THE DATA        ##########################
#####################################################################################

In [0]:
# Lookup table from an English contraction to its expanded form.
# Both "I..." and "i..." spellings are listed so that lookups succeed before
# and after lower-casing.
CONTRACTION_MAP = {
    "ain't": "is not", "aren't": "are not", "can't": "cannot", "can't've": "cannot have",
    "'cause": "because", "could've": "could have", "couldn't": "could not",
    "couldn't've": "could not have", "didn't": "did not", "doesn't": "does not",
    "don't": "do not", "hadn't": "had not", "hadn't've": "had not have",
    "hasn't": "has not", "haven't": "have not", "he'd": "he would",
    "he'd've": "he would have", "he'll": "he will",
    # Fixed: the expansion previously read "he he will have".
    "he'll've": "he will have",
    "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will",
    "how's": "how is", "I'd": "I would", "I'd've": "I would have", "I'll": "I will",
    "I'll've": "I will have", "I'm": "I am", "I've": "I have", "i'd": "i would",
    "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have", "i'm": "i am",
    "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have",
    "it'll": "it will", "it'll've": "it will have", "it's": "it is", "let's": "let us",
    "ma'am": "madam", "mayn't": "may not", "might've": "might have",
    "mightn't": "might not", "mightn't've": "might not have", "must've": "must have",
    "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not",
    "needn't've": "need not have", "o'clock": "of the clock", "oughtn't": "ought not",
    "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not",
    "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have",
    "she'll": "she will", "she'll've": "she will have", "she's": "she is",
    "should've": "should have", "shouldn't": "should not",
    "shouldn't've": "should not have", "so've": "so have", "so's": "so as",
    "that'd": "that would", "that'd've": "that would have", "that's": "that is",
    "there'd": "there would", "there'd've": "there would have", "there's": "there is",
    "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
    "they'll've": "they will have", "they're": "they are", "they've": "they have",
    "to've": "to have", "wasn't": "was not", "we'd": "we would",
    "we'd've": "we would have", "we'll": "we will",
    # Fixed: this value was split across a line break inside the string literal.
    "we'll've": "we will have",
    "we're": "we are", "we've": "we have", "weren't": "were not",
    "what'll": "what will", "what'll've": "what will have", "what're": "what are",
    "what's": "what is", "what've": "what have", "when's": "when is",
    "when've": "when have", "where'd": "where did", "where's": "where is",
    "where've": "where have", "who'll": "who will", "who'll've": "who will have",
    "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have",
    "will've": "will have", "won't": "will not", "won't've": "will not have",
    "would've": "would have", "wouldn't": "would not",
    "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
    "y'all'd've": "you all would have", "y'all're": "you all are",
    "y'all've": "you all have", "you'd": "you would", "you'd've": "you would have",
    "you'll": "you will", "you'll've": "you will have", "you're": "you are",
    "you've": "you have",
}

In [0]:
def cleanData(data):
    """Clean a pandas Series of raw sentences and return the lemmatised text.

    Steps, in order: strip accents, lower-case, expand contractions, drop a
    leading/trailing double quote, strip markup tags, blank out links and
    every character that is not a letter or '#', collapse spaces, remove
    consecutively repeated words, drop tokens of two characters or fewer,
    lemmatise each token and keep only lemmas found in ``corpus_words``.

    Relies on the module-level ``CONTRACTION_MAP``, ``lemmatizer`` and
    ``corpus_words``.

    Parameters
    ----------
    data : pandas.Series
        Raw sentences. Missing values are treated as empty strings.

    Returns
    -------
    pandas.Series
        One space-joined string of cleaned tokens per input row.
    """

    ## REMOVING ASCENTED CHARACTERS LIKE é
    def removeAscentedCharacters(text):
        # NFKD separates a letter from its accent; the ASCII round-trip then
        # discards the accent (and any other non-ASCII character).
        return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    ## EXPANDING THE SHORT WORDS:
    # Regex alternation is ordered, so the longest contractions must come
    # first; otherwise "can't" matches before "can't've" gets a chance.
    # The pattern is compiled once here instead of once per sentence.
    contractions_pattern = re.compile(
        '({})'.format('|'.join(
            re.escape(key) for key in sorted(CONTRACTION_MAP, key=len, reverse=True))),
        flags=re.IGNORECASE | re.DOTALL)

    def expandContractions(text):
        def expand_match(contraction):
            match = contraction.group(0)
            expanded = CONTRACTION_MAP.get(match) or CONTRACTION_MAP.get(match.lower())
            if not expanded:
                # No entry for this spelling: leave the text untouched.
                return match
            # Only carry over the capitalisation of the match. Splicing the
            # matched first character onto the expansion would corrupt
            # entries whose expansion starts with a different letter
            # ("ain't" -> "as not", "'cause" -> "'ecause").
            if match[0].isupper():
                return expanded[0].upper() + expanded[1:]
            return expanded

        expanded_text = contractions_pattern.sub(expand_match, text)
        # Any apostrophe still present is dropped.
        return expanded_text.replace("'", "")

    ## REMOVING FRONT AND BACK INVERTED COMMAS
    def removeIC(text):
        if len(text) >= 2:
            if text[0] == '"':
                text = text[1:]
            if text[-1] == '"':
                text = text[:-1]
        return text

    ## REMOVING TAGS
    def remove_tags(text):
        # Naming the parser keeps results identical across machines and
        # avoids the "no parser was explicitly specified" warning.
        soup = BeautifulSoup(text, 'html.parser')
        return soup.get_text()

    def deEmojify(inputString):
        return inputString.encode('ascii', 'ignore').decode('ascii')

    def removeSpaces(text):
        text = re.sub(' +', ' ', text)
        # startswith is safe on an empty string, unlike text[0].
        if text.startswith(' '):
            text = text[1:]
        return text

    def get_simple_pos(tag):
        """Map a Penn Treebank tag to the WordNet POS constant (noun by default)."""
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN

    # OUR STEMMING FUNCTION
    def stem(words):
        """Lemmatise each word and return the lower-cased lemmas."""
        output_words = []
        for position, w in enumerate(words):
            if position == 0:
                w = w.lower()
            # NOTE(review): each word is tagged on its own, without sentence
            # context; kept as-is so the resulting vocabulary does not change.
            pos = pos_tag([w])
            simple_pos = get_simple_pos(pos[0][1])
            clean_word = lemmatizer.lemmatize(w, simple_pos)
            output_words.append(clean_word.lower())
        return output_words

    def stemmizeSentence(sentence):
        """Lemmatise a token list and keep only lemmas present in corpus_words."""
        return [lemma for lemma in stem(sentence) if lemma in corpus_words]

    # Missing sentences would otherwise crash the string helpers below.
    data = data.fillna('').astype(str)

    print('REMOVING ASCENTED CHARACTERS...')
    cleaned = data.apply(removeAscentedCharacters)
    print('NORMALIZING THE SENTENCE CASE...')
    cleaned = cleaned.apply(lambda x: x.lower())
    print('EXPANDING CONTRACTIONS...')
    cleaned = cleaned.apply(expandContractions)
    print('REMOVING IC...')
    cleaned = cleaned.apply(removeIC)
    print('REMOVING TAGS...')
    cleaned = cleaned.apply(remove_tags)
    print('REMOVING LINKS...')
    # regex=True is required: newer pandas treats the pattern as a literal
    # string by default, which would silently turn this step into a no-op.
    cleaned = cleaned.str.replace(
        r"(https?:\/\/)(\s)*(www\.)?(\s)*((\w|\s)+\.)*([\w\-\s]+\/)*([\w\-]+)((\?)?[\w\s]*=\s*[\w\%&]*)*",
        " ", regex=True)
    print('REMOVING SPECIAL CHARACTERS...')
    # Blanks out quoted text, parenthesised text, tags, HTML entities and
    # every character that is not a letter or '#'.
    cleaned = cleaned.str.replace(
        r'".*?"|\(.*?\)|<.*>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|[^a-zA-Z#]',
        " ", regex=True)
    print('REMOVING EMOJIS...')
    cleaned = cleaned.apply(deEmojify)
    print('REMOVING UNNECCESSARY SPACES...')
    cleaned = cleaned.apply(removeSpaces)
    print('REMOVING THE CONSECUTIVELY REPEATED WORDS...')
    cleaned = cleaned.apply(lambda x: re.sub(r'\b(.+)\s+\1\b', r'\1', x))
    print('REMOVING STOP WORDS...')
    tokenized_sentence = cleaned.apply(lambda x: x.split())
    # Only tokens of one or two characters are dropped here; the stop-word
    # filter itself is disabled (see the commented line below).
    tokenized_sentence = tokenized_sentence.apply(lambda sentence: [word for word in sentence if len(word) > 2])
    # tokenized_sentence = tokenized_sentence.apply( lambda sentence: [word for word in sentence if word not in cachedStopWords] )
    print('TOKENIZING AND STEMMING...')
    tokenized_sentence = tokenized_sentence.apply(stemmizeSentence)
    print('FINALIZING THE DATA')
    detokenized = tokenized_sentence.apply(lambda x: ' '.join(x))

    return detokenized

In [24]:
# Clean the training sentences, then show a few results and the row count.
x_train = cleanData(train_data['sentence'])
print(x_train.head())
print(x_train.shape)

REMOVING ASCENTED CHARACTERS...
NORMALIZING THE SENTENCE CASE...
EXPANDING CONTRACTIONS...
REMOVING IC...
REMOVING TAGS...


  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


REMOVING LINKS...
REMOVING SPECIAL CHARACTERS...
REMOVING EMOJIS...
REMOVING UNNECCESSARY SPACES...
REMOVING THE CONSECUTIVELY REPEATED WORDS...
REMOVING STOP WORDS...
TOKENIZING AND STEMMING...
FINALIZING THE DATA
0    please enable remove language code from the dev center for example you ever select and and you publish this the store then cause tile localization show the tile localization which bad
1                                                    note your file there entry like this when remove the language code and publish new version the old version still remains the store with
2                                                                                                 mean the new version not fully replace the old version and this cause very serious problem
3                                                                                                                                               some user will still receive the old version
4                            

In [25]:
# Clean the validation sentences, then show a few results and the row count.
x_valid = cleanData(valid_data['sentence'])
print(x_valid.head())
print(x_valid.shape)

REMOVING ASCENTED CHARACTERS...
NORMALIZING THE SENTENCE CASE...
EXPANDING CONTRACTIONS...
REMOVING IC...
REMOVING TAGS...
REMOVING LINKS...
REMOVING SPECIAL CHARACTERS...
REMOVING EMOJIS...
REMOVING UNNECCESSARY SPACES...
REMOVING THE CONSECUTIVELY REPEATED WORDS...
REMOVING STOP WORDS...
TOKENIZING AND STEMMING...
FINALIZING THE DATA
0                                                                                                                   this would enable live traffic aware
1                                                                       please try other format like bold shadow distinguish title subtitle from content
2    since computer be invent save time suggest allow them all one zip file use number for the file name and the portal could place them the right order
3                                                                                                              allow rearrange the user want change them
4                                                 

In [26]:
# Clean the test sentences, then show a few results and the row count.
x_test = cleanData(test_data['sentence'])
print(x_test.head())
print(x_test.shape)

REMOVING ASCENTED CHARACTERS...
NORMALIZING THE SENTENCE CASE...
EXPANDING CONTRACTIONS...
REMOVING IC...
REMOVING TAGS...
REMOVING LINKS...
REMOVING SPECIAL CHARACTERS...
REMOVING EMOJIS...
REMOVING UNNECCESSARY SPACES...
REMOVING THE CONSECUTIVELY REPEATED WORDS...
REMOVING STOP WORDS...
TOKENIZING AND STEMMING...
FINALIZING THE DATA
0                                                              not ask give permission like android any can take data but not keep restrict like
1                                                                                                                  somewhere between android and
2                                                                                and the window store you can flag the require trust for example
3    many thanks know there lot limitation due the high security the itself which very good but some time need allow some extra work which trust
4                                           the idea that can develop regular and 

In [0]:
# Target labels for each split, taken from the 'label' column of the raw frames.
y_train=train_data['label']
y_valid=valid_data['label']
y_test=test_data['label']

In [0]:
#####################################################################################
########################## BUILDING THE EMBEDDING MATRIX   ##########################
#####################################################################################

In [29]:
# BUILDING VOCABULARY FROM THE SENTENCES
# The token lists give the longest cleaned training sentence, which is the
# length the padded input sequences need to accommodate.
tokenized = x_train.apply(str.split)
mxlen = max(map(len, tokenized), default=0)
print('MAX LEN', mxlen)

MAX LEN 126


In [30]:
# Flatten the per-sentence token lists, then keep the distinct words in sorted order.
all_words = []
for tokens in tokenized:
    all_words.extend(tokens)
vocab = sorted(set(all_words))
vocab_train_len = len(vocab)
print('VOCAB SIZE', vocab_train_len)

VOCAB SIZE 3819


In [0]:
# MAX LEN OF AN INPUT SEQUENCE (equals the 'MAX LEN' printed for the training data)
MXSEQLEN=126
# Upper bound on the number of rows in the embedding matrix
MAX_NB_WORDS = 100000
# Width of each word vector; matches the glove.6B.300d file that is loaded
EMBEDDING_DIM = 300

In [32]:
# BUILDING TOKENIZER FROM THE TRAINING DATA
# Keras only keeps words whose index is strictly below `num_words`, and word
# indices start at 1, so the cap has to be vocabulary size + 1. Passing the
# bare vocabulary size silently drops the least frequent word from every
# generated sequence.
tokenizer = Tokenizer(num_words=vocab_train_len + 1, lower=True, char_level=False)
tokenizer.fit_on_texts(x_train.tolist())
print('Found %s unique tokens.' % len(tokenizer.word_index))

Found 3819 unique tokens.


In [33]:
# Form the integer sequences that will be the input to the network.
# Pad or truncate every sequence to MXSEQLEN so they all have equal length
# (the printed output shows the zero padding on the left).
train_word_index= tokenizer.word_index
train_sequence = tokenizer.texts_to_sequences(x_train.tolist())
train_sequence = sequence.pad_sequences(train_sequence, maxlen=MXSEQLEN)
print(train_sequence)

[[   0    0    0 ...  804   37  341]
 [   0    0    0 ...    1   35    9]
 [   0    0    0 ...   78  805   84]
 ...
 [   0    0    0 ...   37 1192   12]
 [   0    0    0 ...  530  740   18]
 [   0    0    0 ...    2   29  137]]


In [34]:
 # TEST SEQUENCE BUILT FROM THE SAME TRAINING VOCABULARY (tokenizer fitted on x_train only)
test_sequence = tokenizer.texts_to_sequences(x_test.tolist())
test_sequence = sequence.pad_sequences(test_sequence, maxlen=MXSEQLEN)
print(test_sequence)

[[   0    0    0 ...  279 1092   17]
 [   0    0    0 ...  237  125    2]
 [   0    0    0 ...  731    3   85]
 ...
 [   0    0    0 ...  342    3  236]
 [   0    0    0 ...    1  264  272]
 [   0    0    0 ...  175    1  351]]


In [35]:
# VALIDATION SEQUENCE BUILT FROM THE SAME TRAINING VOCABULARY (tokenizer fitted on x_train only)
valid_sequence = tokenizer.texts_to_sequences(x_valid.tolist())
valid_sequence = sequence.pad_sequences(valid_sequence, maxlen=MXSEQLEN)
print(valid_sequence)

[[   0    0    0 ...  292 1624 1363]
 [   0    0    0 ... 1362   19  131]
 [   0    0    0 ...    1  106  383]
 ...
 [   0    0    0 ...   11   78  823]
 [   0    0    0 ...    6   60   83]
 [   0    0    0 ...    8  190  844]]


In [46]:
print('loading word embeddings...')
# Maps each token to its float32 vector. Every line of the file is a token
# followed by its space-separated vector components.
embeddings_index = {}
# The context manager closes the file even if a malformed line raises
# part-way through the read.
with codecs.open('./glove.6B.300d.txt', encoding='utf-8') as f:
    for line in tqdm(f):
        values = line.rstrip().rsplit(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('found %s word vectors' % len(embeddings_index))

900it [00:00, 8999.17it/s]

loading word embeddings...


400000it [00:47, 8477.55it/s]

found 400000 word vectors





In [47]:
print('preparing embedding matrix...')
# Row `row` of the matrix receives the pretrained vector of the word whose
# tokenizer index is `row`. Rows that are never assigned stay all-zeros, which
# is what happens for words missing from the embedding index.
words_not_found = []
nb_words = min(MAX_NB_WORDS, len(train_word_index) + 1)
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for token, row in train_word_index.items():
    if row < nb_words:
        vector = embeddings_index.get(token)
        if vector is None or len(vector) == 0:
            words_not_found.append(token)
        else:
            embedding_matrix[row] = vector
# Counts rows whose components sum to zero.
print('number of null word embeddings: %d' % np.count_nonzero(embedding_matrix.sum(axis=1) == 0))

preparing embedding matrix...
number of null word embeddings: 35


In [48]:
# Show ten randomly drawn words that have no pretrained vector (the draw can
# repeat a word, as the printed sample shows), then the matrix shape.
# NOTE(review): no random seed is set, so the sample differs between runs.
print("sample words not found: ", np.random.choice(words_not_found, 10))
print(embedding_matrix.shape)

sample words not found:  ['productid' 'openable' 'unhandled' 'connectable' 'productid' 'unpinned'
 'unpinned' 'semitransparency' 'connectable' 'unhide']
(3820, 300)


In [0]:
import os
import pickle

# Everything a downstream training notebook needs: padded sequences, labels,
# the embedding weights and the constants they were built with.
variables = {
    'train_sequence' : train_sequence,
    'test_sequence' : test_sequence,
    'valid_sequence' :valid_sequence,
    'y_train':y_train,
    'y_test':y_test,
    'y_valid':y_valid,
    'train_embedding_weights':embedding_matrix,
    'EMBEDDING_DIM':EMBEDDING_DIM,
    'MXSEQLEN':MXSEQLEN,
    'train_word_index':train_word_index
}

# A fresh environment has no ./Data directory, so create it first. The
# context manager guarantees the pickle is flushed and the handle closed.
os.makedirs('./Data', exist_ok=True)
with open('./Data/variablesGloVe126', 'wb') as output_file:
    pickle.dump(variables, output_file)