In [None]:
!pip install contractions
!pip install nltk
!pip install gensim

import re
import string
from pathlib import Path

import contractions
import nltk
import numpy
import pandas as pd
from gensim.models import FastText
from keras import backend
from keras import regularizers
from keras.layers import Dense, LSTM, Flatten
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.models import model_from_json
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import hashing_trick
from keras.preprocessing.text import one_hot
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from numpy.random import seed
from tensorflow import set_random_seed

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Cleaning the data

The `clean_data()` function reads the file whose path is passed to it and does the following to each line of text:
* Splits the corpus into lines by `\n`, and each line into its decision (`Yes` or `No`) and its sentence by `\t`
* Converts the contents to lower case
* Expands contractions, e.g. converts `you've` to `you have`
* Removes hyperlinks since they can be arbitrary which might throw our model off track
* Removes email addresses
* Removes numbers
* Removes punctuation
* Assigns integer value of `1` to `yes` and `0` to `no` for classification purposes
* Extracts noun-phrases from the file

In [None]:
def _noun_phrases(sentence, breakers):
    '''
    Return the runs of consecutive tokens of `sentence` that contain
    no token from `breakers` (stop words and punctuation marks).
    '''
    # removing all integers from the sentence
    sentence = re.sub(r'\d+', "", sentence)
    # taking care of residual spaces after digit removal
    sentence = sentence.replace("  ", " ")

    phrases, current = [], []
    for token in word_tokenize(sentence):
        if token not in breakers:
            current.append(token)
        elif current:
            phrases.append(' '.join(current))
            current = []
    # a sentence that does not end in a stop word or punctuation mark
    # still has a pending phrase -- keep it instead of dropping it
    if current:
        phrases.append(' '.join(current))
    return phrases


def _clean_tokens(sentence, stop_words, punctuation_table, lemmatizer):
    '''
    Tokenize `sentence` and return its lemmatized, purely alphabetic,
    non-stop-word tokens.
    '''
    stripped = (
        token.lower().translate(punctuation_table)
        for token in word_tokenize(sentence)
    )
    # isalpha() discards numbers and the empty strings left behind
    # by tokens that consisted of punctuation only
    return [
        lemmatizer.lemmatize(token)
        for token in stripped
        if token.isalpha() and token not in stop_words
    ]


def clean_data(file):
    '''
    clean_data() returns the feature set, decision
    and the associated noun-phrases.

    Every line of `file` is expected to hold "<decision><TAB><text>".
    Lines without a tab (for instance the empty string produced by a
    trailing newline) are skipped, and so are observations whose text
    is empty once hyperlinks and email addresses have been removed.

    Parameters
    ----------
    file : path of the UTF-8 encoded corpus to read.

    Returns
    -------
    text : list of token lists (lower-cased, contractions expanded,
        punctuation/numbers/stop words removed, lemmatized).
    target : list of ints, 0 when the decision is "no", 1 otherwise.
    noun_phrases : list of noun-phrase lists.

    The three lists have the same length; index i of each one refers
    to the same observation.
    '''
    # the context manager closes the file even if reading fails
    with open(file, 'r', encoding="utf-8") as handle:
        corpus = handle.read()

    # built once: sets give constant-time membership tests in the loops
    stop_words = set(stopwords.words('english'))
    breakers = stop_words.union(string.punctuation)
    punctuation_table = str.maketrans('', '', string.punctuation)
    lemmatizer = WordNetLemmatizer()

    text, target, noun_phrases = [], [], []
    # splitting the corpus into observations by "\n"
    for line in corpus.split("\n"):
        # splitting the observation into (Y, X) by "\t"
        fields = line.split("\t")
        if len(fields) < 2:
            continue

        decision = fields[0].lower()
        # expand contractions once; every later step works on this text
        sentence = contractions.fix(fields[1].lower())

        phrases = _noun_phrases(sentence, breakers)

        # remove hyperlinks
        sentence = re.sub(r"http\S+", "", sentence)
        # remove email addresses; \S keeps the match inside one
        # whitespace-delimited token so the rest of the text survives
        sentence = re.sub(r"\S+@\S+\.\S+", "", sentence)

        # removing blank observations
        if sentence == '':
            continue

        text.append(
            _clean_tokens(sentence, stop_words, punctuation_table, lemmatizer)
        )
        # assigning unique indices to 'yes' and 'no'
        target.append(0 if decision == "no" else 1)
        noun_phrases.append(phrases)

    return text, target, noun_phrases

In [None]:
# run the cleaning pipeline over the train and the test corpus;
# each call hands back (token lists, integer decisions, noun-phrases)
text_train, target_train, noun_phrases_train = clean_data("train.txt")
text_test, target_test, noun_phrases_test = clean_data("test.txt")

# the documents come back tokenized, so every token list is glued
# back into one space-separated, cleaned sentence
train = list(map(' '.join, text_train))
test = list(map(' '.join, text_test))

## Generating Word Embeddings using FastText

In [None]:
# word embeddings are trained on the tokenized documents of both splits;
# FastText can additionally produce vectors for out-of-vocabulary words
model_ted = FastText(
    text_train + text_test,
    size=1000,
    window=5,
    min_count=5,
    workers=4,
    sg=1,
)

In [None]:
# 10,000 should be enough number of vocabulary items
vocab_size = 10000
# combines the train and test set and hashes every word of every
# sentence to an integer index (this is an index, not a one-hot vector).
# one_hot() hashes with Python's built-in hash(), which is salted per
# process: the word -> index mapping would differ in every new kernel and
# a model saved in one session would see different indices when it is
# reloaded in another. md5 gives the same index for a word on every run.
encoded_docs = [
    hashing_trick(d, vocab_size, hash_function='md5') for d in train+test
]

# sequences are not of equal lengths
# keras requires all vectors to be of equal length
max_length = 131
padded_docs = pad_sequences(encoded_docs[:len(train)], maxlen=max_length)
padded_docs_test = pad_sequences(encoded_docs[len(train):], maxlen=max_length)

## Establishing the Neural Network

In [None]:
# to ensure reproducible results
seed(1)
# the TensorFlow seed depends on whether a GPU is available
if len(backend.tensorflow_backend._get_available_gpus()) == 0:
    print("CPU runtime...")
    set_random_seed(3)
else:
    print("GPU runtime...")
    set_random_seed(2)

# establishing a neural-network:
# embedding lookup -> flatten -> one sigmoid unit for the yes/no decision
# NOTE(review): the original note cites 18 epochs from experimentation,
# while the fit call below runs 20 -- confirm which one is intended
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

# AdaDelta needs no manual tuning of a learning rate and
# appears robust to noisy gradient information,
# different model architecture choices, various data modalities
# and selection of hyperparameters.
model.compile(optimizer="adadelta", loss='mean_squared_error', metrics=['acc'])

# the decisions are plain Python lists; Keras is given explicit arrays
model.fit(padded_docs, numpy.asarray(target_train),
          epochs=20, verbose=1, batch_size=128)

# testing for accuracy on test dataset
print()
loss, accuracy = model.evaluate(padded_docs_test, numpy.asarray(target_test),
                                verbose=1)
print("Accuracy: {0:.2f} %".format(accuracy*100))

# saving the model for future use (best effort: a failure here is
# reported but never aborts the notebook)
try:
    save = input("\nSave Model? (yes=1, no=otherwise): ")
    if save == '1':
        name = input("Save as: ")
        model_json = model.to_json()
        with open("{}.json".format(name), "w") as json_file:
            json_file.write(model_json)
        # serialize weights to HDF5
        model.save_weights("{}.h5".format(name))
        print("Saved {0}.json and {0}.h5 to disk".format(name))
    else:
        print("Model not saved")
except (EOFError, KeyboardInterrupt):
    # no interactive stdin, or the prompt was cancelled
    print("\nModel not saved")
except Exception as err:
    # say why the save failed instead of swallowing the error silently
    print("Model not saved: {}".format(err))

GPU runtime...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Accuracy: 77.72 %


## Loading the model to run on the test dataset

In [None]:
# a model can only be loaded when both its architecture (.json) and its
# weights (.h5) are present, so list the names that have both files.
# Done in Python rather than through `ls`: captured shell output is one
# file per line and is an error message when nothing matches.
models = sorted(
    path.stem
    for path in Path('.').glob('*.json')
    if path.with_suffix('.h5').is_file()
)
try:
    name = input("Load model: {}\n".format(models))
    if name not in models:
        print("Model: {} not available. \nLoading model: model".format(name))
        name = "model"
except (EOFError, KeyboardInterrupt, NotImplementedError):
    # no interactive stdin, or the prompt was cancelled: use the default
    print("Loading model: model")
    name = "model"

Load model: ['model_GPU_acc_80.54']
model_GPU_acc_80.54


In [None]:
# read the serialized architecture; the context manager closes the
# file even if reading raises
with open('{}.json'.format(name), 'r') as json_file:
    loaded_model_json = json_file.read()
# rebuild the (still untrained) network from its JSON description
loaded_model = model_from_json(loaded_model_json)
print("Loaded {}.json from disk".format(name))
# load weights into new model
loaded_model.load_weights("{}.h5".format(name))
print("Loaded {}.h5 from disk".format(name))

Loaded model_GPU_acc_80.54.json from disk
Loaded model_GPU_acc_80.54.h5 from disk


In [None]:
# a model rebuilt from JSON has to be compiled before it can be evaluated
loaded_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# evaluate() returns the loss followed by the requested metrics,
# so the accuracy is the second entry of `score`
score = loaded_model.evaluate(padded_docs_test, target_test, verbose=1)
loaded_accuracy = score[1]*100
print("{0} Accuracy: {1:.2f} %".format(name, loaded_accuracy))

model_GPU_acc_80.54 Accuracy: 80.54 %


## Noun-Phrase Detection Example

In [None]:
text = ['Today is a very great day. Indian politicians are very corrupt']
noun_phrases_eg = list()
# list of stopwords from the english language
stop_words = stopwords.words('english')
# retrieving punctuations from string module
punctuations = [i for i in string.punctuation]
# every stop word or punctuation mark ends the current phrase;
# built once as a set instead of concatenating two lists per token
phrase_breakers = set(stop_words + punctuations)
for sentence in text:
    # "per_noun_phrases" contain all noun-phrases from each sentence
    per_noun_phrases = list()
    # removing all integers from sentences
    sentence = re.sub(r'\d+', "", sentence)
    # taking care of residual spaces after digit removal
    sentence = sentence.replace("  ", " ")
    # tokenizing the sentence into words
    words = nltk.word_tokenize(sentence)
    # "phrase_tokens" collects the words of the phrase being built
    phrase_tokens = []

    for w in words:
        if w not in phrase_breakers:
            phrase_tokens.append(w)
        elif phrase_tokens:
            per_noun_phrases.append(' '.join(phrase_tokens))
            phrase_tokens = []
    # the text may end without a stop word or punctuation mark;
    # keep the phrase that is still pending (here: 'corrupt')
    if phrase_tokens:
        per_noun_phrases.append(' '.join(phrase_tokens))
    noun_phrases_eg.append(per_noun_phrases)
print(noun_phrases_eg)

[['Today', 'great day', 'Indian politicians']]
