### FINAL DEMO

#### Imports

In [1]:
import pickle
import tensorflow as tf
from tensorflow import keras
import re
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim import models
from transformers import BertForSequenceClassification
from transformers import BertTokenizer
import torch



#### LSTM Model

In [2]:
def essay_to_wordlist(essay_v, remove_stopwords):
    """Remove the tagged labels and word tokenize the sentence."""
    essay_v = re.sub("[^a-zA-Z]", " ", essay_v)
    words = essay_v.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    return (words)


def makeFeatureVec(words, model, num_features):
    """Make Feature Vector from the words list of an Essay."""
    featureVec = np.zeros((num_features,),dtype="float32")
    num_words = 0.
    index2word_set = set(model.index_to_key)
    for word in words:
        if word in index2word_set:
            num_words += 1
            featureVec = np.add(featureVec,model[word])        
    featureVec = np.divide(featureVec,num_words)
    return featureVec

def getAvgFeatureVecs(essays, model, num_features):
    """Main function to generate the word vectors for word2vec model."""
    counter = 0
    essayFeatureVecs = np.zeros((len(essays),num_features),dtype="float32")
    for essay in essays:
        essayFeatureVecs[counter] = makeFeatureVec(essay, model, num_features)
        counter = counter + 1
    return essayFeatureVecs

In [3]:
word2vec_path = 'word2vecmodel.bin'
w2v_model = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

#### BERT Model

In [4]:
output_dir = './saved_model/'

print(output_dir)

./saved_model/


In [5]:
# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained(output_dir)
model_loaded = BertForSequenceClassification.from_pretrained(output_dir)

Loading BERT tokenizer...


In [6]:
def essay_to_sentences(essay):
    """Sentence tokenize the essay and call essay_to_wordlist() for word tokenization."""
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = tokenizer.tokenize(essay.strip())
    return sentences

In [7]:
def checker(sent):
    tokenizer = BertTokenizer.from_pretrained(output_dir)
    model_loaded = BertForSequenceClassification.from_pretrained(output_dir)
    encoded_dict = tokenizer.encode_plus(
                            sent,                      # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            max_length = 64,           # Pad & truncate all sentences.
                            pad_to_max_length = True,
                            return_attention_mask = True,   # Construct attn. masks.
                            return_tensors = 'pt',     # Return pytorch tensors.
                       )

        # Add the encoded sentence to the list.    
    input_id = encoded_dict['input_ids']

        # And its attention mask (simply differentiates padding from non-padding).
    attention_mask = encoded_dict['attention_mask']
    input_id = torch.LongTensor(input_id)
    attention_mask = torch.LongTensor(attention_mask)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_loaded = model_loaded.to(device)
    input_id = input_id.to(device)
    attention_mask = attention_mask.to(device)

    with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model_loaded(input_id, token_type_ids=None, attention_mask=attention_mask)

    logits = outputs[0]
    index = logits.argmax()
    if index == 1:
        return 1
    return 0


In [8]:
def grammar_check(essay):
    sentences = essay_to_sentences(essay)
    n= len(sentences)
    s=0
    for sentence in sentences:
        s+=checker(sentence)
    return (s/n)

In [9]:
Essay = "Dear Local newspaper, I think that people have lately spent too much time on their computers. People who have been doing so are increasing the percentage of obecity. The reason I am writing this to you is because the percentage of obecity is currently increasing, people are spending less time exercising and spending less time interacting with others. First, the percentage of obecity is increasing. The reason is that people just stay indoors, they never feel the need to be fit. This is also risking their lives, all they do is just sit and eat. This could risk the chances of getting deceases."
Essay

'Dear Local newspaper, I think that people have lately spent too much time on their computers. People who have been doing so are increasing the percentage of obecity. The reason I am writing this to you is because the percentage of obecity is currently increasing, people are spending less time exercising and spending less time interacting with others. First, the percentage of obecity is increasing. The reason is that people just stay indoors, they never feel the need to be fit. This is also risking their lives, all they do is just sit and eat. This could risk the chances of getting deceases.'

#### Grammer Check

In [10]:
from datetime import datetime
t0 = datetime.now()
grammar_score=grammar_check(Essay)
print("Grammer Score : ", grammar_score)
t1 = datetime.now()
print('Processing time: {}'.format(t1 - t0))    

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

Grammer Score :  1.0
Processing time: 0:00:09.051919


#### Relevance Check

In [11]:
def relevance(Essay):
    wordlist = essay_to_wordlist( Essay, remove_stopwords=True )
    essayvec = getAvgFeatureVecs( [wordlist], w2v_model, 300 )
    essayvec = np.array(essayvec)
    essayvec = np.reshape(essayvec, (essayvec.shape[0], 1, essayvec.shape[1]))
    lstm_model =tf.keras.models.load_model("./model_weights/final_lstm.h5")
    preds = lstm_model.predict(essayvec)
    preds = np.around(preds)
    return preds[0][0]

In [12]:
relevance_score = relevance(Essay)
print("Relevance Score : ", relevance_score)

Relevance Score :  8.0


#### Final Score

In [13]:
filename = 'finalized_model.sav'
loaded_model = pickle.load(open(filename, 'rb'))
X=[]
X.append(relevance_score)
X.append(grammar_score)
result = loaded_model.predict([X])
print("Final Score : ", result[0])

Final Score :  7.482624107250408
