<a href="https://colab.research.google.com/github/sharonwu827/Deep-Learning/blob/master/Automated_Essay_Scoring_V3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive so files stored there are readable from Colab
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# import package
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns


!pip install transformers
import torch
from transformers import BertTokenizer, BertModel

# Load the pre-trained tokenizer (vocabulary) of the uncased BERT-base checkpoint.
# It is a module-level global that the tokenization helpers further down rely on.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


# Keras functional API
from keras.models import Sequential
from keras.models import Model
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import GlobalMaxPooling1D
from keras.layers import SpatialDropout1D
from keras.layers import Input

import os
import csv
import copy
import nltk
import nltk.tokenize as tk
from sklearn.feature_extraction.text import CountVectorizer
# Make the mounted Drive folder the working directory (requires the Drive mount to have run)
os.chdir('/content/drive/My Drive')
# Download NLTK's 'punkt' tokenizer data
nltk.download('punkt')

!pip install pyenchant
!pip install -U pip setuptools wheel
!pip install -U spacy
!sudo apt-get install libenchant1c2a
import enchant #  Enchant spellchecking library
import spacy
import re
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
import urllib.request
from bs4 import BeautifulSoup

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Reading package lists... Done
Building dependency tree       
Reading state information... Done
libenchant1c2a is already the newest version (1.6.0-11.1).
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.


In [3]:
# Folder on the mounted Drive that holds the ASAP-AES files; edit here to relocate the data
ASAP_DIR = "/content/drive/MyDrive/Colab Notebooks/asap-aes"
# All three splits share the same format: tab-separated, ISO-8859-1 encoded
ASAP_READ_OPTIONS = {"sep": "\t", "encoding": "ISO-8859-1"}

train = pd.read_csv(f"{ASAP_DIR}/training_set_rel3.tsv", **ASAP_READ_OPTIONS)
dev = pd.read_csv(f"{ASAP_DIR}/valid_set.tsv", **ASAP_READ_OPTIONS)
test = pd.read_csv(f"{ASAP_DIR}/test_set.tsv", **ASAP_READ_OPTIONS)

In [4]:
# Lookup table: one row per essay set (1-8) with the prompt text for that set.
# It is later joined onto the essays by `essay_set` and the prompt strings are
# tokenized together with the essays, so the text is kept verbatim.
prompt = pd.DataFrame({"essay_set":[1,2,3,4,5,6,7,8],
                       "prompt":["More and more people use computers, but not everyone agrees that this benefits society. Those who support advances in technology believe that computers have a positive effect on people. They teach hand-eye coordination, give people the ability to learn about faraway places and people, and even allow people to talk online with other people. Others have different ideas. Some experts are concerned that people are spending too much time on their computers and less time exercising, enjoying nature, and interacting with family and friends. Write a letter to your local newspaper in which you state your opinion on the effects computers have on people. Persuade the readers to agree with you.",
                                "Write a persuasive essay to a newspaper reflecting your vies on censorship in libraries. Do you believe that certain materials, such as books, music, movies, magazines, etc., should be removed from the shelves if they are found offensive? Support your position with convincing arguments from your own experience, observations, and/or reading.",
                                "Write a response that explains how the features of the setting affect the cyclist. In your response, include examples from the essay that support your conclusion.",
                                "Write a response that explains why the author concludes the story with this paragraph. In your response, include details and examples from the story that support your ideas.",
                                "Describe the mood created by the author in the memoir. Support your answer with relevant and specific information from the memoir.",
                                "Based on the excerpt, describe the obstacles the builders of the Empire State Building faced in attempting to allow dirigibles to dock there. Support your answer with relevant and specific information from the excerpt.",
                                "Write about patience. Being patient means that you are understanding and tolerant. A patient person experience difficulties without complaining.Do only one of the following: write a story about a time when you were patient OR write a story about a time when someone you know was patient OR write a story in your own way about patience.",
                                "We all understand the benefits of laughter. For example, someone once said, “Laughter is the shortest distance between two people.” Many other people believe that laughter is an important part of any relationship. Tell a true story in which laughter was one element or part."]})

## Data processing

In [19]:
def preprocess(df):
  """
  Add a `normalized_score` column to `df`, modifying the frame in place.

  Every `domain1_score` is divided by the largest `domain1_score` that occurs
  within the same `essay_set`. The function returns nothing.
  """
  per_set_max = df.groupby('essay_set')['domain1_score'].transform('max')
  df['normalized_score'] = df['domain1_score'].div(per_set_max)

# Adds the `normalized_score` column to the training frame (in place; nothing is returned)
preprocess(train)

In [9]:
def clean_anonymization(essay):
  '''
  Remove the anonymization placeholders from an essay.

  The text is split on whitespace, every token that begins with "@" is
  dropped, and the surviving tokens are joined back with single spaces.
  '''
  kept_tokens = [token for token in essay.split() if not token.startswith("@")]
  return ' '.join(kept_tokens)

# Strip the "@..." placeholder tokens from the essays of all three splits.
# The function is handed to `apply` directly; it is called once per essay string.
train['essay'] = train['essay'].apply(clean_anonymization)
dev['essay'] = dev['essay'].apply(clean_anonymization)
test['essay'] = test['essay'].apply(clean_anonymization)

In [18]:
# Preview the first three training rows
train.head(3)

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,rater1_trait1,rater1_trait2,rater1_trait3,rater1_trait4,rater1_trait5,rater1_trait6,rater2_trait1,rater2_trait2,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6,normalized_score
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,,,,,,,,,,,,,,,,,,,0.666667
1,2,1,Dear I believe that using computers will benef...,5,4,,9,,,,,,,,,,,,,,,,,,,,,,0.75
2,3,1,"Dear, More and more people use computers, but ...",4,3,,7,,,,,,,,,,,,,,,,,,,,,,0.583333


## BERT - Sentence Embedding

### Sentence embedding for semantic score

In [12]:
def bert_tokenized(df, max_length=512):
  '''
  Tokenize every essay in `df` and map the tokens to their BERT word IDs.

  Uses the module-level `tokenizer`.

  Parameters
  ----------
  df : DataFrame with an 'essay' column of strings.
  max_length : number of tokens every essay is padded or truncated to.
    Defaults to 512, the value that used to be hard-coded here.

  Returns
  -------
  (input_ids, segment_ids, attention_masks) : three PyTorch tensors, each with
  one row per essay (in row order) and `max_length` columns.
  '''
  # Take the essays by position rather than by label: `df['essay'][i]` is a
  # label lookup and fails as soon as the frame's index is not 0..n-1
  # (for example after filtering rows).
  essays = df['essay'].tolist()

  encoded = tokenizer(essays,
                      add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
                      max_length=max_length,
                      padding='max_length',  # replaces the deprecated pad_to_max_length=True
                      truncation=True,  # explicit, so no "Truncation was not explicitly activated" warning
                      return_token_type_ids=True,
                      return_attention_mask=True,  # differentiates padding from non-padding
                      return_tensors='pt',  # Return pytorch tensors.
                      )
  return encoded['input_ids'], encoded['token_type_ids'], encoded['attention_mask']



def bert_model(input_id,attention_masks,segment_ids, batch_size=32):
  '''
  Run tokenized text through pre-trained BERT and return one embedding per row.

  Parameters
  ----------
  input_id : tensor of token IDs, one row per text.
  attention_masks : tensor that marks real tokens versus padding.
  segment_ids : tensor of token-type IDs.
    NOTE: the positional order here is (input_id, attention_masks, segment_ids),
    which differs from the order in which `bert_tokenized` returns its tensors;
    calling with keyword arguments avoids mixing the two up.
  batch_size : number of rows sent through the model per forward pass. The
    rows are processed in chunks so the whole dataset never has to go through
    the model in a single call.

  Returns
  -------
  Tensor with one row per input row: the last-layer hidden state at token
  position 0 (the '[CLS]' position when the inputs were built with special tokens).
  '''
  # Load pre-trained model (weights). Only the last hidden state is used
  # below, so the per-layer hidden states are not requested.
  model = BertModel.from_pretrained('bert-base-uncased')
  # Put the model in "evaluation" mode: feed-forward only, dropout turned off
  model.eval()

  cls_embeddings = []
  with torch.no_grad():
    for start in range(0, len(input_id), batch_size):
      stop = start + batch_size
      # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
      outputs = model(input_ids = input_id[start:stop],
                      attention_mask = attention_masks[start:stop],
                      token_type_ids = segment_ids[start:stop])
      last_hidden_states = outputs[0]  # the last hidden state
      # keep only the vector at the first token position of every row
      cls_embeddings.append(last_hidden_states[:,0,:])

  if not cls_embeddings:
    # no input rows: return an empty (0, hidden_size) tensor instead of failing in torch.cat
    return torch.empty((0, model.config.hidden_size))

  sentence_embedding = torch.cat(cls_embeddings, dim=0)
  return sentence_embedding

In [13]:
# Tokenize the training essays once and unpack all three tensors; calling
# bert_tokenized separately for each tensor repeated the full pass three times.
input_ids_ss, segment_ids_ss, attention_masks_ss = bert_tokenized(train)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# sentence embedding for semantic score
# Keyword arguments are required for correctness here: bert_model's positional
# order is (input_id, attention_masks, segment_ids), so passing the segment IDs
# second would feed them to BERT as the attention mask.
sentence_embedding_ss = bert_model(input_id=input_ids_ss,
                                   attention_masks=attention_masks_ss,
                                   segment_ids=segment_ids_ss)

### Sentence embedding for prompt-relevance score

In [20]:
# Left-join the prompt text onto every training essay through its essay_set id
train_ps = pd.merge(train, prompt, how='left', on='essay_set')
train_ps.head(3)

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,rater1_trait1,rater1_trait2,rater1_trait3,rater1_trait4,rater1_trait5,rater1_trait6,rater2_trait1,rater2_trait2,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6,normalized_score,prompt
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,,,,,,,,,,,,,,,,,,,0.666667,"More and more people use computers, but not ev..."
1,2,1,Dear I believe that using computers will benef...,5,4,,9,,,,,,,,,,,,,,,,,,,,,,0.75,"More and more people use computers, but not ev..."
2,3,1,"Dear, More and more people use computers, but ...",4,3,,7,,,,,,,,,,,,,,,,,,,,,,0.583333,"More and more people use computers, but not ev..."


In [21]:
def _encode_prompt_essay_pair(row):
  """Tokenize one row's (prompt, essay) pair with the module-level `tokenizer`, padded/truncated to 512 tokens."""
  return tokenizer(row["prompt"], row["essay"],
                   max_length=512,  # Pad & truncate all pairs to this length
                   padding='max_length',  # replaces the deprecated pad_to_max_length=True
                   truncation=True,  # explicit; same 'longest_first' strategy that was applied implicitly
                   add_special_tokens=True,
                   )


def generate_input_ids(df):
  """
  Return the BERT token IDs for one prompt/essay pair.

  Despite its name, `df` is a single row (anything indexable by "prompt" and
  "essay"); the function is meant for `DataFrame.apply(..., axis=1)`.
  The result is a plain Python list because no tensor type is requested.
  """
  return _encode_prompt_essay_pair(df)['input_ids']


def generate_attention_masks(df):
  """
  Return the attention mask for one prompt/essay pair.

  Takes the same single-row argument as `generate_input_ids` and encodes the
  pair the same way, so the mask lines up with those token IDs.
  """
  return _encode_prompt_essay_pair(df)['attention_mask']

In [None]:
# `sub1` is not defined by any earlier cell; the prompt/essay pairs to encode
# are the rows of `train_ps`, which is also where the results are stored.
train_ps['input_ids_ps'] = train_ps.apply(generate_input_ids, axis=1)
train_ps['attention_masks_ps'] = train_ps.apply(generate_attention_masks, axis=1)

In [22]:
# Search each row's input IDs for the first instance of the `[SEP]` token.
def segment_ids(df, column='input_ids_ps', sep_token_id=None):
  """
  Build BERT segment (token-type) IDs for every encoded prompt/essay pair.

  For each list of token IDs, the positions up to and including the first
  `[SEP]` token get 0 (segment A) and every later position gets 1 (segment B).

  Parameters
  ----------
  df : DataFrame holding one sequence of token IDs per row.
  column : name of the column with the token IDs. The default matches the
    column written for `train_ps`; the previous code read 'input_ids' from an
    undefined frame and ignored `df` altogether.
  sep_token_id : ID of the `[SEP]` token. Defaults to `tokenizer.sep_token_id`.

  Returns
  -------
  A list with one list of 0/1 segment IDs per row of `df`, in row order.
  (The previous loop overwrote its result on every iteration, so at most the
  last row's IDs could ever be returned.)

  Raises
  ------
  ValueError if a row contains no `[SEP]` token.
  """
  if sep_token_id is None:
    sep_token_id = tokenizer.sep_token_id

  all_segment_ids = []
  for token_ids in df[column]:
    token_ids = list(token_ids)
    # The number of segment A tokens includes the [SEP] token itself
    num_seg_a = token_ids.index(sep_token_id) + 1
    # The remainder are segment B
    num_seg_b = len(token_ids) - num_seg_a
    # Construct the list of 0s and 1s.
    all_segment_ids.append([0]*num_seg_a + [1]*num_seg_b)
  return all_segment_ids

## LSTM 

In [None]:
def lstm_model(batch_size, pochs, max_words,output_dim,max_phrase_len,input_ids, validation_split, labels=None):
  '''
  Build, train and apply an Embedding -> LSTM -> Dense regressor on token IDs.

  Parameters
  ----------
  batch_size : mini-batch size passed to `fit`.
  pochs : number of training epochs. The (misspelled) name is kept so existing
    calls keep working; previously the value was ignored and an undefined
    `epochs` was read instead.
  max_words : `input_dim` of the embedding layer.
  output_dim : `output_dim` of the embedding layer.
  max_phrase_len : `input_length` of the embedding layer.
  input_ids : token-ID matrix; a tensor exposing `.numpy()` or anything
    `np.asarray` accepts.
  validation_split : fraction of the data `fit` holds out for validation.
  labels : training targets. When omitted, a notebook-level `labels` variable
    is used, as before. The output layer is a sigmoid, so targets are
    presumably expected in [0, 1] (e.g. a normalized score) - TODO confirm.

  Returns
  -------
  The model's predictions for `input_ids`.

  Raises
  ------
  NameError if `labels` is not passed and no notebook-level `labels` exists.
  '''
  epochs = pochs

  if labels is None:
    # Backward compatibility: fall back to the notebook-level variable
    if 'labels' not in globals():
      raise NameError("name 'labels' is not defined; pass labels=... to lstm_model")
    labels = globals()['labels']

  # Convert once so that fit and predict receive the same array
  # (predict used to be handed the raw tensor while fit got `.numpy()`)
  features = input_ids.numpy() if hasattr(input_ids, 'numpy') else np.asarray(input_ids)

  # create the Keras model
  lstm = Sequential()
  lstm.add(Embedding(input_dim = max_words
                     ,output_dim = output_dim
                     ,input_length = max_phrase_len))
  lstm.add(SpatialDropout1D(0.3))
  lstm.add(LSTM(256, dropout = 0.3, recurrent_dropout = 0.3))
  lstm.add(Dense(256, activation = 'relu'))
  lstm.add(Dropout(0.3))

  # To finish off our network, we'll add a standard fully-connected (Dense) layer
  # and an output layer with sigmoid activation:
  lstm.add(Dense(64, activation="relu"))
  lstm.add(Dense(1, activation="sigmoid"))

  # Compile the model to configure the learning process.
  # The loss is mean squared error (a regression setup), so mean absolute error
  # is tracked as the metric; 'accuracy' is not meaningful for continuous targets.
  lstm.compile(optimizer='rmsprop',
               loss='mse',
               metrics=['mae'])

  # summary() prints by itself and returns None, so it is not wrapped in print()
  lstm.summary()

  lstm.fit(features, labels,
           validation_split = validation_split,
           epochs = epochs,
           batch_size = batch_size)
  Score = lstm.predict(features)
  return Score


## Handcrafted Features

In [None]:
def get_correct_and_incorrect_spelling(df, dictionary=None):
  """
  Count misspelled and correctly spelled words in every essay.

  Each essay is split on whitespace; punctuation attached to the start or end
  of a token is stripped before the spell check (otherwise "computers," would
  count as misspelled), and tokens that are punctuation only are skipped.

  Parameters
  ----------
  df : DataFrame with an 'essay' column of strings.
  dictionary : object with a `check(word) -> bool` method. Defaults to the
    Enchant "en_US" dictionary.

  Returns
  -------
  A list with one `(incorrect_count, correct_count)` tuple per essay, in row
  order. Note the order: misspelled words come first.
  """
  import string

  if dictionary is None:
    # load the enchant dictionary
    dictionary = enchant.Dict("en_US")

  output = []
  for essay in df['essay']:
    n_correct = 0
    n_incorrect = 0
    for raw_token in essay.split():
      word = raw_token.strip(string.punctuation)
      if not word:
        # nothing left to check (Enchant also rejects empty strings)
        continue
      if dictionary.check(word):
        n_correct += 1
      else:
        n_incorrect += 1
    output.append((n_incorrect, n_correct))
  return output


def get_lexical_diversity(df, tokenize=None):
    """
    Function that measures lexical diversity, computed here as
    the ratio of total words to unique words, rounded to two decimals.

    Parameters
    ----------
    df : DataFrame with an 'essay' column of strings.
    tokenize : callable turning an essay string into a list of tokens.
        Defaults to NLTK's `word_tokenize`.

    Returns
    -------
    A list with one ratio per essay, in row order. An essay that yields no
    tokens gets 0.0 instead of raising ZeroDivisionError.
    """
    if tokenize is None:
        tokenize = tk.word_tokenize

    diversity = []
    for essay in df['essay']:
        # tokenize once and reuse the result for both counts
        tokens = tokenize(essay)
        if not tokens:
            diversity.append(0.0)
            continue
        diversity.append(round(len(tokens) / float(len(set(tokens))), 2))
    return diversity


def get_list_of_number_of_pos(df, nlp=None):
    """
    Function that parses each essay and counts the POS tag of every token.

    Parameters
    ----------
    df : DataFrame with an 'essay' column of strings.
    nlp : callable that turns a string into an iterable of tokens exposing a
        `pos_` attribute. Defaults to spaCy's 'en_core_web_sm' pipeline, which
        is loaded on every call; pass an already-loaded pipeline to avoid that.
        (Previously the function read a module-level `nlp` that no cell defines.)

    Returns
    -------
    A list with one `(nouns, verbs, adverbs, adjectives)` tuple of counts per
    essay, in row order.
    """
    from collections import Counter

    if nlp is None:
        nlp = spacy.load('en_core_web_sm')

    wanted_tags = ('NOUN', 'VERB', 'ADV', 'ADJ')
    pos = []

    for essay in df['essay']:
        # a Counter returns 0 for tags that never occur in the essay
        tag_counts = Counter(token.pos_ for token in nlp(essay))
        pos.append(tuple(tag_counts[tag] for tag in wanted_tags))

    return pos