# ASAP preprocessing

In [1]:
import pandas as pd
import numpy as np
import re
from collections import Counter
import random
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import PunktSentenceTokenizer
import json

Initialize constants. Documents with num of words in sentence > MAX_WORD_PER_SENTENCE or num of sentences in doc > MAX_SENTENCE_PER_DOC will be discarded. Words that appear in data less than MIN_FREQ_WORD_NUM wiil be changed to '__UNK_WORD__' token

In [2]:
MAX_WORD_PER_SENTENCE = 50
MAX_SENTENCE_PER_DOC = 50
MIN_FREQ_WORD_NUM = 5


Reading dataset

In [3]:
df=pd.read_csv('processed_dataset.csv')
df.drop(df[df['essay_set']==9].index, inplace=True)
df.reset_index(drop=True, inplace=True)

In [4]:
[len(df[df['essay_set']==i]) for i in range(1,9)]

[1783, 1800, 1726, 1770, 1805, 1800, 1569, 723]

Rescaling all scores to 0-60 scale

In [5]:
for i in range(1,9):
    temp = df.loc[df["essay_set"]==i,"score"]
    df.loc[df["essay_set"]==i,"score"] = 60*(temp-np.min(temp))/(np.max(temp)-np.min(temp))

Dividing dataset to labels and documents

In [6]:
data=df['essay']
labels=df['score']
data_np = data.values

## Preprocessing functions

In [7]:
def read(docs):
    """
    Tokenize texts into sentences using nltk.
    
    docs: ndarray with texts, shape=(num_texts,)
    
    returns: documents - list with tokenized texts 
    """
    sent_tokenizer = nltk.tokenize.PunktSentenceTokenizer()
    documents = []
    for doc in docs:
        sentences = sent_tokenizer.tokenize(doc)
        documents.append(sentences)
    return documents


In [8]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z,!?'`]", " ", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [9]:
def split_to_words(documents):
    """
    Tokenize sentence to words/removing text due to predefined constants.
    
    documents: list with tokenized texts (from read function)
    
    returns: new_documents - list with texts tokenized to words
             counter - Counter with frequencies of words
    """
    new_documents = []
    counter = Counter()
    drop=[]
    for i, doc in enumerate(documents):
        document = []
        
        discard = False
        for sentence in doc:
            n_sentence = []
            words = clean_str(sentence).split(" ")
            # if any sentence's length is over  MAX_WORD_PER_SENTENCE,
            # discard the whole document for simplicity
            if len(words) > MAX_WORD_PER_SENTENCE:
                discard = True
                break
            for word in words:
                word = word.strip()
                if word:
                    n_sentence.append(word)
                    counter[word] += 1
            if n_sentence:
                document.append(n_sentence)
        # only accept document that has more than one sentence and less than MAX_SENTENCE_PER_DOC,
        # again, for simplicity's sake
        if 1 < len(document) <= MAX_SENTENCE_PER_DOC and not discard:
            new_documents.append(document)
        else:
            drop.append(i)
    labels.drop(drop,inplace=True)
    return new_documents, counter





In [10]:
def freq(n):
        num = 0
        for k, v in counter.items():
            if v >= n:
                num += 1
        return num
#print('len of vocabulary：%s' % len(counter))
#print('number of frequency more than %d：%s' % (5, freq(5)))


In [11]:
def process_doc(docs_processed,counter):
    """
    Replacing rare words with '__UNK_WORD__' token.
    
    docs_preprocessed: list with texts tokenized to words (from split_to_word function)
    counter - Counter with frequencies of words (from split_to_word function)
    
    """
    for doc_id in range(len(docs_processed)):
        for sen_id in range(len(docs_processed[doc_id])):
            for word_id in range(len(docs_processed[doc_id][sen_id])):
                word = docs_processed[doc_id][sen_id][word_id]
                if counter[word] < 10:
                    docs_processed[doc_id][sen_id][word_id] = '__UNK_WORD__'


In [12]:
def write_vocab(vocab, vocab_file):
    """
    Write vocabulary to file.
    
    vocab: dict (word : word_code)
    vocab_file: string, filename
    """
    with open(vocab_file, 'w') as f:
        for word, index in vocab.items():
            f.write(word+' '+str(index)+'\n')


In [13]:
def pre_process(docs):
    """
    Preprocess dataset/save vocab.
    
    docs: ndarray with texts, shape=(num_texts,)
    
    returns: data_processed - preprocessed dataset
             vocab - vocabulary of all word in preprocessed dataset
    """
    data = read(docs)
    data_processed, counter = split_to_words(data)
    process_doc(data_processed, counter)
    word_index = 0
    vocab = {}
    for doc in data_processed:
        for sen in doc:
            for word in sen:
                if word not in vocab:
                    vocab[word] = word_index
                    word_index += 1

    doc_len = []
    sentence_len = []
    for doc in data_processed:
        doc_len.append(len(doc))
        for sen in doc:
            sentence_len.append(len(sen))
    print('total number of documents: %s' % (len(data_processed)))
    print('max num of document sentences：%s' % max(doc_len))
    print('min num of document sentences：%s' % min(doc_len))
    print('avg num of document sentences：%s' % (float(sum(doc_len))/len(doc_len)))

    print('max num of sentence words：%s' % max(sentence_len))
    print('min num of sentence words：%s' % min(sentence_len))
    print('avg num of sentence words：%s' % (float(sum(sentence_len))/len(sentence_len)))
    
    print('vocab len：%s' % len(vocab))
    
    write_vocab(vocab, 'vocab.txt')
    return data_processed, vocab

all_data, vocab = pre_process(data_np)


total number of documents: 10447
max num of document sentences：50
min num of document sentences：2
avg num of document sentences：12.821479850674836
max num of sentence words：50
min num of sentence words：1
avg num of sentence words：17.159191017275617
vocab len：6068


Encoding every word with unique code

In [14]:
def encode_doc(lst, vocab):
    """
    Encode every word in dataset with its code & pad all texts with zeros.
    
    lst: preprocessed dataset
    vocab - vocabulary of all word in preprocessed dataset
    
    returns: ndarray with encoded & padded data, shape=(num_docs, sentence_max_num, sentence_max_len) 
    """
    sentence_max_len = max([max([len(sen) for sen in doc]) for doc in lst])
    sentence_max_num = max(map(len, lst))
    result = np.zeros([len(lst), sentence_max_num, sentence_max_len], dtype=np.int32)
    for i, row in enumerate(lst):
        for j, col in enumerate(row):
            for k, val in enumerate(col):
                result[i][j][k] = vocab[val]
    return result

In [15]:
encoded=encode_doc(all_data,vocab)

Normalizing labels

In [16]:
np_labels = np.array(labels)
np_labels = (np_labels-np.min(np_labels))/(np.max(np_labels)-np.min(np_labels))

Split and saving

In [17]:
X_train, X_test, y_train, y_test = train_test_split(encoded, np_labels, test_size=0.1, random_state=42)
np.save('x_train',X_train)
np.save('y_train',y_train)
np.save('x_test',X_test)
np.save('y_test',y_test)


## Adding hyperparameters to json file

In [18]:
with open("../configs/config.json", "r") as jsonFile:
    config = json.load(jsonFile)

config["max_sent"] = MAX_SENTENCE_PER_DOC
config["max_word"] = MAX_WORD_PER_SENTENCE
config["vocab_size"] = len(vocab)
config["min_rating"] = 0
config["max_rating"] = 60


with open("../configs/config.json", "w") as jsonFile:
    json.dump(config, jsonFile, indent=4, separators=(',', ': '))