In [1]:
import pandas as pd
import numpy as np
import re
from collections import Counter
import random
from sklearn.model_selection import train_test_split

In [2]:
# A document is discarded if any of its sentences has more than this many tokens.
MAX_WORD_PER_SENTENCE = 40
# A document is discarded if it has more than this many non-empty sentences.
MAX_SENTENCE_PER_DOC = 40
# NOTE(review): not referenced by any visible cell; process_doc uses a rare-word
# cutoff of 10 instead — confirm which value is intended.
MIN_FREQ_WORD_NUM = 5


In [3]:
import nltk
from nltk.tokenize import PunktSentenceTokenizer

In [4]:
# Load the essay table; later cells read its 'essay_id', 'essay_set', 'essay' and 'score' columns.
df=pd.read_csv('processed_dataset.csv')

In [5]:
# Number of essays in each of the essay sets 1..8.
[int((df['essay_set'] == set_id).sum()) for set_id in range(1, 9)]

[1783, 1800, 1726, 1770, 1805, 1800, 1569, 723]

In [6]:
# Work on essay set 1 only, keeping just the essay text and its score.
test = df.loc[df['essay_set'] == 1].drop(columns=['essay_id', 'essay_set'])
data = test['essay']
labels = test['score']

# Snapshot the unfiltered scores to disk before any document filtering happens.
labels_np = labels.values
np.save('labels1.npy', labels_np)

type(labels)

pandas.core.series.Series

In [7]:
# Snapshot the unfiltered essay texts to disk; data_np is what pre_process consumes later.
# NOTE(review): presumably an object-dtype array of strings, in which case loading
# 'data1.npy' back needs allow_pickle=True — confirm.
data_np=data.values
np.save('data1.npy',data_np)
len(data)

1783

In [8]:
# Character-level cleanup of every raw essay; the rules are applied in order.
# NOTE(review): neither `sent_tokenizer` nor the `documents` list built here is
# read by the later cells visible in this notebook (pre_process is called on
# data_np) — confirm whether this cell is still needed.
sent_tokenizer = PunktSentenceTokenizer()
cleanup_rules = [
    (r"['`]", ""),                 # drop apostrophes / backticks
    (r"[^A-Za-z(),!?.]", " "),     # blank out everything but letters and kept punctuation
    (r",", " , "),
    (r"!", " ! "),
    (r"\(", " ( "),
    (r"\)", " ) "),
    (r"\?", " ? "),
    (r"\s{2,}", " "),              # collapse whitespace runs
]
documents = []
for i, string in enumerate(data_np):
    for pattern, replacement in cleanup_rules:
        string = re.sub(pattern, replacement, string)
    string = string.strip().lower()
    documents.append(np.array(string))


In [9]:
def read(docs):
    """Split every document into a list of sentence strings.

    Args:
        docs: iterable of document strings.

    Returns:
        A list with one entry per document, each entry being the list of
        sentences produced by NLTK's ``PunktSentenceTokenizer``.
    """
    splitter = nltk.tokenize.PunktSentenceTokenizer()
    return [splitter.tokenize(text) for text in docs]




In [10]:
def split_to_words(documents):
    """Tokenize sentence-split documents into words and filter out unusable ones.

    Every sentence is normalised with ``clean_str`` and split on single spaces.
    A document is kept only when none of its sentences has more than
    ``MAX_WORD_PER_SENTENCE`` tokens and it ends up with more than one and at
    most ``MAX_SENTENCE_PER_DOC`` non-empty sentences.

    Side effect: rows of the module-level ``labels`` Series that belong to
    rejected documents are dropped in place so that ``labels`` stays aligned
    with the returned documents. Rows are selected by *position*, so the
    Series does not need a 0..n-1 index (assumes its index labels are unique).

    Note: word frequencies also include words from documents that are later
    rejected, because counting happens before the accept/reject decision.

    Args:
        documents: iterable of documents, each an iterable of sentence strings.

    Returns:
        ``(new_documents, counter)`` where ``new_documents`` is a list of
        documents (lists of sentences, each a list of word strings) and
        ``counter`` is a ``collections.Counter`` of word frequencies.

    Raises:
        ValueError: if documents must be dropped but ``labels`` does not have
            one row per input document (e.g. the function was already run
            once against the same ``labels``).
    """
    new_documents = []
    counter = Counter()
    drop = []
    n_seen = 0
    for i, doc in enumerate(documents):
        n_seen = i + 1
        document = []
        discard = False
        for sentence in doc:
            words = clean_str(sentence).split(" ")
            # If any sentence is longer than MAX_WORD_PER_SENTENCE,
            # discard the whole document for simplicity.
            if len(words) > MAX_WORD_PER_SENTENCE:
                discard = True
                break
            n_sentence = []
            for word in words:
                word = word.strip()
                if word:
                    n_sentence.append(word)
                    counter[word] += 1
            if n_sentence:
                document.append(n_sentence)
        # Only accept documents with more than one sentence and at most
        # MAX_SENTENCE_PER_DOC sentences, again for simplicity's sake.
        if 1 < len(document) <= MAX_SENTENCE_PER_DOC and not discard:
            new_documents.append(document)
        else:
            drop.append(i)
    if drop:
        if len(labels) != n_seen:
            raise ValueError(
                'labels has %d rows but %d documents were given; '
                'rebuild labels before re-running split_to_words'
                % (len(labels), n_seen))
        # `drop` holds positions, so translate them to index labels first.
        labels.drop(labels.index[drop], inplace=True)
    return new_documents, counter


def clean_str(string):
    """Normalise a piece of text for whitespace tokenisation.

    Keeps only ASCII letters, commas, '!', '?', apostrophes and backticks,
    pads ',', '!' and '?' with spaces, collapses whitespace runs, then strips
    and lower-cases the result.

    Adapted from
    https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    cleaned = string
    for pattern, replacement in (
        (r"[^A-Za-z,!?'`]", " "),
        (r",", " , "),
        (r"!", " ! "),
        (r"\?", " ? "),
        (r"\s{2,}", " "),
    ):
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip().lower()



In [11]:
def freq(n, word_counts=None):
    """Return how many distinct words occur at least ``n`` times.

    Args:
        n: minimum frequency (inclusive).
        word_counts: mapping of word -> count (e.g. a ``collections.Counter``).
            When omitted, the module-level name ``counter`` is used, which is
            how this function originally worked; that name must then exist.

    Returns:
        The number of entries whose count is ``>= n``.
    """
    if word_counts is None:
        # Backward-compatible fallback to the notebook-global counter.
        word_counts = counter
    return sum(1 for count in word_counts.values() if count >= n)
#print('number of vocabulary：%s' % len(counter))
#print('number of frequency more than %d：%s' % (5, freq(5)))


In [12]:
def process_doc(docs_processed, counter, min_freq=10, unk_token='__UNK_WORD__'):
    """Replace rare words with an unknown-word token, in place.

    Args:
        docs_processed: list of documents, each a list of sentences, each a
            mutable list of word strings. Modified in place.
        counter: mapping of word -> frequency, indexed as ``counter[word]``
            (a ``collections.Counter`` yields 0 for unseen words).
        min_freq: words with a frequency strictly below this are replaced.
            Defaults to 10, the value this function has always used; the
            notebook constant ``MIN_FREQ_WORD_NUM`` is not consulted.
        unk_token: replacement string for rare words.

    Returns:
        None.
    """
    for doc in docs_processed:
        for sentence in doc:
            for word_id, word in enumerate(sentence):
                if counter[word] < min_freq:
                    sentence[word_id] = unk_token


In [13]:
def write_doc(pos_docs, vocab, filename, seed=None):
    """Write documents to ``filename`` as vocabulary indices, shortest first.

    Each output line has the form ``1:i,j,k#l,m#...``: the constant label 1,
    a colon, then the sentences separated by '#', each sentence being its
    comma-separated word indices. Documents are ordered by number of
    sentences; documents with the same number of sentences are shuffled.

    Args:
        pos_docs: iterable of documents (lists of sentences, each a list of
            words).
        vocab: mapping of word -> integer index; every word must be present.
        filename: path of the file to (over)write.
        seed: optional seed for the shuffle. ``None`` (default) uses the
            global ``random`` state, as before.

    Raises:
        KeyError: if a word is missing from ``vocab``.
    """
    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle

    docs = [(1, doc) for doc in pos_docs]
    # Bucket documents by their number of sentences.
    len_to_data = {}
    for doc in docs:
        len_to_data.setdefault(len(doc[1]), []).append(doc)
    for bucket in len_to_data.values():
        shuffle(bucket)

    with open(filename, 'w') as f:
        for doc_len in sorted(len_to_data):
            for label, sentences in len_to_data[doc_len]:
                # Joining (rather than appending '#' and trimming the last
                # character) keeps the ':' intact for a document with no sentences.
                body = '#'.join(
                    ','.join(str(vocab[word]) for word in sentence)
                    for sentence in sentences
                )
                f.write('%d:%s\n' % (label, body))


def write_vocab(vocab, vocab_file):
    """Dump the vocabulary to ``vocab_file``, one ``word index`` pair per line.

    Entries are written in the mapping's iteration order.
    """
    with open(vocab_file, 'w') as out:
        out.writelines(
            word + ' ' + str(index) + '\n' for word, index in vocab.items()
        )


In [14]:
def pre_process(docs, data_file='data.dat', vocab_file='vocab.txt'):
    """Run the full text pipeline and write the encoded corpus to disk.

    Steps: sentence-split with ``read``; tokenize and filter with
    ``split_to_words`` (which also drops rejected rows from the global
    ``labels``); replace rare words with ``process_doc``; build a vocabulary
    whose indices start at 1 in first-seen order; print corpus statistics;
    write the documents and the vocabulary with ``write_doc`` / ``write_vocab``.

    Args:
        docs: iterable of raw document strings.
        data_file: output path for the encoded documents.
        vocab_file: output path for the vocabulary.

    Returns:
        ``(all_docs, vocab)``: the processed documents (lists of sentences of
        word strings) and the word -> index mapping.

    Raises:
        ValueError: if no document survives filtering (previously surfaced as
            an opaque ``max()`` error while computing the statistics).
    """
    pos = read(docs)

    pos_processed, counter = split_to_words(pos)
    if not pos_processed:
        raise ValueError('no documents left after filtering; nothing to process')
    process_doc(pos_processed, counter)

    # Index 0 is never assigned, which leaves it free to act as padding.
    vocab = {}
    for doc in pos_processed:
        for sen in doc:
            for word in sen:
                if word not in vocab:
                    vocab[word] = len(vocab) + 1

    all_docs = pos_processed
    doc_len = [len(doc) for doc in all_docs]
    sentence_len = [len(sen) for doc in all_docs for sen in doc]
    print('total number of documents: %s, pos: %s' %
          (len(all_docs), len(pos_processed)))
    print('max num of document sentences：%s' % max(doc_len))
    print('min num of document sentences：%s' % min(doc_len))
    print('avg num of document sentences：%s' % (float(sum(doc_len))/len(doc_len)))

    print('max num of sentence words：%s' % max(sentence_len))
    print('min num of sentence words：%s' % min(sentence_len))
    print('avg num of sentence words：%s' % (float(sum(sentence_len))/len(sentence_len)))

    write_doc(pos_processed, vocab, data_file)
    write_vocab(vocab, vocab_file)
    return all_docs, vocab

# Run the whole pipeline on the raw essays. Besides returning the processed
# documents and vocabulary, this writes 'data.dat' and 'vocab.txt' and, via
# split_to_words, drops the rejected essays' rows from the global `labels`.
all_data, vocab = pre_process(data_np)


total number of documents: 1056, pos: 1056
max num of document sentences：40
min num of document sentences：2
avg num of document sentences：24.107954545454547
max num of sentence words：40
min num of sentence words：1
avg num of sentence words：15.265260428941787


In [15]:
def encode_doc(lst, vocab):
    """Encode documents as a zero-padded 3-D array of vocabulary indices.

    Args:
        lst: list of documents, each a list of sentences, each a list of words.
        vocab: mapping of word -> integer index; every word must be present.

    Returns:
        An ``int32`` array of shape ``(len(lst), max sentences per document,
        max words per sentence)``. Positions beyond a document's sentences or
        a sentence's words stay 0. An empty ``lst`` yields shape ``(0, 0, 0)``.

    Raises:
        KeyError: if a word is missing from ``vocab``.
    """
    # default=0 keeps an empty corpus, or a document without sentences, from
    # raising "max() arg is an empty sequence".
    sentence_max_len = max((len(sen) for doc in lst for sen in doc), default=0)
    sentence_max_num = max(map(len, lst), default=0)
    result = np.zeros([len(lst), sentence_max_num, sentence_max_len], dtype=np.int32)
    for i, doc in enumerate(lst):
        for j, sen in enumerate(doc):
            if sen:
                result[i, j, :len(sen)] = [vocab[word] for word in sen]
    return result

In [16]:
# Turn the processed documents into a zero-padded int32 array of vocabulary indices
# (documents x sentences x words).
encoded=encode_doc(all_data,vocab)

In [17]:
# Fraction of the padded array that holds real tokens (non-zero indices).
# Uses encoded.size instead of a hard-coded 1056*40*40 so the ratio stays
# correct when the corpus or the length limits change.
np.count_nonzero(encoded) / encoded.size

0.2300088778409091

In [21]:
# Scale the scores by the maximum score. The array is an explicit float copy:
# with an integer score column the in-place true division would fail with a
# casting error, and `labels` itself is left untouched.
np_labels = np.array(labels, dtype=np.float64)
np_labels /= np.max(np_labels)

In [32]:
# Hold out 10% of the encoded essays for testing (fixed seed), then persist
# each split; np.save appends the '.npy' extension to these names.
X_train, X_test, y_train, y_test = train_test_split(
    encoded, np_labels, test_size=0.1, random_state=42
)
for stem, split in (
    ('x_train1', X_train),
    ('y_train1', y_train),
    ('x_test1', X_test),
    ('y_test1', y_test),
):
    np.save(stem, split)